From 43f2b07193cbcccd266734320ea9b948f5a01926 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 28 Apr 2025 11:57:19 +0300 Subject: [PATCH 01/13] common : fix noreturn compile warning (#13151) ggml-ci --- common/arg.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 27401492122..75e8e0bd51a 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -673,8 +673,12 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s return {}; } -std::pair> common_remote_get_content(const std::string &, const common_remote_params &) { - throw std::runtime_error("error: built without CURL, cannot download model from the internet"); +std::pair> common_remote_get_content(const std::string & url, const common_remote_params &) { + if (!url.empty()) { + throw std::runtime_error("error: built without CURL, cannot download model from the internet"); + } + + return {}; } #endif // LLAMA_USE_CURL From d0a417f3c7a5a22ef05b3b76d91dbe1d3362bf0c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 28 Apr 2025 12:10:18 +0300 Subject: [PATCH 02/13] readme : update hot topics (#13150) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a0e7bd2d213..1785493c3e2 100644 --- a/README.md +++ b/README.md @@ -16,9 +16,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Hot topics +- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9) - A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated -- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427 -- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode +- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim - Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123 From a4c340f974f9b7ac0c1aae897aabaa54549a97e5 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 28 Apr 2025 15:03:25 +0530 Subject: [PATCH 03/13] SYCL: Add all missing unary kernels (#13074) * SYCL: Add all missing unary kernels ggml-ci * decouple kernel launch range from data size using strided loop * use ciel_div helper for num_blocks ggml-ci * clean auto imported header files --- ggml/src/ggml-sycl/common.hpp | 4 + ggml/src/ggml-sycl/element_wise.cpp | 169 ++++++++++++++++++++++++++++ ggml/src/ggml-sycl/element_wise.hpp | 5 + ggml/src/ggml-sycl/ggml-sycl.cpp | 13 +++ 4 files changed, 191 insertions(+) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 0ab0fb0aa39..c3d9d186456 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -493,5 +493,9 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor acc) { int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size); +constexpr size_t ceil_div(const size_t m, const size_t n) { + return (m + n - 1) / n; +} + bool gpu_has_xmx(sycl::device &dev); #endif // GGML_SYCL_COMMON_HPP 
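Note (editor): the SYCL kernels added in this patch share one pattern worth calling out: the launch range is a fixed multiple of the work-group size, computed with the `ceil_div` helper above, and each work-item strides over the data instead of mapping one-to-one to elements, which decouples the launch range from the tensor size. The following is a minimal, hedged sketch of that grid-stride idea in plain C++ with no SYCL dependency; the block size of 256 mirrors the hard-coded value in the patch, while the element count, function name `sgn_strided`, and the serial loop standing in for parallel work-items are illustrative assumptions, not code from the repository.

```cpp
#include <cstdio>
#include <vector>

// Same ceil_div helper as the one added to ggml/src/ggml-sycl/common.hpp.
constexpr size_t ceil_div(const size_t m, const size_t n) {
    return (m + n - 1) / n;
}

// Grid-stride "kernel" sketch: work-item `item` handles elements
// item, item + range, item + 2*range, ... so any k is covered even
// when the launch range is smaller than k. (Serial stand-in for the
// parallel nd_range launch in the actual SYCL kernels.)
static void sgn_strided(const float * x, float * dst, size_t k, size_t global_range) {
    for (size_t item = 0; item < global_range; ++item) {
        for (size_t i = item; i < k; i += global_range) {
            dst[i] = x[i] > 0.0f ? 1.0f : (x[i] < 0.0f ? -1.0f : 0.0f);
        }
    }
}

int main() {
    const size_t k     = 1000;                       // illustrative element count
    const size_t block = 256;                        // hard-coded block size, as in the patch
    const size_t range = ceil_div(k, block) * block; // launch range rounded up to whole blocks

    std::vector<float> x(k, -2.0f), dst(k);
    sgn_strided(x.data(), dst.data(), k, range);
    printf("dst[0] = %.1f\n", dst[0]);               // prints -1.0
    return 0;
}
```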
diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index fc25d98ddff..dcc6ec809a7 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -21,6 +21,27 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne, } } +template +static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { + for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) { + dst[i] = x[i] > static_cast(0.f) ? static_cast(1.f) : ((x[i] < static_cast(0.f) ? static_cast(-1.f) : static_cast(0.f))); + } +} + +template +static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { + for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) { + dst[i] = sycl::fabs(x[i]); + } +} + +template +static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { + for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) { + dst[i] = (x[i] > static_cast(0.f)) ? x[i] : sycl::expm1(x[i]); + } +} + template static void gelu(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { @@ -335,6 +356,37 @@ static void silu_sycl(const T *x, T *dst, const int k, }); } +template +static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) { + // hard code for now + const int num_blocks = ceil_div(k, 256); + stream->parallel_for( + sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) { + sgn(x, dst, k, item_ct1); + }); +} + +template +static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) { + // hard code for now + const int num_blocks = ceil_div(k, 256); + stream->parallel_for( + sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) { + abs_op(x, dst, k, item_ct1); + }); +} + + +template +static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) { + // hard code for now + const int num_blocks = ceil_div(k, 256); + stream->parallel_for( + sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) { + elu_op(x, dst, k, item_ct1); + }); +} + template static void gelu_quick_sycl(const T *x, T *dst, const int k, queue_ptr stream) { @@ -574,6 +626,106 @@ static void clamp_sycl(const T *x, T *dst, const float min, }); } +inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +#if defined (GGML_SYCL_F16) + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + +#else + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); +#endif + GGML_ASSERT(dst->src[0]->type == dst->type); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + switch (dst->type) { +#if defined (GGML_SYCL_F16) + case GGML_TYPE_F16: + { + auto data_pts = cast_data(dst); + sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + break; + } +#endif + case GGML_TYPE_F32: + { + auto data_pts = cast_data(dst); + sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + break; + } + default: + GGML_ABORT("GGML tensor type not 
supported!\n"); + break; + } +} + +inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +#if defined (GGML_SYCL_F16) + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + +#else + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); +#endif + GGML_ASSERT(dst->src[0]->type == dst->type); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + switch (dst->type) { +#if defined (GGML_SYCL_F16) + case GGML_TYPE_F16: + { + auto data_pts = cast_data(dst); + abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + break; + } +#endif + case GGML_TYPE_F32: + { + auto data_pts = cast_data(dst); + abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + break; + } + default: + GGML_ABORT("GGML tensor type not supported!\n"); + break; + } +} + + +inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +#if defined (GGML_SYCL_F16) + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + +#else + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); +#endif + GGML_ASSERT(dst->src[0]->type == dst->type); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + switch (dst->type) { +#if defined (GGML_SYCL_F16) + case GGML_TYPE_F16: + { + auto data_pts = cast_data(dst); + elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + break; + } +#endif + case GGML_TYPE_F32: + { + auto data_pts = cast_data(dst); + elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + break; + } + default: + GGML_ABORT("GGML tensor type not supported!\n"); + break; + } +} + inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { #if defined (GGML_SYCL_F16) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); @@ -1388,3 +1540,20 @@ void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s done\n", __func__); } +void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + ggml_sycl_op_sgn(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + ggml_sycl_op_abs(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + ggml_sycl_op_elu(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index e623cb56f76..f4199d69da6 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -66,5 +66,10 @@ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void 
ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_ELEMENTWISE_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 548f2d0a06b..66b6f2cca4d 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -38,6 +38,7 @@ #include "ggml-sycl/backend.hpp" #include "ggml-sycl/common.hpp" +#include "ggml-sycl/element_wise.hpp" #include "ggml-sycl/presets.hpp" #include "ggml-sycl/gemm.hpp" #include "ggml-sycl/sycl_hw.hpp" @@ -3355,6 +3356,15 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_UNARY_OP_EXP: ggml_sycl_exp(ctx, dst); break; + case GGML_UNARY_OP_SGN: + ggml_sycl_sgn(ctx, dst); + break; + case GGML_UNARY_OP_ABS: + ggml_sycl_abs(ctx, dst); + break; + case GGML_UNARY_OP_ELU: + ggml_sycl_elu(ctx, dst); + break; default: return false; } @@ -3837,6 +3847,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_SGN: + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_ELU: #if defined (GGML_SYCL_F16) return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type); #else From 5fa9e63be82225fb3249c76f39ddda3e5bdec0a3 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 28 Apr 2025 12:18:59 +0200 Subject: [PATCH 04/13] clip : refactor set input for cgraph + fix qwen2.5vl input (#13136) * clip : refactor set input for cgraph * more strict assert * minicpmv : use clip_n_mmproj_embd instead of copying the same code everywhere * split qwen2 and qwen2.5 code blocks * minor style fix --- examples/llava/clip.cpp | 473 ++++++++++++++++++---------------------- 1 file changed, 215 insertions(+), 258 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 3cd27d5b17a..8c5d56cc17a 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -170,8 +170,8 @@ struct clip_hparams { std::vector image_grid_pinpoints; int32_t image_crop_resolution; std::unordered_set vision_feature_layer; - int32_t attn_window_size; - int32_t n_wa_pattern; + int32_t attn_window_size = 0; + int32_t n_wa_pattern = 0; }; struct clip_layer { @@ -325,7 +325,6 @@ struct clip_ctx { float image_std[3]; bool use_gelu = false; bool use_silu = false; - int32_t ftype = 1; gguf_context_ptr ctx_gguf; ggml_context_ptr ctx_data; @@ -776,7 +775,6 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ const int image_size_width = imgs.entries[0]->nx; const int image_size_height = imgs.entries[0]->ny; - const bool use_mrope = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL; const bool use_window_attn = hparams.n_wa_pattern > 0; const int n_wa_pattern = hparams.n_wa_pattern; @@ -785,10 +783,11 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ const int patches_w = image_size_width / patch_size; const int patches_h = image_size_height / patch_size; const int num_positions = num_patches + (model.class_embedding ? 1 : 0); - const int num_position_ids = use_mrope ? 
num_positions * 4 : num_positions; + const int num_position_ids = num_positions * 4; // m-rope requires 4 dim per position const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; const float eps = hparams.eps; int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; @@ -870,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ } // loop over layers - for (int il = 0; il < ctx->max_feature_layer; il++) { + for (int il = 0; il < n_layer; il++) { struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states // rmsnorm1 @@ -1115,15 +1114,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { int pos_w = image_size_width/patch_size; int pos_h = image_size_height/patch_size; - if (ctx->minicpmv_version == 2) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); - } - else if (ctx->minicpmv_version == 3) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1); - } - else if (ctx->minicpmv_version == 4) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1); - } + int n_output_dim = clip_n_mmproj_embd(ctx); + pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1); ggml_set_name(pos_embed, "pos_embed"); ggml_set_input(pos_embed); } @@ -1461,23 +1453,17 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im } { // attention - int hidden_size = 4096; + int hidden_size = clip_n_mmproj_embd(ctx); const int d_head = 128; int n_head = hidden_size/d_head; int num_query = 96; if (ctx->minicpmv_version == 2) { - hidden_size = 4096; - n_head = hidden_size/d_head; num_query = 96; } else if (ctx->minicpmv_version == 3) { - hidden_size = 3584; - n_head = hidden_size/d_head; num_query = 64; } else if (ctx->minicpmv_version == 4) { - hidden_size = 3584; - n_head = hidden_size/d_head; num_query = 64; } @@ -1760,6 +1746,8 @@ struct clip_model_loader { LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str()); LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector); LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version); + LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); + LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); } @@ -3038,15 +3026,43 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_positions = num_patches + (model.class_embedding ? 
1 : 0); - const int pos_w = ctx->load_image_size.width / patch_size; + const int pos_w = ctx->load_image_size.width / patch_size; const int pos_h = ctx->load_image_size.height / patch_size; const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl + auto get_inp_tensor = [&gf](const char * name) { + struct ggml_tensor * inp = ggml_graph_get_tensor(gf, name); + if (inp == nullptr) { + GGML_ABORT("Failed to get tensor %s", name); + } + if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) { + GGML_ABORT("Tensor %s is not an input tensor", name); + } + return inp; + }; + + auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + // set input pixel values { - struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); - std::vector inp_data(ggml_nelements(inp_raw)); - float * data = inp_data.data(); + size_t nelem = 0; + for (const auto & img : imgs.entries) { + nelem += img->nx * img->ny * 3; + } + std::vector inp_raw(nelem); // layout of data (note: the channel dim is unrolled to better visualize the layout): // @@ -3065,7 +3081,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int n = nx * ny; for (int b = 0; b < batch_size; b++) { - float * batch_entry = data + b * (3*n); + float * batch_entry = inp_raw.data() + b * (3*n); for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { size_t base_src = 3*(y * nx + x); // idx of the first channel @@ -3077,266 +3093,207 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } } } - ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); + set_input_f32("inp_raw", inp_raw); } - if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { - { - // inspired from siglip: - // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit - // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - std::vector pos_data(ggml_nelements(positions)); - int * data = pos_data.data(); - int bucket_coords_h[1024]; - int bucket_coords_w[1024]; - for (int i = 0; i < pos_h; i++){ - bucket_coords_h[i] = std::floor(70.0*i/pos_h); - } - for (int i = 0; i < pos_w; i++){ - bucket_coords_w[i] = std::floor(70.0*i/pos_w); - } - for (int i = 0, id = 0; i < pos_h; i++){ - for (int j = 0; j < pos_w; j++){ - data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + // set input per projector + switch (ctx->proj_type) { + case PROJECTOR_TYPE_MINICPMV: + { + // inspired from siglip: + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 + std::vector positions(pos_h * pos_w); + int bucket_coords_h[1024]; + int bucket_coords_w[1024]; + for (int i = 0; i < pos_h; i++){ + 
bucket_coords_h[i] = std::floor(70.0*i/pos_h); } - } - ggml_backend_tensor_set(positions, data, 0, ggml_nbytes(positions)); - } + for (int i = 0; i < pos_w; i++){ + bucket_coords_w[i] = std::floor(70.0*i/pos_w); + } + for (int i = 0, id = 0; i < pos_h; i++){ + for (int j = 0; j < pos_w; j++){ + positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + } + } + set_input_i32("positions", positions); - { - // inspired from resampler of Qwen-VL: - // -> https://huggingface.co/Qwen/Qwen-VL/tree/main - // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 - struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed"); - int embed_dim = 4096; - if (ctx->minicpmv_version == 2) { - embed_dim = 4096; - } - else if (ctx->minicpmv_version == 3) { - embed_dim = 3584; - } - else if (ctx->minicpmv_version == 4) { - embed_dim = 3584; - } - else { - GGML_ABORT("Unknown minicpmv version"); - } + // inspired from resampler of Qwen-VL: + // -> https://huggingface.co/Qwen/Qwen-VL/tree/main + // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 + int embed_dim = clip_n_mmproj_embd(ctx); - // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos? - auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); + // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos? + auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); - std::vector pos_data(ggml_nelements(pos_embed)); - float * data = pos_data.data(); - for(int i = 0; i < pos_w * pos_h; ++i){ - for(int j = 0; j < embed_dim; ++j){ - data[i * embed_dim + j] = pos_embed_t[i][j]; + std::vector pos_embed(embed_dim * pos_w * pos_h); + for(int i = 0; i < pos_w * pos_h; ++i){ + for(int j = 0; j < embed_dim; ++j){ + pos_embed[i * embed_dim + j] = pos_embed_t[i][j]; + } } - } - ggml_backend_tensor_set(pos_embed, data, 0, ggml_nbytes(pos_embed)); - } - } - else { - // non-minicpmv models - - if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { - // pw * ph = number of tokens output by ViT after apply patch merger - // ipw * ipw = number of vision token been processed inside ViT - const int merge_ratio = 2; - const int pw = image_size_width / patch_size / merge_ratio; - const int ph = image_size_height / patch_size / merge_ratio; - const int ipw = image_size_width / patch_size; - const int iph = image_size_height / patch_size; - - std::vector idx (ph * pw); - std::vector inv_idx(ph * pw); - - if (use_window_attn) { - const int attn_window_size = 112; - struct ggml_tensor * window_idx = ggml_graph_get_tensor(gf, "window_idx"); - struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx"); - struct ggml_tensor * window_mask = ggml_graph_get_tensor(gf, "window_mask"); - - const int grid_window = attn_window_size / patch_size / merge_ratio; - int dst = 0; - // [num_vision_tokens, num_vision_tokens] attention mask tensor - std::vector mask(pow(ipw * iph, 2), std::numeric_limits::lowest()); - int mask_row = 0; - - for (int y = 0; y < ph; y += grid_window) - { - for (int x = 0; x < pw; x += grid_window) - { - const int win_h = std::min(grid_window, ph - y); - const int win_w = std::min(grid_window, pw - x); - const int dst_0 = dst; - // group all tokens belong to the same window togather (to a continue range) - for (int dy = 0; dy < win_h; dy++) { - for (int dx = 0; dx < win_w; dx++) { - 
const int src = (y + dy) * pw + (x + dx); - assert(src < (int)idx.size()); - assert(dst < (int)inv_idx.size()); - idx [src] = dst; - inv_idx[dst] = src; - dst++; + set_input_f32("pos_embed", pos_embed); + } break; + case PROJECTOR_TYPE_QWEN2VL: + { + const int merge_ratio = 2; + const int pw = image_size_width / patch_size; + const int ph = image_size_height / patch_size; + std::vector positions(num_positions * 4); + int ptr = 0; + for (int y = 0; y < ph; y += merge_ratio) { + for (int x = 0; x < pw; x += merge_ratio) { + for (int dy = 0; dy < 2; dy++) { + for (int dx = 0; dx < 2; dx++) { + positions[ ptr] = y + dy; + positions[ num_patches + ptr] = x + dx; + positions[2 * num_patches + ptr] = y + dy; + positions[3 * num_patches + ptr] = x + dx; + ptr++; } } - - for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) { - int row_offset = mask_row * (ipw * iph); - std::fill( - mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio), - mask.begin() + row_offset + (dst * merge_ratio * merge_ratio), - 0.0); - mask_row++; - } } } - ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx)); - ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx)); - ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask)); - } else { - std::iota(idx.begin(), idx.end(), 0); - std::iota(inv_idx.begin(), inv_idx.end(), 0); - } + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_QWEN25VL: + { + // pw * ph = number of tokens output by ViT after apply patch merger + // ipw * ipw = number of vision token been processed inside ViT + const int merge_ratio = 2; + const int pw = image_size_width / patch_size / merge_ratio; + const int ph = image_size_height / patch_size / merge_ratio; + const int ipw = image_size_width / patch_size; + const int iph = image_size_height / patch_size; + + std::vector idx (ph * pw); + std::vector inv_idx(ph * pw); + + if (use_window_attn) { + const int attn_window_size = 112; + const int grid_window = attn_window_size / patch_size / merge_ratio; + int dst = 0; + // [num_vision_tokens, num_vision_tokens] attention mask tensor + std::vector mask(pow(ipw * iph, 2), std::numeric_limits::lowest()); + int mask_row = 0; + + for (int y = 0; y < ph; y += grid_window) { + for (int x = 0; x < pw; x += grid_window) { + const int win_h = std::min(grid_window, ph - y); + const int win_w = std::min(grid_window, pw - x); + const int dst_0 = dst; + // group all tokens belong to the same window togather (to a continue range) + for (int dy = 0; dy < win_h; dy++) { + for (int dx = 0; dx < win_w; dx++) { + const int src = (y + dy) * pw + (x + dx); + GGML_ASSERT(src < (int)idx.size()); + GGML_ASSERT(dst < (int)inv_idx.size()); + idx [src] = dst; + inv_idx[dst] = src; + dst++; + } + } - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - const int mpow = merge_ratio * merge_ratio; - std::vector positions_data(ggml_nelements(positions)); - int * data = positions_data.data(); + for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) { + int row_offset = mask_row * (ipw * iph); + std::fill( + mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio), + mask.begin() + row_offset + (dst * merge_ratio * merge_ratio), + 0.0); + mask_row++; + } + } + } - int ptr = 0; - for (int y = 0; y < iph; y += merge_ratio) - { - for (int x = 0; x < ipw; x += merge_ratio) - { - for (int dy = 0; dy < 2; dy++) { - for (int dx = 0; dx < 2; dx++) { - auto remap = idx[ptr / mpow]; - remap = 
remap * mpow + (ptr % mpow); - - data[ remap] = y + dy; - data[ num_patches + remap] = x + dx; - data[2 * num_patches + remap] = y + dy; - data[3 * num_patches + remap] = x + dx; - ptr++; + set_input_i32("window_idx", idx); + set_input_i32("inv_window_idx", inv_idx); + set_input_f32("window_mask", mask); + } else { + for (int i = 0; i < ph * pw; i++) { + idx[i] = i; + } + } + + const int mpow = merge_ratio * merge_ratio; + std::vector positions(num_positions * 4); + + int ptr = 0; + for (int y = 0; y < iph; y += merge_ratio) { + for (int x = 0; x < ipw; x += merge_ratio) { + for (int dy = 0; dy < 2; dy++) { + for (int dx = 0; dx < 2; dx++) { + auto remap = idx[ptr / mpow]; + remap = (remap * mpow) + (ptr % mpow); + + positions[ remap] = y + dy; + positions[ num_patches + remap] = x + dx; + positions[2 * num_patches + remap] = y + dy; + positions[3 * num_patches + remap] = x + dx; + ptr++; + } } } } - } - ggml_backend_tensor_set(positions, data, 0, ggml_nbytes(positions)); - } - else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - // do nothing - } - else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { - // do nothing - } - else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { - // set the 2D positions - int n_patches_per_col = image_size_width / patch_size; - std::vector pos_data(num_positions); - struct ggml_tensor * pos; - // dimension H - pos = ggml_graph_get_tensor(gf, "pos_h"); - for (int i = 0; i < num_positions; i++) { - pos_data[i] = i / n_patches_per_col; - } - ggml_backend_tensor_set(pos, pos_data.data(), 0, ggml_nbytes(pos)); - // dimension W - pos = ggml_graph_get_tensor(gf, "pos_w"); - for (int i = 0; i < num_positions; i++) { - pos_data[i] = i % n_patches_per_col; - } - ggml_backend_tensor_set(pos, pos_data.data(), 0, ggml_nbytes(pos)); - } - else { + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + // set the 2D positions + int n_patches_per_col = image_size_width / patch_size; + std::vector pos_data(num_positions); + // dimension H + for (int i = 0; i < num_positions; i++) { + pos_data[i] = i / n_patches_per_col; + } + set_input_i32("pos_h", pos_data); + // dimension W + for (int i = 0; i < num_positions; i++) { + pos_data[i] = i % n_patches_per_col; + } + set_input_i32("pos_w", pos_data); + } break; + case PROJECTOR_TYPE_GLM_EDGE: + { // llava and other models - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - - int* positions_data = (int*)malloc(ggml_nbytes(positions)); + std::vector positions(num_positions); for (int i = 0; i < num_positions; i++) { - positions_data[i] = i; + positions[i] = i; } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + { + // llava and other models + std::vector positions(num_positions); + for (int i = 0; i < num_positions; i++) { + positions[i] = i; + } + set_input_i32("positions", positions); - if (ctx->proj_type != PROJECTOR_TYPE_GLM_EDGE) { - struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); // The patches vector is used to get rows to index into the embeds with; // we should skip dim 0 only if we have CLS to avoid going out of bounds // when retrieving the rows. int patch_offset = model.class_embedding ? 
1 : 0; - int* patches_data = (int*)malloc(ggml_nbytes(patches)); + std::vector patches(num_patches); for (int i = 0; i < num_patches; i++) { - patches_data[i] = i + patch_offset; + patches[i] = i + patch_offset; } - ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); - free(patches_data); - } - } - } - - if (use_window_attn && (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL)) { - struct ggml_tensor * window_idx = ggml_graph_get_tensor(gf, "window_idx"); - struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx"); - struct ggml_tensor * window_mask = ggml_graph_get_tensor(gf, "window_mask"); - - const int merge_ratio = 2; - const int attn_window_size = 112; - const int pw = image_size_width / patch_size / merge_ratio; - const int ph = image_size_height / patch_size / merge_ratio; - const int grid_window = attn_window_size / patch_size / merge_ratio; - const int ipw = image_size_width / patch_size; - const int iph = image_size_height / patch_size; - /* - pw * ph = number of tokens output by ViT after apply patch merger - ipw * ipw = number of vision token been processed inside ViT - */ - - std::vector idx(ph * pw); - std::vector inv_idx(ph * pw); - int dst = 0; - // [num_vision_tokens, num_vision_tokens] attention mask tensor - std::vector mask(pow(ipw * iph, 2), std::numeric_limits::lowest()); - int mask_row = 0; - - for (int y = 0; y < ph; y+=grid_window) - { - for (int x = 0; x < pw; x+=grid_window) + set_input_i32("patches", patches); + } break; + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: { - const int win_h = std::min(grid_window, ph - y); - const int win_w = std::min(grid_window, pw - x); - const int dst_0 = dst; - // group all tokens belong to the same window togather (to a continue range) - for (int dy = 0; dy < win_h; dy++) { - for (int dx = 0; dx < win_w; dx++) { - const int src = (y + dy) * pw + (x + dx); - assert(src < (int)idx.size()); - assert(dst < (int)inv_idx.size()); - idx[src] = dst; - inv_idx[dst] = src; - dst++; - } - } - - for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) { - int row_offset = mask_row * (ipw * iph); - std::fill( - mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio), - mask.begin() + row_offset + (dst * merge_ratio * merge_ratio), - 0.0); - mask_row++; - } - } - } - - ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx)); - ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx)); - ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask)); + // do nothing + } break; + default: + GGML_ABORT("Unknown projector type"); } ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); @@ -3537,7 +3494,7 @@ bool clip_is_glm(const struct clip_ctx * ctx) { } bool clip_is_qwen2vl(const struct clip_ctx * ctx) { - return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL; + return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL; } bool clip_is_llava(const struct clip_ctx * ctx) { From d2b2031e5f11b826dcc718138642f147a2009665 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 28 Apr 2025 14:20:56 +0200 Subject: [PATCH 05/13] llama : (mrope) allow using normal 1D position for text token (#13138) * llama : (mrope) use normal position for text token * rm n_pos_per_embd from llm_graph_input_attn_temp --- examples/llava/qwen2vl-cli.cpp | 8 -------- src/llama-graph.cpp | 26 +++++++++++++++++++------- src/llama-graph.h | 12 +++++------- 3 
files changed, 24 insertions(+), 22 deletions(-) diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp index cf427108691..1e54851ea07 100644 --- a/examples/llava/qwen2vl-cli.cpp +++ b/examples/llava/qwen2vl-cli.cpp @@ -92,20 +92,12 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past, int * st_pos_id) { int N = (int) tokens.size(); - std::vector pos; for (int i = 0; i < N; i += n_batch) { int n_eval = (int) tokens.size() - i; if (n_eval > n_batch) { n_eval = n_batch; } auto batch = llama_batch_get_one(&tokens[i], n_eval); - // TODO: add mrope pos ids somewhere else - pos.resize(batch.n_tokens * 4); - std::fill(pos.begin(), pos.end(), 0); - for (int j = 0; j < batch.n_tokens * 3; j ++) { - pos[j] = *st_pos_id + (j % batch.n_tokens); - } - batch.pos = pos.data(); if (llama_decode(ctx_llama, batch)) { LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index b52e3f6203a..e6595fb18bc 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -55,7 +55,18 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; - ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); + if (ubatch->token && n_pos_per_embd > 1) { + // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D + // the other dimensions are all 0, they are unused for text tokens + std::vector pos_data(n_tokens*n_pos_per_embd, 0); + // copy the first dimension + for (int i = 0; i < n_tokens; ++i) { + pos_data[i] = ubatch->pos[i]; + } + ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos)); + } else { + ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos)); + } } } @@ -71,7 +82,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { ) * f_attn_temp_scale + 1.0; } - ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale)); + ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale)); } } @@ -592,7 +603,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : res (std::make_unique()) { } -int64_t llm_graph_context::n_pos_per_token() const { +int64_t llm_graph_context::n_pos_per_embd() const { return arch == LLM_ARCH_QWEN2VL ? 
4 : 1; } @@ -1018,11 +1029,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { } ggml_tensor * llm_graph_context::build_inp_pos() const { - auto inp = std::make_unique(n_pos_per_token()); + auto inp = std::make_unique(n_pos_per_embd()); auto & cur = inp->pos; - cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd()); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1031,11 +1042,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const { } ggml_tensor * llm_graph_context::build_inp_attn_scale() const { - auto inp = std::make_unique(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); + auto inp = std::make_unique(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); auto & cur = inp->attn_scale; - cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token()); + // this need to be 1x1xN for broadcasting + cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens); ggml_set_input(cur); res->add_input(std::move(inp)); diff --git a/src/llama-graph.h b/src/llama-graph.h index d192dc14957..d0c8d321927 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -90,29 +90,27 @@ class llm_graph_input_embd : public llm_graph_input_i { class llm_graph_input_pos : public llm_graph_input_i { public: - llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} + llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {} virtual ~llm_graph_input_pos() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * pos = nullptr; // I32 [n_batch] - const int64_t n_pos_per_token = 1; + const int64_t n_pos_per_embd = 1; }; // temperature tuning, used by llama4 class llm_graph_input_attn_temp : public llm_graph_input_i { public: - llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) - : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} + llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) + : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} virtual ~llm_graph_input_attn_temp() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * attn_scale = nullptr; // F32 [n_batch] - const int64_t n_pos_per_token = 1; - const uint32_t n_attn_temp_floor_scale; const float f_attn_temp_scale; }; @@ -419,7 +417,7 @@ struct llm_graph_context { llm_graph_context(const llm_graph_params & params); - int64_t n_pos_per_token() const; + int64_t n_pos_per_embd() const; void cb(ggml_tensor * cur, const char * name, int il) const; From fb0471d1753824e75474c24f82fbdd54c94dceda Mon Sep 17 00:00:00 2001 From: pockers21 <134406831+pockers21@users.noreply.github.com> Date: Mon, 28 Apr 2025 06:45:40 -0700 Subject: [PATCH 06/13] context : do not clear output buffer on reserve (#13152) Co-authored-by: pockers21 --- src/llama-context.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a52b6850b46..e49225aa224 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1536,8 +1536,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { // set all ids as invalid (negative) std::fill(output_ids.begin(), output_ids.end(), -1); - ggml_backend_buffer_clear(buf_output.get(), 0); - this->n_outputs = 0; 
this->n_outputs_max = n_outputs_max; From 4e87962e34a4b257ec374c4baf6b1568554b81a9 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 28 Apr 2025 16:12:56 +0200 Subject: [PATCH 07/13] mtmd : fix glm-edge redundant token count (#13139) * mtmd : fix glm-edge redundant token count * fix chat template * temporary disable GLMEdge test chat tmpl --- examples/llava/mtmd.cpp | 10 +--------- src/llama-chat.cpp | 10 +--------- tests/test-chat-template.cpp | 17 +++++++++-------- 3 files changed, 11 insertions(+), 26 deletions(-) diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index a994ef0166e..f95f0503569 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -203,9 +203,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx, } // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix - // for glm-edge, we don't need to add because the tokens are already in the returned embeddings - - // TODO @ngxson : glm-edge : remove BOI / EOI tokens embeddings, decode them as normal tokens std::vector parts = string_split_str(prompt_modified, ctx->image_marker); output.clear(); @@ -246,7 +243,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, }; for (const auto & part : parts) { - //printf("tokenizing part: %s\n", part.c_str()); + // printf("tokenizing part: %s\n", part.c_str()); bool add_bos = &parts.front() == ∂ auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special); if (tokens.empty()) { @@ -338,11 +335,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx, LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny); LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size()); - if (clip_is_glm(ctx->ctx_clip)) { - // glm-edge - image_tokens->nx += 2; // add 2 for the begin_of_image and end_of_image token embeddings - } - mtmd_input_chunk chunk{ MTMD_INPUT_CHUNK_TYPE_IMAGE, {}, diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index af5e2003198..735d2619c92 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -447,7 +447,7 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) { + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) { ss << "[gMASK]" << ""; for (auto message : chat) { std::string role(message->role); @@ -456,14 +456,6 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) { - for (auto message : chat) { - std::string role(message->role); - ss << "<|" << role << "|>" << "\n" << message->content; - } - if (add_ass) { - ss << "<|assistant|>"; - } } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) { // MiniCPM-3B-OpenHermes-2.5-v2-GGUF for (auto message : chat) { diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index be1a640068d..85d89843d6d 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -187,14 +187,15 @@ int main(void) { /* .bos_token= */ "", /* .eos_token= */ "", }, - { - /* .name= */ "GLMEdge", - /* .template_str= */ "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}<|assistant|>", - /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother 
question<|assistant|>", - /* .expected_output_jinja= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>", - /* .bos_token= */ "", - /* .eos_token= */ "", - }, + // TODO @ngxson : GLMEdge produces poor result without `[gMASK]`, so we're temporarily using GLM4 template for it. We should fix this in the future. + // { + // /* .name= */ "GLMEdge", + // /* .template_str= */ "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}<|assistant|>", + // /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>", + // /* .expected_output_jinja= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>", + // /* .bos_token= */ "", + // /* .eos_token= */ "", + // }, { /* .name= */ "MiniCPM-3B-OpenHermes-2.5-v2-GGUF", /* .template_str= */ U8C("{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}"), From 1831f538f720d1d99fba146f24f0a8e970838cc4 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Mon, 28 Apr 2025 20:20:39 +0530 Subject: [PATCH 08/13] llama-bench: add `-d` depth arg (#13096) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add depth param * update llama-bench README and add depth param * llama-bench: default params for depth arg for faster execution * Update examples/llama-bench/README.md Co-authored-by: Johannes Gäßler * fix buffer print ub * use user provided args * remove extra whitespaces --------- Co-authored-by: Johannes Gäßler --- examples/llama-bench/README.md | 155 +++++++++++++++++---------- examples/llama-bench/llama-bench.cpp | 47 ++++++-- 2 files changed, 137 insertions(+), 65 deletions(-) diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index 6bbe4bb75fb..1f5e2f66200 100644 --- a/examples/llama-bench/README.md +++ b/examples/llama-bench/README.md @@ -28,6 +28,7 @@ options: -p, --n-prompt (default: 512) -n, --n-gen (default: 128) -pg (default: ) + -d, --n-depth (default: 0) -b, --batch-size (default: 2048) -ub, --ubatch-size (default: 512) -ctk, --cache-type-k (default: f16) @@ -66,6 +67,8 @@ With the exception of `-r`, `-o` and `-v`, all options can be specified multiple Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition. +Using the `-d ` option, each test can be run at a specified context depth, prefilling the KV cache with `` tokens. + For a description of the other options, see the [main example](../main/README.md). 
Note: @@ -148,6 +151,19 @@ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35 | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 | | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 | +### Different prefilled context + +``` +$ ./llama-bench -d 0,512 +``` + +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 | + ## Output formats By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option. @@ -170,9 +186,9 @@ $ ./llama-bench -o csv ``` ```csv -build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961" -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342" +build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts +"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434" +"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617" ``` ### JSON @@ -184,64 +200,78 @@ $ ./llama-bench -o json ```json [ { - "build_commit": "3469684", - "build_number": 1275, - "cuda": true, - "metal": false, - "gpu_blas": true, - "blas": true, - "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", - "gpu_info": "NVIDIA GeForce RTX 3090 Ti", - "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", - "model_size": 3825065984, - "model_n_params": 6738415616, - "n_batch": 512, - "n_threads": 16, - "f16_kv": true, + "build_commit": "8cf427ff", + "build_number": 5163, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", + 
"gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", + "model_type": "qwen2 7B Q4_K - Medium", + "model_size": 4677120000, + "model_n_params": 7615616512, + "n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", "n_gpu_layers": 99, + "split_mode": "layer", "main_gpu": 0, - "mul_mat_q": true, + "no_kv_offload": false, + "flash_attn": false, "tensor_split": "0.00", + "use_mmap": true, + "embeddings": false, "n_prompt": 512, "n_gen": 0, - "test_time": "2023-09-23T12:09:57Z", - "avg_ns": 212365953, - "stddev_ns": 985423, - "avg_ts": 2410.974041, - "stddev_ts": 11.163766, - "samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ], - "samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ] + "n_depth": 0, + "test_time": "2025-04-24T11:58:50Z", + "avg_ns": 72135640, + "stddev_ns": 1453752, + "avg_ts": 7100.002165, + "stddev_ts": 140.341520, + "samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ], + "samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ] }, { - "build_commit": "3469684", - "build_number": 1275, - "cuda": true, - "metal": false, - "gpu_blas": true, - "blas": true, - "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", - "gpu_info": "NVIDIA GeForce RTX 3090 Ti", - "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", - "model_size": 3825065984, - "model_n_params": 6738415616, - "n_batch": 512, - "n_threads": 16, - "f16_kv": true, + "build_commit": "8cf427ff", + "build_number": 5163, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", + "gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", + "model_type": "qwen2 7B Q4_K - Medium", + "model_size": 4677120000, + "model_n_params": 7615616512, + "n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", "n_gpu_layers": 99, + "split_mode": "layer", "main_gpu": 0, - "mul_mat_q": true, + "no_kv_offload": false, + "flash_attn": false, "tensor_split": "0.00", + "use_mmap": true, + "embeddings": false, "n_prompt": 0, "n_gen": 128, - "test_time": "2023-09-23T12:09:59Z", - "avg_ns": 977425219, - "stddev_ns": 9268593, - "avg_ts": 130.965708, - "stddev_ts": 1.238924, - "samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ], - "samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ] + "n_depth": 0, + "test_time": "2025-04-24T11:58:51Z", + "avg_ns": 1076767880, + "stddev_ns": 9449585, + "avg_ts": 118.881588, + "stddev_ts": 1.041811, + "samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ], + "samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ] } ] ``` @@ -254,8 +284,8 @@ $ ./llama-bench -o jsonl ``` ```json lines -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly 
Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]} -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]} +{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]} +{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]} ``` @@ -271,25 +301,32 @@ $ ./llama-bench -o sql CREATE TABLE IF NOT EXISTS test ( build_commit TEXT, build_number INTEGER, - cuda INTEGER, - metal INTEGER, - gpu_blas INTEGER, - blas INTEGER, cpu_info TEXT, gpu_info TEXT, + backends TEXT, model_filename TEXT, model_type TEXT, model_size INTEGER, model_n_params INTEGER, n_batch INTEGER, + n_ubatch INTEGER, n_threads INTEGER, - f16_kv INTEGER, + cpu_mask TEXT, + cpu_strict INTEGER, + poll INTEGER, + type_k TEXT, + type_v TEXT, n_gpu_layers INTEGER, + split_mode TEXT, main_gpu INTEGER, - mul_mat_q INTEGER, + 
no_kv_offload INTEGER, + flash_attn INTEGER, tensor_split TEXT, + use_mmap INTEGER, + embeddings INTEGER, n_prompt INTEGER, n_gen INTEGER, + n_depth INTEGER, test_time TEXT, avg_ns INTEGER, stddev_ns INTEGER, @@ -297,6 +334,6 @@ CREATE TABLE IF NOT EXISTS test ( stddev_ts REAL ); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634'); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647'); ``` diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 564a51bfd7b..5a78216e44f 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -200,6 +200,7 @@ struct cmd_params { std::vector n_prompt; std::vector n_gen; std::vector> n_pg; + std::vector n_depth; std::vector n_batch; std::vector n_ubatch; std::vector type_k; @@ -233,6 +234,7 @@ static const cmd_params 
cmd_params_defaults = { /* n_prompt */ { 512 }, /* n_gen */ { 128 }, /* n_pg */ {}, + /* n_depth */ { 0 }, /* n_batch */ { 2048 }, /* n_ubatch */ { 512 }, /* type_k */ { GGML_TYPE_F16 }, @@ -272,6 +274,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -pg (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); + printf(" -d, --n-depth (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str()); printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); printf(" -ub, --ubatch-size (default: %s)\n", @@ -409,6 +412,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) }); + } else if (arg == "-d" || arg == "--n-depth") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.n_depth.insert(params.n_depth.end(), p.begin(), p.end()); } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) { invalid_param = true; @@ -739,6 +749,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; } + if (params.n_depth.empty()) { + params.n_depth = cmd_params_defaults.n_depth; + } if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } @@ -801,6 +814,7 @@ struct cmd_params_instance { std::string model; int n_prompt; int n_gen; + int n_depth; int n_batch; int n_ubatch; ggml_type type_k; @@ -880,7 +894,7 @@ struct cmd_params_instance { llama_context_params to_llama_cparams() const { llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = n_prompt + n_gen; + cparams.n_ctx = n_prompt + n_gen + n_depth; cparams.n_batch = n_batch; cparams.n_ubatch = n_ubatch; cparams.type_k = type_k; @@ -916,6 +930,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & nt : params.n_threads) for (const auto & cm : params.cpu_mask) for (const auto & cs : params.cpu_strict) + for (const auto & nd : params.n_depth) for (const auto & pl : params.poll) { for (const auto & n_prompt : params.n_prompt) { if (n_prompt == 0) { @@ -925,6 +940,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .model = */ m, /* .n_prompt = */ n_prompt, /* .n_gen = */ 0, + /* .n_depth = */ nd, /* .n_batch = */ nb, /* .n_ubatch = */ nub, /* .type_k = */ tk, @@ -955,6 +971,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .model = */ m, /* .n_prompt = */ 0, /* .n_gen = */ n_gen, + /* .n_depth = */ nd, /* .n_batch = */ nb, /* .n_ubatch = */ nub, /* .type_k = */ tk, @@ -985,6 +1002,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .model = */ m, /* .n_prompt = */ n_pg.first, /* .n_gen = */ n_pg.second, + /* .n_depth = */ nd, /* .n_batch = */ nb, /* .n_ubatch = */ nub, /* .type_k = */ tk, @@ -1040,6 +1058,7 @@ struct test { bool embeddings; int n_prompt; int n_gen; + int n_depth; std::string test_time; std::vector samples_ns; @@ -1072,6 +1091,7 @@ struct test { embeddings = inst.embeddings; n_prompt = inst.n_prompt; n_gen = inst.n_gen; + n_depth = inst.n_depth; // RFC 3339 date-time format time_t t = time(NULL); std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); @@ -1113,9 +1133,11 @@ struct test { "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", "model_type", "model_size", "model_n_params", 
"n_batch", "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", + "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", + "embeddings", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", - "use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", - "stddev_ns", "avg_ts", "stddev_ts", + "use_mmap", "embeddings", "n_prompt", "n_gen", "n_depth", "test_time", + "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", }; return fields; } @@ -1125,8 +1147,8 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" || - field == "stddev_ns") { + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || + field == "avg_ns" || field == "stddev_ns") { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || @@ -1204,6 +1226,7 @@ struct test { std::to_string(embeddings), std::to_string(n_prompt), std::to_string(n_gen), + std::to_string(n_depth), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), @@ -1381,7 +1404,7 @@ struct markdown_printer : public printer { return 4; } if (field == "test") { - return 13; + return 15; } int width = std::max((int) field.length(), 10); @@ -1531,6 +1554,10 @@ struct markdown_printer : public printer { } else { snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen); } + if (t.n_depth > 0) { + int len = strlen(buf); + snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth); + } value = buf; } else if (field == "t/s") { snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts()); @@ -1789,6 +1816,14 @@ int main(int argc, char ** argv) { for (int i = 0; i < params.reps; i++) { llama_kv_self_clear(ctx); + if (t.n_depth > 0) { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count, + i + 1, params.reps); + } + test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads); + } + uint64_t t_start = get_time_ns(); if (t.n_prompt > 0) { From 43ddab6eeeaab5a04fe5a364af0bafb0e4d35065 Mon Sep 17 00:00:00 2001 From: Ville Vesilehto Date: Mon, 28 Apr 2025 21:00:20 +0300 Subject: [PATCH 09/13] fix(rpc): Improve input validation and error handling (#13069) * fix(rpc): Improve input validation and error handling The `rpc-server` was vulnerable to Denial of Service attacks via several RPC commands (`SET_TENSOR`, `GRAPH_COMPUTE`, etc.). Malformed messages could trigger failed assertions (e.g., invalid `ggml_type`) or out-of-bounds reads/writes leading to `GGML_ABORT` calls, crashing the server process. This PR introduces robust input validation and replaces `abort()` calls with graceful error handling: - **Type Validation:** `deserialize_tensor` now checks if the `tensor->type` is within the valid `GGML_TYPE_COUNT` range *before* calling `ggml_new_tensor_4d`. Returns `nullptr` on invalid type. - **Bounds Checks:** Replaced `GGML_ABORT` in `set_tensor`, `set_tensor_hash`, and `get_tensor` handlers with error logging and returning `false` when data/offset parameters are out of buffer bounds. 
- **Size Checks:** Added safe arithmetic checks (for overflow) in `graph_compute` when calculating required message sizes based on client-provided `n_nodes` and `n_tensors`. Returns early if the reported sizes conflict with the actual message size or would lead to overflow. - **Error Propagation:** - `create_node` now checks for `nullptr` return values from `deserialize_tensor` and its recursive calls, propagating `nullptr` upwards on failure. Uses `find` instead of `at` for safer map access. - `copy_tensor` now checks for `nullptr` from `deserialize_tensor` and sets the response status to failure if deserialization or bounds checks fail. - `graph_compute` now checks for `nullptr` return from `create_node` and returns failure status correctly. The final return value now reflects the actual computation status. These changes improve the RPC server's resilience against malformed client requests, preventing crashes and ensuring errors are handled more gracefully. Signed-off-by: Ville Vesilehto * refactor(rpc): address pr comments removed comments and unnecessary returns Signed-off-by: Ville Vesilehto * refactor(rpc): ambiguous nullptr from create_node rpc_server::create_node could previously return nullptr if the input ID was 0 (valid) or if an internal error (deserialization, recursion failure) occurred (invalid). This ambiguity made error handling difficult for the caller (`graph_compute`). This commit clarifies the meaning of nullptr: - `graph_compute` now checks if the input 'id' was non-zero when `create_node` returns nullptr, correctly identifying failures versus intentional null links. - `create_node` avoids recursive calls for zero IDs and propagates nullptr unambiguously on failure during recursion. Signed-off-by: Ville Vesilehto * refactor(rpc): initial zero check in create_node The caller (`graph_compute`) already checks `id != 0` when handling a `nullptr` return from `create_node`, correctly distinguishing intentional null links from actual errors. This makes the initial `if (id == 0)` check redundant. Also removes the log message when a tensor ID is not found in the provided map which was added in this branch. Signed-off-by: Ville Vesilehto * fix(rpc): Handle get_alloc_size failure in server Check the return value of `server.get_alloc_size` in the RPC server loop. If the call fails, return early to close the connection. Signed-off-by: Ville Vesilehto * refactor(rpc): input size validation in graph_compute Removes detailed, step-by-step size calculations and overflow checks in favor of simpler direct comparisons, assuming 64-bit overflow is unlikely. Signed-off-by: Ville Vesilehto * refactor(rpc): remove extra status code setting Removes the explicit setting of `response.result = GGML_STATUS_FAILED` when `create_node` returns `nullptr` within `graph_compute`. Primary signal is the `false` return value in case of failure. Signed-off-by: Ville Vesilehto * refactor(rpc): remove redundant check for tensor->type Breaks CI on ubuntu-cpu-make. Tensor type is uint32_t, thus the check is not needed. 
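
To make the validation logic easier to follow outside the diff context, here is a minimal illustrative sketch of the two patterns described above — bounds-checking a requested tensor region, and treating `nullptr` from `create_node` as an error only for non-zero ids. The helper names (`region_in_bounds`, `node_ok`) and the free-function form are assumptions made for the sketch; the actual code lives in the `rpc_server` handlers changed in the diff below.

```cpp
#include <cstddef>
#include <cstdint>

// 1) Bounds check replacing GGML_ABORT in the set_tensor / get_tensor handlers:
//    the requested region [data + offset, data + offset + size) must lie fully
//    inside the destination buffer [p0, p1). Returns true when it does; the
//    real handlers log an error and return false to the client otherwise.
static bool region_in_bounds(uint64_t data, uint64_t offset, size_t size,
                             size_t buf_base, size_t buf_size) {
    const size_t p0 = buf_base;      // start of the target buffer
    const size_t p1 = p0 + buf_size; // one past the end of the buffer
    return !(data + offset < p0 || data + offset >= p1 ||
             size > (p1 - data - offset));
}

// 2) Caller-side disambiguation of create_node(): a nullptr result is only an
//    error when the requested id was non-zero, since id == 0 encodes an
//    intentional null link (e.g. an unused src slot).
static bool node_ok(const void * node, uint64_t id) {
    return node != nullptr || id == 0;
}
```
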
Signed-off-by: Ville Vesilehto --------- Signed-off-by: Ville Vesilehto --- ggml/src/ggml-rpc/ggml-rpc.cpp | 78 +++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 9023eb09196..140a775f980 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -982,8 +982,21 @@ bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) { } ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) { + // Validate tensor type before using it + if (tensor->type >= GGML_TYPE_COUNT) { + GGML_LOG_ERROR("[%s] invalid tensor type received: %u\n", __func__, tensor->type); + return nullptr; + } + ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + + // ggml_new_tensor_4d might fail if dimensions are invalid, although less likely to crash than invalid type + if (result == nullptr) { + GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\\n", __func__, tensor->type); + return nullptr; + } + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { result->nb[i] = tensor->nb[i]; } @@ -1043,7 +1056,9 @@ bool rpc_server::set_tensor(const std::vector & input) { const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer); if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) { - GGML_ABORT("[%s] tensor->data out of bounds\n", __func__); + GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%zu) out of buffer bounds [0x%zx, 0x%zx)\n", + __func__, in_tensor->data, offset, size, p0, p1); + return false; } } @@ -1118,7 +1133,9 @@ bool rpc_server::set_tensor_hash(const std::vector & input, rpc_msg_set const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer); if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) { - GGML_ABORT("[%s] tensor->data out of bounds\n", __func__); + GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%zu, hash=0x%" PRIx64 ") out of buffer bounds [0x%zx, 0x%zx)\n", + __func__, in_tensor->data, offset, size, *hash, p0, p1); + return false; } } ggml_backend_tensor_set(tensor, cached_file.data(), offset, size); @@ -1183,7 +1200,9 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector< if (request.tensor.data + request.offset < p0 || request.tensor.data + request.offset >= p1 || request.size > (p1 - request.tensor.data - request.offset)) { - GGML_ABORT("[%s] tensor->data out of bounds\n", __func__); + GGML_LOG_ERROR("[%s] requested tensor region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%" PRIu64 ") out of buffer bounds [0x%zx, 0x%zx)\n", + __func__, request.tensor.data, request.offset, request.size, p0, p1); + return false; } } @@ -1237,22 +1256,50 @@ ggml_tensor * rpc_server::create_node(uint64_t id, struct ggml_context * ctx, const std::unordered_map & tensor_ptrs, std::unordered_map & tensor_map) { - if (id == 0) { - return nullptr; - } if (tensor_map.find(id) != tensor_map.end()) { return tensor_map[id]; } - const rpc_tensor * tensor = tensor_ptrs.at(id); + // Safely find the tensor pointer + auto it_ptr = tensor_ptrs.find(id); + if (it_ptr == tensor_ptrs.end()) { + return nullptr; + } + const rpc_tensor * tensor = it_ptr->second; + struct ggml_tensor * result = deserialize_tensor(ctx, tensor); if 
(result == nullptr) { return nullptr; } tensor_map[id] = result; for (int i = 0; i < GGML_MAX_SRC; i++) { - result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map); + // Check if the source ID is 0 before calling create_node recursively + if (tensor->src[i] == 0) { + result->src[i] = nullptr; + } else { + result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map); + // If the recursive call failed for a non-zero ID, propagate the error + if (result->src[i] == nullptr) { + GGML_LOG_ERROR("[%s] failed to create source node %d (src_id=%" PRIu64 ") for node id %" PRIu64 "\n", + __func__, i, tensor->src[i], id); + // Must return nullptr to signal failure up the call stack + return nullptr; + } + } + } + + // Handle view_src similarly + if (tensor->view_src == 0) { + result->view_src = nullptr; + } else { + result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map); + // If the recursive call failed for a non-zero ID, propagate the error + if (result->view_src == nullptr) { + GGML_LOG_ERROR("[%s] failed to create view_src node (view_src_id=%" PRIu64 ") for node id %" PRIu64 "\n", + __func__, tensor->view_src, id); + // Must return nullptr to signal failure up the call stack + return nullptr; + } } - result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map); result->view_offs = tensor->view_offs; return result; } @@ -1278,6 +1325,7 @@ bool rpc_server::graph_compute(const std::vector & input, rpc_msg_graph GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors); size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false); + struct ggml_init_params params = { /*.mem_size =*/ buf_size, /*.mem_buffer =*/ NULL, @@ -1297,6 +1345,14 @@ bool rpc_server::graph_compute(const std::vector & input, rpc_msg_graph int64_t id; memcpy(&id, &nodes[i], sizeof(id)); graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map); + + // Check if create_node failed for a *non-zero* ID. + // If id was 0, create_node returning nullptr is expected. + // If id was non-zero and create_node returned nullptr, it indicates a deserialization error. 
+ if (graph->nodes[i] == nullptr && id != 0) { + GGML_LOG_ERROR("[%s] failed to create graph node %d (id=%" PRId64 ")\n", __func__, i, id); + return false; + } } ggml_status status = ggml_backend_graph_compute(backend, graph); response.result = status; @@ -1361,7 +1417,9 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir, return; } rpc_msg_get_alloc_size_rsp response; - server.get_alloc_size(request, response); + if (!server.get_alloc_size(request, response)) { + return; + } if (!send_msg(sockfd, &response, sizeof(response))) { return; } From eaea3253244dc4bbe07f6cd81325847ccc6cf93e Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 28 Apr 2025 21:23:19 +0200 Subject: [PATCH 10/13] clip : fix model size display (#13153) --- examples/llava/clip.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 8c5d56cc17a..a5eb55f4d41 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1574,7 +1574,7 @@ struct clip_model_loader { clip_ctx & ctx_clip; std::string fname; - size_t model_size; // in bytes + size_t model_size = 0; // in bytes // TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) { @@ -1748,6 +1748,8 @@ struct clip_model_loader { LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version); LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); + LOG_INF("%s: use_silu: %d\n", __func__, ctx_clip.use_silu); + LOG_INF("%s: use_gelu: %d\n", __func__, ctx_clip.use_gelu); LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); } From 5f5e39e1ba5dbea814e41f2a15e035d749a520bc Mon Sep 17 00:00:00 2001 From: AT Date: Mon, 28 Apr 2025 15:52:15 -0400 Subject: [PATCH 11/13] model : Nomic Embed Text V2 with Mixture-of-Experts (MoE) architecture (#12466) * Nomic Embed Text V2 with Mixture-of-Experts (MoE) architecture - Adds MoE-based embedding model supporting multilingual embeddings. - Selects architecture variant based on hyperparameter detection (MoE layers). - Removes unnecessary subclass initialization checks for clarity. https://www.nomic.ai/blog/posts/nomic-embed-text-v2 Co-authored-by: Jared Van Bortel * fix tokenizer * don't rename this tensor --------- Co-authored-by: Jared Van Bortel --- convert_hf_to_gguf.py | 227 ++++++++++++++++++++------------- gguf-py/gguf/constants.py | 19 +++ gguf-py/gguf/gguf_writer.py | 3 + gguf-py/gguf/tensor_mapping.py | 4 + src/llama-arch.cpp | 20 +++ src/llama-arch.h | 2 + src/llama-graph.cpp | 25 ++-- src/llama-hparams.h | 1 + src/llama-model.cpp | 56 ++++++-- 9 files changed, 247 insertions(+), 110 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d4fec408dd2..b9cea7e4699 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -78,7 +78,7 @@ class ModelBase: # subclasses should define this! 
model_arch: gguf.MODEL_ARCH - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False, + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, @@ -454,13 +454,6 @@ def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type class TextModel(ModelBase): - @classmethod - def __init_subclass__(cls): - # can't use an abstract property, because overriding it without type errors - # would require using decorated functions instead of simply defining the property - if "model_arch" not in cls.__dict__: - raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") - def set_vocab(self): self._set_vocab_gpt2() @@ -3373,14 +3366,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] - -@ModelBase.register("RobertaModel") -class RobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - + def _xlmroberta_tokenizer_init(self) -> None: # we need the pad_token_id to know how to chop down position_embd matrix if (pad_token_id := self.hparams.get("pad_token_id")) is not None: self._position_offset = 1 + pad_token_id @@ -3389,82 +3375,7 @@ def __init__(self, *args, **kwargs): else: self._position_offset = None - def set_vocab(self): - """Support BPE tokenizers for roberta models""" - bpe_tok_path = self.dir_model / "tokenizer.json" - if bpe_tok_path.exists(): - self._set_vocab_gpt2() - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - - # we need this to validate the size of the token_type embeddings - # though currently we are passing all zeros to the token_type embeddings - # "Sequence A" or "Sequence B" - self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) - - else: - return super().set_vocab() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # if name starts with "roberta.", remove the prefix - # e.g. 
https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main - if name.startswith("roberta."): - name = name[8:] - - # position embeddings start at pad_token_id + 1, so just chop down the weight tensor - if name == "embeddings.position_embeddings.weight": - if self._position_offset is not None: - data_torch = data_torch[self._position_offset:,:] - - return super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("NomicBertModel") -class NomicBertModel(BertModel): - model_arch = gguf.MODEL_ARCH.NOMIC_BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # the HF config claims n_ctx=8192, but it uses RoPE scaling - self.hparams["n_ctx"] = 2048 - - # SwigLU activation - assert self.hparams["activation_function"] == "swiglu" - # this doesn't do anything in the HF version - assert self.hparams["causal"] is False - # no bias tensors - assert self.hparams["qkv_proj_bias"] is False - assert self.hparams["mlp_fc1_bias"] is False - assert self.hparams["mlp_fc2_bias"] is False - # norm at end of layer - assert self.hparams["prenorm"] is False - # standard RoPE - assert self.hparams["rotary_emb_fraction"] == 1.0 - assert self.hparams["rotary_emb_interleaved"] is False - assert self.hparams["rotary_emb_scale_base"] is None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - - -@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") -class XLMRobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # we need the pad_token_id to know how to chop down position_embd matrix - if (pad_token_id := self.hparams.get("pad_token_id")) is not None: - self._position_offset = 1 + pad_token_id - if "max_position_embeddings" in self.hparams: - self.hparams["max_position_embeddings"] -= self._position_offset - else: - self._position_offset = None - - def set_vocab(self): + def _xlmroberta_set_vocab(self) -> None: # to avoid TypeError: Descriptors cannot be created directly # exception when importing sentencepiece_model_pb2 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" @@ -3546,6 +3457,138 @@ def set_vocab(self): self.gguf_writer.add_add_bos_token(True) self.gguf_writer.add_add_eos_token(True) + +@ModelBase.register("RobertaModel") +class RobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def set_vocab(self): + """Support BPE tokenizers for roberta models""" + bpe_tok_path = self.dir_model / "tokenizer.json" + if bpe_tok_path.exists(): + self._set_vocab_gpt2() + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + + else: + return super().set_vocab() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + 
# if name starts with "roberta.", remove the prefix + # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset:,:] + + return super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("NomicBertModel") +class NomicBertModel(BertModel): + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): + hparams = kwargs.pop("hparams", None) + if hparams is None: + hparams = ModelBase.load_hparams(dir_model) + + self.is_moe = bool(hparams.get("moe_every_n_layers")) + self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT + + super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) + + self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta() + if self._tokenizer_is_xlmroberta: + self._xlmroberta_tokenizer_init() + + # the HF config claims n_ctx=8192, but it uses RoPE scaling + self.hparams["n_ctx"] = 2048 + + assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu" + + # this doesn't do anything in the HF version + assert self.hparams["causal"] is False + # no bias tensors unless MoE + assert self.hparams["qkv_proj_bias"] == self.is_moe + assert self.hparams["mlp_fc1_bias"] == self.is_moe + assert self.hparams["mlp_fc2_bias"] == self.is_moe + + # norm at end of layer + assert self.hparams["prenorm"] is False + # standard RoPE + assert self.hparams["rotary_emb_fraction"] == 1.0 + assert self.hparams["rotary_emb_interleaved"] is False + assert self.hparams["rotary_emb_scale_base"] is None + + def set_vocab(self) -> None: + if self._tokenizer_is_xlmroberta: + return self._xlmroberta_set_vocab() + return super().set_vocab() + + def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]: + # If the tensor is an experts bias tensor, skip it by returning an empty list. + if "mlp.experts.bias" in name: + return [] # Explicitly return an empty list. 
+ + if "mlp.experts.mlp.w1" in name: + data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"]) + name += ".weight" + + if "mlp.experts.mlp.w2" in name: + data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"]) + data_torch = data_torch.transpose(1, 2) + name += ".weight" + + return [(self.map_tensor_name(name), data_torch)] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + if self.is_moe: + self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"]) + self.gguf_writer.add_expert_count(self.hparams["num_experts"]) + self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"]) + + def _is_tokenizer_xlmroberta(self) -> bool: + with open(self.dir_model / "tokenizer.json") as f: + tokenizer_json = json.load(f) + toktyp = tokenizer_json["model"]["type"] + if toktyp == "Unigram": + return True + if toktyp == "WordPiece": + return False + raise ValueError(f"unknown tokenizer: {toktyp}") + + +@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") +class XLMRobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._xlmroberta_tokenizer_init() + + def set_vocab(self): + self._xlmroberta_set_vocab() + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # if name starts with "roberta.", remove the prefix # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b81017b1425..326ccdb071a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -104,6 +104,7 @@ class LLM: EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale" EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm" EXPERT_GATING_FUNC = "{arch}.expert_gating_func" + MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers" POOLING_TYPE = "{arch}.pooling_type" LOGIT_SCALE = "{arch}.logit_scale" DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" @@ -267,6 +268,7 @@ class MODEL_ARCH(IntEnum): REFACT = auto() BERT = auto() NOMIC_BERT = auto() + NOMIC_BERT_MOE = auto() JINA_BERT_V2 = auto() BLOOM = auto() STABLELM = auto() @@ -521,6 +523,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.REFACT: "refact", MODEL_ARCH.BERT: "bert", MODEL_ARCH.NOMIC_BERT: "nomic-bert", + MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe", MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.STABLELM: "stablelm", @@ -960,6 +963,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP, MODEL_TENSOR.LAYER_OUT_NORM, ], + MODEL_ARCH.NOMIC_BERT_MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.TOKEN_TYPES, + MODEL_TENSOR.POS_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_OUT_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.LAYER_OUT_NORM, + ], MODEL_ARCH.JINA_BERT_V2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 48e9a470b78..f22a6d4a347 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -728,6 +728,9 @@ def add_expert_weights_norm(self, value: bool) -> None: def add_expert_gating_func(self, value: 
ExpertGatingFuncType) -> None: self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value) + def add_moe_every_n_layers(self, value: int) -> None: + self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value) + def add_swin_norm(self, value: bool) -> None: self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 1d70551973b..311d1ff69c7 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -290,6 +290,7 @@ class TensorNameMap: "transformer.blocks.{bid}.ffn.router.layer", # dbrx "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe "language_model.model.layers.{bid}.feed_forward.router", # llama4 + "encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe ), MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( @@ -322,6 +323,7 @@ class TensorNameMap: "model.layers.layers.{bid}.mlp.up_proj", # plamo "model.layers.{bid}.feed_forward.w3", # internlm2 "encoder.layers.{bid}.mlp.fc11", # nomic-bert + "encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe "model.layers.{bid}.mlp.c_fc", # starcoder2 "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 "model.layers.{bid}.residual_mlp.w3", # arctic @@ -337,6 +339,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged) "language_model.model.layers.{bid}.feed_forward.experts.up_proj", # llama4 + "encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe ), MODEL_TENSOR.FFN_UP_SHEXP: ( @@ -418,6 +421,7 @@ class TensorNameMap: "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged) "language_model.model.layers.{bid}.feed_forward.experts.down_proj", # llama4 + "encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe ), MODEL_TENSOR.FFN_DOWN_SHEXP: ( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 62e1480bb58..f2bc8ca7685 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -19,6 +19,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_BERT, "bert" }, { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, + { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" }, { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, @@ -106,6 +107,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" }, { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" }, { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" }, + { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" }, { LLM_KV_POOLING_TYPE, "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, @@ -472,6 +474,24 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_NOMIC_BERT_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_DOWN_EXPS, 
"blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_JINA_BERT_V2, { diff --git a/src/llama-arch.h b/src/llama-arch.h index 98ca00a1bd0..41a023da3da 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -23,6 +23,7 @@ enum llm_arch { LLM_ARCH_REFACT, LLM_ARCH_BERT, LLM_ARCH_NOMIC_BERT, + LLM_ARCH_NOMIC_BERT_MOE, LLM_ARCH_JINA_BERT_V2, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, @@ -110,6 +111,7 @@ enum llm_kv { LLM_KV_EXPERT_WEIGHTS_SCALE, LLM_KV_EXPERT_WEIGHTS_NORM, LLM_KV_EXPERT_GATING_FUNC, + LLM_KV_MOE_EVERY_N_LAYERS, LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, LLM_KV_DECODER_START_TOKEN_ID, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index e6595fb18bc..2706ea26354 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -925,28 +925,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] cb(up, "ffn_moe_up", il); - ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(gate, "ffn_moe_gate", il); + ggml_tensor * experts = nullptr; + if (gate_exps) { + cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate", il); + } else { + cur = up; + } switch (type_op) { case LLM_FFN_SILU: { - gate = ggml_silu(ctx0, gate); - cb(gate, "ffn_moe_silu", il); + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_moe_silu", il); } break; case LLM_FFN_GELU: { - gate = ggml_gelu(ctx0, gate); - cb(gate, "ffn_moe_gelu", il); + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_moe_gelu", il); } break; default: GGML_ABORT("fatal error"); } - ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] - cb(par, "ffn_moe_gate_par", il); + if (gate_exps) { + cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate_par", il); + } - ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il); if (!weight_before_ffn) { diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 80fcd65df0d..7ee6a5b75ad 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -66,6 +66,7 @@ struct llama_hparams { float expert_weights_scale = 0.0; bool expert_weights_norm = false; uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; + uint32_t moe_every_n_layers = 0; float f_norm_eps; float f_norm_rms_eps; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index df2791002e9..2ec55d55a37 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -695,10 +695,12 @@ void llama_model::load_hparams(llama_model_loader & ml) { } } break; case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); if (hparams.n_layer == 12 && hparams.n_embd == 768) { type = LLM_TYPE_137M; @@ -2057,6 +2059,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), 
{n_embd, n_token_types}, 0); @@ -2090,20 +2093,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0); } + if (arch == LLM_ARCH_NOMIC_BERT_MOE) { + layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0); + } + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - - if (arch == LLM_ARCH_BERT) { + if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) { layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); - layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); - layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0); + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); } else { - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + + if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) { + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); + } else { + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + } } layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0); @@ -5730,6 +5744,11 @@ struct llm_build_bert : public llm_graph_context { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); + if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) { + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); @@ -5782,13 +5801,29 @@ struct llm_build_bert : public llm_graph_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - if (model.arch == LLM_ARCH_BERT) { + if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) { + // MoE branch + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + nullptr, + model.layers[il].ffn_down_exps, + nullptr, + hparams.n_expert, + hparams.n_expert_used, + LLM_FFN_GELU, + false, false, + 0.0f, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); + cb(cur, "ffn_moe_out", il); + } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) { cur = build_ffn(cur, 
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, @@ -5796,6 +5831,7 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); } else { cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, @@ -5803,8 +5839,8 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); } - cb(cur, "ffn_out", il); // attentions bypass the intermediate layer cur = ggml_add(ctx0, cur, ffn_inp); @@ -12843,6 +12879,7 @@ llm_graph_result_ptr llama_model::build_graph( case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { llm = std::make_unique(*this, params, gf); } break; @@ -13201,6 +13238,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_DBRX: case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_STABLELM: case LLM_ARCH_BITNET: case LLM_ARCH_QWEN: From b6ce7430b7eb51f032152316880204e0a9c0470e Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 29 Apr 2025 08:45:49 +0200 Subject: [PATCH 12/13] llama-graph : fix text position for mrope (#13159) * llama-graph : fix text position for mrope * fix typo * explicitly set 4th dim in the loop --- src/llama-graph.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 2706ea26354..fabb9ca2376 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -55,13 +55,16 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; - if (ubatch->token && n_pos_per_embd > 1) { + if (ubatch->token && n_pos_per_embd == 4) { // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D - // the other dimensions are all 0, they are unused for text tokens - std::vector pos_data(n_tokens*n_pos_per_embd, 0); + // the 3 first dims are the same, and 4th dim is all 0 + std::vector pos_data(n_tokens*n_pos_per_embd); // copy the first dimension for (int i = 0; i < n_tokens; ++i) { - pos_data[i] = ubatch->pos[i]; + pos_data[ i] = ubatch->pos[i]; + pos_data[ n_tokens + i] = ubatch->pos[i]; + pos_data[2 * n_tokens + i] = ubatch->pos[i]; + pos_data[3 * n_tokens + i] = 0; // 4th dim is 0 } ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos)); } else { From 52d32c21bc65fc9ea7c6b3224b4825d244af57d7 Mon Sep 17 00:00:00 2001 From: Shalini Salomi Bodapati Date: Tue, 29 Apr 2025 03:10:54 -0500 Subject: [PATCH 13/13] Fix for 13170 Build fails with compilation error on power pc. This patch fixes the same. 
Tested with unit tests run via --build && cd && make test Signed-off-by: Shalini Salomi Bodapati --- ggml/src/ggml-cpu/ops.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 7413192b746..22ea35dfa68 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -6117,7 +6117,7 @@ static void ggml_compute_forward_conv_2d_dw_cwhn( #ifdef GGML_SIMD // Vectorized loop for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) { - GGML_F32_VEC sum = GGML_F32_VEC_ZERO; + GGML_F32_VEC sum[1] = {GGML_F32_VEC_ZERO}; for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { const int64_t src_y = src_y_base + knl_y * p.dilation_y; if (src_y < 0 || src_y >= p.src_h) { @@ -6130,10 +6130,10 @@ static void ggml_compute_forward_conv_2d_dw_cwhn( } GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i); GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i); - sum = GGML_F32_VEC_FMA(sum, k, s); + sum[0] = GGML_F32_VEC_FMA(sum[0], k, s); } } - GGML_F32_VEC_STORE(dst_data + c_i, sum); + GGML_F32_VEC_STORE(dst_data + c_i, sum[0]); } #endif // Scalar loop