* llama : remove quantization sanity check
This commit removes the quantization sanity check for attention layers.
The motivation for this is that there are hybrid models that combine
recurrent layers, expert layers, and attention layers. For these models
the current check fails because the expert layers are not taken into
account. After consideration, it was decided that this check is not
strictly necessary and can be removed to allow for more flexible model
architectures. A sketch of the failure mode follows.
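
To make the failure mode concrete, below is a minimal standalone sketch,
assuming a check that derives the expected number of attention layers from
the total layer count minus the recurrent layers. This is not the removed
llama.cpp code; `layer_kind`, the layer mix, and the counting logic are
illustrative assumptions.

```cpp
// Hypothetical sketch (not the actual llama.cpp code) of why a per-layer
// attention-tensor count breaks down for hybrid architectures.
#include <cstdint>
#include <cstdio>
#include <vector>

enum class layer_kind { attention, recurrent, experts };

int main() {
    // hypothetical hybrid model mixing all three layer kinds
    const std::vector<layer_kind> layers = {
        layer_kind::attention, layer_kind::recurrent,
        layer_kind::experts,   layer_kind::attention,
    };

    int32_t n_attention_wv = 0; // attention V tensors seen during quantization
    int32_t n_recurrent    = 0;
    for (const auto k : layers) {
        if (k == layer_kind::attention) n_attention_wv++;
        if (k == layer_kind::recurrent) n_recurrent++;
    }

    // a check in this style subtracts only the recurrent layers, so the
    // expert layers are still expected to contribute an attention tensor
    const int32_t n_attn_layer_expected = (int32_t) layers.size() - n_recurrent;

    if (n_attention_wv != n_attn_layer_expected) {
        // for the hybrid model above: expected 3, found 2
        std::printf("sanity check fails: expected %d attention layers, found %d\n",
                    n_attn_layer_expected, n_attention_wv);
    }
    return 0;
}
```

Under this assumed counting scheme, the expert layers inflate the expected
count, so the assertion fires on an otherwise valid hybrid model, which is
why dropping the check is the simpler option.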
* llama : remove unused pruned_attention_w and is_clip_model vars