Fix NPU

wine99 · wine99 · commit bbecac02ed41 · 2025-11-24T11:36:16.000+08:00
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -311,6 +311,11 @@ void GgmlOvDecoder::set_llm_params() {
             } else {
                 m_attention_size = mask->ne[0];
             }
+            if (m_is_static) {
+                m_attention_size = m_ctx_per_seq;
+                m_attention_size_swa = m_ctx_per_seq_swa;
+                m_token_len_per_seq = 1;
+            }
 
         } else if (node->op == GGML_OP_ROPE) {
             if (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0) {
@@ -330,7 +335,7 @@ void GgmlOvDecoder::set_llm_params() {
 
 void GgmlOvDecoder::validate_cgraph() const {
     if (m_n_seq > 1 && m_is_static == true) {
-        throw std::runtime_error("n_seq > 1 is not supported on NPU");
+        throw std::runtime_error("n_seq > 1 is not supported on NPU. Try setting -np 1.");
     }
 }
 
@@ -371,18 +376,24 @@ void GgmlOvDecoder::add_extra_inputs() {
     // Extra inputs:
     // 1. `attention_size`, used in FLASH_ATTN where the shape of the matmul's are 256 aligned,
     //     see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
-    //     Not used for NPU.
     // 2. `n_seq_active` and `seq_active_start`, used in FLASH_ATTN_EXT to indicate the active sequences in the batch
 
-    auto create_1d_input = [this](const std::string & name, int64_t size) {
-        auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
-        param_node->set_friendly_name(name);
-        param_node->output(0).get_tensor().set_names({name});
-        m_model_extra_inputs[name] = param_node;
-
-        auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
-        *tensor->data<int64_t>() = size;
-        m_model_extra_input_values[name] = tensor;
+    auto create_1d_input = [this](const std::string & name, int64_t value) {
+        if (m_is_static) {
+            auto constant =
+                std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{value});
+            constant->set_friendly_name(name);
+            m_model_extra_inputs[name] = constant;
+        } else {
+            auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
+            param_node->set_friendly_name(name);
+            param_node->output(0).get_tensor().set_names({name});
+            m_model_extra_inputs[name] = param_node;
+
+            auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
+            *tensor->data<int64_t>() = value;
+            m_model_extra_input_values[name] = tensor;
+        }
     };
 
     create_1d_input("attention_size", m_attention_size);
diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -56,9 +56,7 @@ OutputVector translate_permute(const NodeContext & context) {
         int64_t n_seq = cache_shape[1].get_length();
 
         Output<Node> attention_size;
-        if (context.is_static()) {
-            attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX});
-        } else if (op_case == 2) {
+        if (op_case == 2) {
             attention_size = context.get_input("attention_size");
         } else {
             attention_size = context.get_input("attention_size_swa");
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -154,7 +154,9 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
     }
 
     for (const auto & it : ggml_model_decoder->get_model_extra_inputs()) {
-        params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));
+        if (std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second)) {
+            params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));
+        }
         (*tensor_map)[it.first] = it.second;
     }
 
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
@@ -129,27 +129,22 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
             ov_input_names_cache[cgraph] = ov_input_names;
             ov_output_names_cache[cgraph] = ov_output_names;
 
-            // // Set output tensors (for NPU) and kvcache i/o tensors once and for all
-            // // Note: does not seem to improve perf on CPU/GPU, but it breaks llama-bench, so disabled it
-            // for (size_t i = 0; i < ov_output_names.size(); i++) {
-            //     auto output_name = ov_output_names[i];
-            //     if (is_static || output_name.find("cache") == 0) {
-            //         auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
-            //         infer_request->set_output_tensor(i, output_tensor);
-            //     }
-            // }
-            // for (size_t i = 0; i < ov_input_names.size(); i++) {
-            //     auto param_name = ov_input_names[i];
-            //     if (param_name.find("cache") == 0) {
-            //         ov::Tensor input_tensor;
-            //         if (is_static) {
-            //             input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
-            //         } else {
-            //             input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
-            //         }
-            //         infer_request->set_input_tensor(i, input_tensor);
-            //     }
-            // }
+            // Set the output tensors and the kvcache input tensors once and for all (NPU / static case only).
+            // Note: on CPU/GPU this does not seem to improve perf and it breaks llama-bench, so it is skipped there.
+            if (is_static) {
+                for (size_t i = 0; i < ov_output_names.size(); i++) {
+                    auto output_name = ov_output_names[i];
+                    auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
+                    infer_request->set_output_tensor(i, output_tensor);
+                }
+                for (size_t i = 0; i < ov_input_names.size(); i++) {
+                    auto param_name = ov_input_names[i];
+                    if (param_name.find("cache") == 0) {
+                        auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
+                        infer_request->set_input_tensor(i, input_tensor);
+                    }
+                }
+            }
         }
     }
 
@@ -336,7 +331,8 @@ ov::Tensor get_ov_input_tensor_static(std::shared_ptr<GgmlOvDecoder> ggml_decode
     const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
     const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);
 
-    if (param_name == "inp_pos" || param_name == "inp_tokens" || op->op == GGML_OP_SET_ROWS) {
+    if (param_name == "inp_pos" || param_name == "inp_tokens" ||
+        (op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) {
         ov::Shape input_shape = {1, 1, 1, 1};
         ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape);
         // copy the j-th value from ggml_tensor

Original file line number	Diff line number	Diff line change
`@@ -154,7 +154,9 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo`
`154`	`154`	`}`
`155`	`155`
`156`	`156`	`for (const auto & it : ggml_model_decoder->get_model_extra_inputs()) {`
`157`		`- params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));`
	`157`	`+ if (std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second)) {`
	`158`	`+ params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));`
	`159`	`+ }`
`158`	`160`	`(*tensor_map)[it.first] = it.second;`
`159`	`161`	`}`
`160`	`162`