Skip to content

Commit bbecac0

Browse files
committed
Fix NPU
1 parent 6be0146 commit bbecac0

4 files changed

Lines changed: 44 additions & 37 deletions

File tree

ggml/src/ggml-openvino/ggml-decoder.cpp

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,11 @@ void GgmlOvDecoder::set_llm_params() {
311311
} else {
312312
m_attention_size = mask->ne[0];
313313
}
314+
if (m_is_static) {
315+
m_attention_size = m_ctx_per_seq;
316+
m_attention_size_swa = m_ctx_per_seq_swa;
317+
m_token_len_per_seq = 1;
318+
}
314319

315320
} else if (node->op == GGML_OP_ROPE) {
316321
if (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0) {
@@ -330,7 +335,7 @@ void GgmlOvDecoder::set_llm_params() {
330335

331336
void GgmlOvDecoder::validate_cgraph() const {
332337
if (m_n_seq > 1 && m_is_static == true) {
333-
throw std::runtime_error("n_seq > 1 is not supported on NPU");
338+
throw std::runtime_error("n_seq > 1 is not supported on NPU. Try setting -np 1.");
334339
}
335340
}
336341

@@ -371,18 +376,24 @@ void GgmlOvDecoder::add_extra_inputs() {
371376
// Extra inputs:
372377
// 1. `attention_size`, used in FLASH_ATTN where the shapes of the matmuls are 256-aligned,
373378
// see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
374-
// Not used for NPU.
375379
// 2. `n_seq_active` and `seq_active_start`, used in FLASH_ATTN_EXT to indicate the active sequences in the batch
376380

377-
auto create_1d_input = [this](const std::string & name, int64_t size) {
378-
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
379-
param_node->set_friendly_name(name);
380-
param_node->output(0).get_tensor().set_names({name});
381-
m_model_extra_inputs[name] = param_node;
382-
383-
auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
384-
*tensor->data<int64_t>() = size;
385-
m_model_extra_input_values[name] = tensor;
381+
auto create_1d_input = [this](const std::string & name, int64_t value) {
382+
if (m_is_static) {
383+
auto constant =
384+
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{value});
385+
constant->set_friendly_name(name);
386+
m_model_extra_inputs[name] = constant;
387+
} else {
388+
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
389+
param_node->set_friendly_name(name);
390+
param_node->output(0).get_tensor().set_names({name});
391+
m_model_extra_inputs[name] = param_node;
392+
393+
auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
394+
*tensor->data<int64_t>() = value;
395+
m_model_extra_input_values[name] = tensor;
396+
}
386397
};
387398

388399
create_1d_input("attention_size", m_attention_size);

ggml/src/ggml-openvino/openvino/op/permute.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,7 @@ OutputVector translate_permute(const NodeContext & context) {
5656
int64_t n_seq = cache_shape[1].get_length();
5757

5858
Output<Node> attention_size;
59-
if (context.is_static()) {
60-
attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX});
61-
} else if (op_case == 2) {
59+
if (op_case == 2) {
6260
attention_size = context.get_input("attention_size");
6361
} else {
6462
attention_size = context.get_input("attention_size_swa");

ggml/src/ggml-openvino/openvino/translate_session.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,9 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
154154
}
155155

156156
for (const auto & it : ggml_model_decoder->get_model_extra_inputs()) {
157-
params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));
157+
if (std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second)) {
158+
params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));
159+
}
158160
(*tensor_map)[it.first] = it.second;
159161
}
160162

ggml/src/ggml-openvino/utils.cpp

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -129,27 +129,22 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
129129
ov_input_names_cache[cgraph] = ov_input_names;
130130
ov_output_names_cache[cgraph] = ov_output_names;
131131

132-
// // Set output tensors (for NPU) and kvcache i/o tensors once and for all
133-
// // Note: does not seem to improve perf on CPU/GPU, but it breaks llama-bench, so disabled it
134-
// for (size_t i = 0; i < ov_output_names.size(); i++) {
135-
// auto output_name = ov_output_names[i];
136-
// if (is_static || output_name.find("cache") == 0) {
137-
// auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
138-
// infer_request->set_output_tensor(i, output_tensor);
139-
// }
140-
// }
141-
// for (size_t i = 0; i < ov_input_names.size(); i++) {
142-
// auto param_name = ov_input_names[i];
143-
// if (param_name.find("cache") == 0) {
144-
// ov::Tensor input_tensor;
145-
// if (is_static) {
146-
// input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
147-
// } else {
148-
// input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
149-
// }
150-
// infer_request->set_input_tensor(i, input_tensor);
151-
// }
152-
// }
132+
// Set output tensors (for NPU) and kvcache i/o tensors once and for all
133+
// Note: this does not seem to improve perf on CPU/GPU and it breaks llama-bench, so it is only enabled for NPU (static)
134+
if (is_static) {
135+
for (size_t i = 0; i < ov_output_names.size(); i++) {
136+
auto output_name = ov_output_names[i];
137+
auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
138+
infer_request->set_output_tensor(i, output_tensor);
139+
}
140+
for (size_t i = 0; i < ov_input_names.size(); i++) {
141+
auto param_name = ov_input_names[i];
142+
if (param_name.find("cache") == 0) {
143+
auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
144+
infer_request->set_input_tensor(i, input_tensor);
145+
}
146+
}
147+
}
153148
}
154149
}
155150

@@ -336,7 +331,8 @@ ov::Tensor get_ov_input_tensor_static(std::shared_ptr<GgmlOvDecoder> ggml_decode
336331
const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
337332
const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);
338333

339-
if (param_name == "inp_pos" || param_name == "inp_tokens" || op->op == GGML_OP_SET_ROWS) {
334+
if (param_name == "inp_pos" || param_name == "inp_tokens" ||
335+
(op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) {
340336
ov::Shape input_shape = {1, 1, 1, 1};
341337
ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape);
342338
// copy the j-th value from ggml_tensor

0 commit comments

Comments
 (0)