@@ -129,27 +129,22 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
129129 ov_input_names_cache[cgraph] = ov_input_names;
130130 ov_output_names_cache[cgraph] = ov_output_names;
131131
132- // // Set output tensors (for NPU) and kvcache i/o tensors once and for all
133- // // Note: does not seem to improve perf on CPU/GPU, but it breaks llama-bench, so disabled it
134- // for (size_t i = 0; i < ov_output_names.size(); i++) {
135- // auto output_name = ov_output_names[i];
136- // if (is_static || output_name.find("cache") == 0) {
137- // auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
138- // infer_request->set_output_tensor(i, output_tensor);
139- // }
140- // }
141- // for (size_t i = 0; i < ov_input_names.size(); i++) {
142- // auto param_name = ov_input_names[i];
143- // if (param_name.find("cache") == 0) {
144- // ov::Tensor input_tensor;
145- // if (is_static) {
146- // input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
147- // } else {
148- // input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
149- // }
150- // infer_request->set_input_tensor(i, input_tensor);
151- // }
152- // }
132+ // Set output tensors (for NPU) and kvcache i/o tensors once and for all
133+ // Note: this does not seem to improve perf on CPU/GPU and it breaks llama-bench, so it is disabled for CPU/GPU
134+ if (is_static) {
135+ for (size_t i = 0 ; i < ov_output_names.size (); i++) {
136+ auto output_name = ov_output_names[i];
137+ auto output_tensor = get_ov_output_tensor (ggml_decoder, ov_output_names[i]);
138+ infer_request->set_output_tensor (i, output_tensor);
139+ }
140+ for (size_t i = 0 ; i < ov_input_names.size (); i++) {
141+ auto param_name = ov_input_names[i];
142+ if (param_name.find (" cache" ) == 0 ) {
143+ auto input_tensor = get_ov_input_tensor_static (ggml_decoder, param_name, 0 , 0 );
144+ infer_request->set_input_tensor (i, input_tensor);
145+ }
146+ }
147+ }
153148 }
154149 }
155150
@@ -336,7 +331,8 @@ ov::Tensor get_ov_input_tensor_static(std::shared_ptr<GgmlOvDecoder> ggml_decode
336331 const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor (param_name);
337332 const auto * op = ggml_decoder->get_tensor_used_op (ggml_tensor);
338333
339- if (param_name == " inp_pos" || param_name == " inp_tokens" || op->op == GGML_OP_SET_ROWS) {
334+ if (param_name == " inp_pos" || param_name == " inp_tokens" ||
335+ (op->op == GGML_OP_SET_ROWS && op->src [1 ] == ggml_tensor)) {
340336 ov::Shape input_shape = {1 , 1 , 1 , 1 };
341337 ov::Tensor input_tensor (ggml_decoder->get_input_type (param_name), input_shape);
342338 // copy the j-th value from ggml_tensor
0 commit comments