diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index f48a87cdc516..940a927c7a54 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -81,13 +81,20 @@ def query_llm( stop_words_list=None, bad_words_list=None, no_repeat_ngram_size=None, - max_output_len=512, - top_k=1, - top_p=0.0, - temperature=1.0, + min_output_len=None, + max_output_len=None, + top_k=None, + top_p=None, + temperature=None, random_seed=None, task_id=None, lora_uids=None, + use_greedy: bool = None, + repetition_penalty: float = None, + add_BOS: bool = None, + all_probs: bool = None, + compute_logprob: bool = None, + end_strings=None, init_timeout=60.0, ): """ @@ -110,6 +117,9 @@ def query_llm( prompts = str_list2numpy(prompts) inputs = {"prompts": prompts} + if min_output_len is not None: + inputs["min_output_len"] = np.full(prompts.shape, min_output_len, dtype=np.int_) + if max_output_len is not None: inputs["max_output_len"] = np.full(prompts.shape, max_output_len, dtype=np.int_) @@ -127,6 +137,7 @@ def query_llm( if stop_words_list is not None: inputs["stop_words_list"] = str_list2numpy(stop_words_list) + if bad_words_list is not None: inputs["bad_words_list"] = str_list2numpy(bad_words_list) @@ -141,12 +152,37 @@ def query_llm( lora_uids = np.char.encode(lora_uids, "utf-8") inputs["lora_uids"] = np.full((prompts.shape[0], len(lora_uids)), lora_uids) + if use_greedy is not None: + inputs["use_greedy"] = np.full(prompts.shape, use_greedy, dtype=np.bool_) + + if repetition_penalty is not None: + inputs["repetition_penalty"] = np.full(prompts.shape, repetition_penalty, dtype=np.single) + + if add_BOS is not None: + inputs["add_BOS"] = np.full(prompts.shape, add_BOS, dtype=np.bool_) + + if all_probs is not None: + inputs["all_probs"] = np.full(prompts.shape, all_probs, dtype=np.bool_) + + if compute_logprob is not None: + inputs["compute_logprob"] = np.full(prompts.shape, compute_logprob, dtype=np.bool_) + + if end_strings is not None: + 
inputs["end_strings"] = str_list2numpy(end_strings) + with ModelClient(self.url, self.model_name, init_timeout_s=init_timeout) as client: result_dict = client.infer_batch(**inputs) output_type = client.model_config.outputs[0].dtype if output_type == np.bytes_: - sentences = np.char.decode(result_dict["outputs"].astype("bytes"), "utf-8") + if "outputs" in result_dict.keys(): + output = result_dict["outputs"] + elif "sentences" in result_dict.keys(): + output = result_dict["sentences"] + else: + return "Unknown output keyword." + + sentences = np.char.decode(output.astype("bytes"), "utf-8") return sentences else: return result_dict["outputs"] diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 835ff46dd5fe..d0854916cd38 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -31,13 +31,6 @@ def get_args(argv): description=f"Deploy nemo models to Triton", ) parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file") - parser.add_argument( - "-dsn", - "--direct_serve_nemo", - default=False, - action='store_true', - help="Serve the nemo model directly instead of exporting to TRTLLM first. 
Will ignore other TRTLLM-specific arguments.", - ) parser.add_argument( "-ptnc", "--ptuning_nemo_checkpoint", @@ -147,6 +140,15 @@ def get_args(argv): action='store_true', help='Use TensorRT LLM C++ runtime', ) + parser.add_argument( + "-b", + '--backend', + nargs='?', + const='TensorRT-LLM', + default='TensorRT-LLM', + choices=['TensorRT-LLM', 'vLLM', 'In-Framework'], + help="Different options to deploy nemo model.", + ) parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode") args = parser.parse_args(argv) @@ -261,7 +263,8 @@ def get_trtllm_deployable(args): def get_nemo_deployable(args): if args.nemo_checkpoint is None: - raise ValueError("Direct serve requires a .nemo checkpoint") + raise ValueError("In-Framework deployment requires a .nemo checkpoint") + return MegatronLLMDeployable(args.nemo_checkpoint, args.num_gpus) @@ -277,7 +280,15 @@ def nemo_deploy(argv): LOGGER.info("Logging level set to {}".format(loglevel)) LOGGER.info(args) - triton_deployable = get_nemo_deployable(args) if args.direct_serve_nemo else get_trtllm_deployable(args) + backend = args.backend.lower() + if backend == 'tensorrt-llm': + triton_deployable = get_trtllm_deployable(args) + elif backend == 'in-framework': + triton_deployable = get_nemo_deployable(args) + elif backend == 'vllm': + raise ValueError("vLLM will be supported in the next release.") + else: + raise ValueError("Backend: {0} is not supported.".format(backend)) try: nm = DeployPyTriton( diff --git a/tests/deploy/nemo_deploy.py b/tests/deploy/nemo_deploy.py new file mode 100644 index 000000000000..f188b6e2bac8 --- /dev/null +++ b/tests/deploy/nemo_deploy.py @@ -0,0 +1,706 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import argparse +import json +import shutil +import time +from pathlib import Path + +import torch + +from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable +from tests.infer_data_path import get_infer_test_data + +run_export_tests = True +try: + from nemo.deploy import DeployPyTriton + from nemo.deploy.nlp import NemoQueryLLM + from nemo.export import TensorRTLLM +except Exception as e: + run_export_tests = False + + +def get_accuracy_with_lambada(model, nq, task_ids, lora_uids, test_data_path=None): + # lambada dataset based accuracy test, which includes more than 5000 sentences. + # Use generated last token with original text's last token for accuracy comparison. + # If the generated last token start with the original token, trtllm_correct make an increment. + # It generates a CSV file for text comparison detail. 
+ + if test_data_path is None: + raise Exception("test_data_path cannot be None.") + + trtllm_correct = 0 + trtllm_deployed_correct = 0 + trtllm_correct_relaxed = 0 + trtllm_deployed_correct_relaxed = 0 + all_expected_outputs = [] + all_trtllm_outputs = [] + + with open(test_data_path, 'r') as file: + records = json.load(file) + + eval_start = time.perf_counter() + for record in records: + prompt = record["text_before_last_word"] + expected_output = record["last_word"].strip().lower() + trtllm_output = model.forward( + input_texts=[prompt], + max_output_len=1, + top_k=1, + top_p=0, + temperature=0.1, + task_ids=task_ids, + lora_uids=lora_uids, + ) + trtllm_output = trtllm_output[0][0].strip().lower() + + all_expected_outputs.append(expected_output) + all_trtllm_outputs.append(trtllm_output) + + if expected_output == trtllm_output: + trtllm_correct += 1 + + if ( + expected_output == trtllm_output + or trtllm_output.startswith(expected_output) + or expected_output.startswith(trtllm_output) + ): + if len(trtllm_output) == 1 and len(expected_output) > 1: + continue + trtllm_correct_relaxed += 1 + + if nq is not None: + trtllm_deployed_output = nq.query_llm( + prompts=[prompt], + max_output_len=1, + top_k=1, + top_p=0, + temperature=0.1, + task_id=task_ids, + ) + trtllm_deployed_output = trtllm_deployed_output[0][0].strip().lower() + + if expected_output == trtllm_deployed_output: + trtllm_deployed_correct += 1 + + if ( + expected_output == trtllm_deployed_output + or trtllm_deployed_output.startswith(expected_output) + or expected_output.startswith(trtllm_deployed_output) + ): + if len(trtllm_deployed_output) == 1 and len(expected_output) > 1: + continue + trtllm_deployed_correct_relaxed += 1 + eval_end = time.perf_counter() + + trtllm_accuracy = trtllm_correct / len(all_expected_outputs) + trtllm_accuracy_relaxed = trtllm_correct_relaxed / len(all_expected_outputs) + + trtllm_deployed_accuracy = trtllm_deployed_correct / len(all_expected_outputs) + 
trtllm_deployed_accuracy_relaxed = trtllm_deployed_correct_relaxed / len(all_expected_outputs) + + evaluation_time = eval_end - eval_start + + return ( + trtllm_accuracy, + trtllm_accuracy_relaxed, + trtllm_deployed_accuracy, + trtllm_deployed_accuracy_relaxed, + evaluation_time, + ) + + +def run_in_framework_inference( + model_name, + prompt, + checkpoint_path, + n_gpu=1, + max_batch_size=None, + max_input_len=None, + max_output_len=None, +): + model = MegatronLLMDeployable(checkpoint_path, n_gpu) + nm = DeployPyTriton( + model=model, + triton_model_name=model_name, + port=8000, + ) + nm.deploy() + nm.run() + nq = NemoQueryLLM(url="localhost:8000", model_name=model_name) + + output_deployed = nq.query_llm( + prompts=prompt, + ) + + print("Output: ", output_deployed) + + nm.stop() + + return None, None, None, None, None + + +def run_trt_llm_inference( + model_name, + model_type, + prompt, + checkpoint_path, + trt_llm_model_dir, + n_gpu=1, + max_batch_size=8, + use_embedding_sharing=False, + max_input_len=128, + max_output_len=128, + ptuning=False, + p_tuning_checkpoint=None, + lora=False, + lora_checkpoint=None, + tp_size=None, + pp_size=None, + top_k=1, + top_p=0.0, + temperature=1.0, + run_accuracy=False, + debug=True, + streaming=False, + stop_words_list=None, + test_deployment=False, + test_data_path=None, + backend="TensorRT-LLM", + save_trt_engine=False, +): + if Path(checkpoint_path).exists(): + if n_gpu > torch.cuda.device_count(): + print( + "Path: {0} and model: {1} with {2} gpus won't be tested since available # of gpus = {3}".format( + checkpoint_path, model_name, n_gpu, torch.cuda.device_count() + ) + ) + return None, None, None, None, None + + Path(trt_llm_model_dir).mkdir(parents=True, exist_ok=True) + + if debug: + print("") + print("") + print( + "################################################## NEW TEST ##################################################" + ) + print("") + + print("Path: {0} and model: {1} with {2} gpus will be 
tested".format(checkpoint_path, model_name, n_gpu)) + + prompt_embeddings_checkpoint_path = None + task_ids = None + max_prompt_embedding_table_size = 0 + + if ptuning: + if Path(p_tuning_checkpoint).exists(): + prompt_embeddings_checkpoint_path = p_tuning_checkpoint + max_prompt_embedding_table_size = 8192 + task_ids = ["0"] + if debug: + print("---- PTuning enabled.") + else: + print("---- PTuning could not be enabled and skipping the test.") + return None, None, None, None, None + + lora_ckpt_list = None + lora_uids = None + use_lora_plugin = None + lora_target_modules = None + + if lora: + if Path(lora_checkpoint).exists(): + lora_ckpt_list = [lora_checkpoint] + lora_uids = ["0", "-1", "0"] + use_lora_plugin = "bfloat16" + lora_target_modules = ["attn_qkv"] + if debug: + print("---- LoRA enabled.") + else: + print("---- LoRA could not be enabled and skipping the test.") + return None, None, None, None, None + + trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list, load_model=False) + + trt_llm_exporter.export( + nemo_checkpoint_path=checkpoint_path, + model_type=model_type, + n_gpus=n_gpu, + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + use_lora_plugin=use_lora_plugin, + lora_target_modules=lora_target_modules, + max_num_tokens=int(max_input_len * max_batch_size * 0.2), + opt_num_tokens=60, + use_embedding_sharing=use_embedding_sharing, + save_nemo_model_config=True, + ) + + if ptuning: + trt_llm_exporter.add_prompt_table( + task_name="0", + prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path, + ) + + output = trt_llm_exporter.forward( + input_texts=prompt, + max_output_len=max_output_len, + top_k=top_k, + top_p=top_p, + temperature=temperature, + task_ids=task_ids, + lora_uids=lora_uids, + streaming=streaming, + stop_words_list=stop_words_list, + ) + + 
if not use_lora_plugin and not ptuning: + test_cpp_runtime( + engine_path=trt_llm_model_dir, + prompt=prompt, + max_output_len=max_output_len, + debug=True, + ) + + nq = None + nm = None + output_deployed = "" + if test_deployment: + nm = DeployPyTriton( + model=trt_llm_exporter, + triton_model_name=model_name, + port=8000, + ) + nm.deploy() + nm.run() + nq = NemoQueryLLM(url="localhost:8000", model_name=model_name) + + output_deployed = nq.query_llm( + prompts=prompt, + max_output_len=max_output_len, + top_k=1, + top_p=0.0, + temperature=1.0, + lora_uids=lora_uids, + ) + + if debug: + print("") + print("--- Prompt: ", prompt) + print("") + print("--- Output: ", output) + print("") + print("") + print("--- Output deployed: ", output_deployed) + print("") + + if run_accuracy: + print("Start model accuracy testing ...") + result = get_accuracy_with_lambada(trt_llm_exporter, nq, task_ids, lora_uids, test_data_path) + if test_deployment: + nm.stop() + + if not save_trt_engine: + shutil.rmtree(trt_llm_model_dir) + return result + + if test_deployment: + nm.stop() + + if not save_trt_engine: + shutil.rmtree(trt_llm_model_dir) + + return None, None, None, None, None + else: + raise Exception("Checkpoint {0} could not be found.".format(checkpoint_path)) + + +def test_cpp_runtime( + engine_path, + prompt, + max_output_len, + debug, +): + trt_llm_exporter = TensorRTLLM(engine_path, load_model=True) + output = trt_llm_exporter.forward( + input_texts=prompt, + max_output_len=max_output_len, + top_k=1, + top_p=0.0, + temperature=1.0, + ) + + if debug: + print("") + print("--- Output deployed with cpp runtime: ", output) + print("") + + +def run_existing_checkpoints( + model_name, + n_gpus, + tp_size=None, + pp_size=None, + ptuning=False, + lora=False, + streaming=False, + run_accuracy=False, + test_deployment=False, + stop_words_list=None, + test_data_path=None, + backend="tensorrt-llm", + save_trt_engine=False, +): + if n_gpus > torch.cuda.device_count(): + print("Skipping the 
test due to not enough number of GPUs") + return None, None, None, None, None + + test_data = get_infer_test_data() + if not (model_name in test_data.keys()): + raise Exception("Model {0} is not supported.".format(model_name)) + + model_info = test_data[model_name] + + if n_gpus < model_info["min_gpus"]: + print("Min n_gpus for this model is {0}".format(model_info["min_gpus"])) + return None, None, None, None, None + + p_tuning_checkpoint = None + if ptuning: + if "p_tuning_checkpoint" in model_info.keys(): + p_tuning_checkpoint = model_info["p_tuning_checkpoint"] + else: + raise Exception("There is no ptuning checkpoint path defined.") + + lora_checkpoint = None + if lora: + if "lora_checkpoint" in model_info.keys(): + lora_checkpoint = model_info["lora_checkpoint"] + else: + raise Exception("There is no lora checkpoint path defined.") + + if model_info["model_type"] == "gemma": + print("*********************") + use_embedding_sharing = True + else: + use_embedding_sharing = False + + if backend == "in-framework": + return run_in_framework_inference( + model_name=model_name, + prompt=model_info["prompt_template"], + checkpoint_path=model_info["checkpoint"], + max_batch_size=model_info["max_batch_size"], + max_input_len=None, + max_output_len=model_info["max_output_len"], + ) + else: + return run_trt_llm_inference( + model_name=model_name, + model_type=model_info["model_type"], + prompt=model_info["prompt_template"], + checkpoint_path=model_info["checkpoint"], + trt_llm_model_dir=model_info["trt_llm_model_dir"], + n_gpu=n_gpus, + max_batch_size=model_info["max_batch_size"], + use_embedding_sharing=use_embedding_sharing, + max_input_len=512, + max_output_len=model_info["max_output_len"], + ptuning=ptuning, + p_tuning_checkpoint=p_tuning_checkpoint, + lora=lora, + lora_checkpoint=lora_checkpoint, + tp_size=tp_size, + pp_size=pp_size, + top_k=1, + top_p=0.0, + temperature=1.0, + run_accuracy=run_accuracy, + debug=True, + streaming=streaming, + stop_words_list=stop_words_list, + 
test_deployment=test_deployment, + test_data_path=test_data_path, + save_trt_engine=save_trt_engine, + ) + + +def get_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=f"Deploy nemo models to Triton and benchmark the models", + ) + + parser.add_argument( + "--model_name", + type=str, + required=True, + ) + parser.add_argument( + "--existing_test_models", + default=False, + action='store_true', + ) + parser.add_argument( + "--model_type", + type=str, + required=False, + ) + parser.add_argument( + "--min_gpus", + type=int, + default=1, + required=True, + ) + parser.add_argument( + "--max_gpus", + type=int, + ) + parser.add_argument( + "--checkpoint_dir", + type=str, + default="/tmp/nemo_checkpoint/", + required=False, + ) + parser.add_argument( + "--trt_llm_model_dir", + type=str, + ) + parser.add_argument( + "--max_batch_size", + type=int, + default=8, + ) + parser.add_argument( + "--max_input_len", + type=int, + default=256, + ) + parser.add_argument( + "--max_output_len", + type=int, + default=128, + ) + parser.add_argument( + "--p_tuning_checkpoint", + type=str, + ) + parser.add_argument( + "--ptuning", + default=False, + action='store_true', + ) + parser.add_argument( + "--lora_checkpoint", + type=str, + ) + parser.add_argument( + "--lora", + default=False, + action='store_true', + ) + parser.add_argument( + "--tp_size", + type=int, + ) + parser.add_argument( + "--pp_size", + type=int, + ) + parser.add_argument( + "--top_k", + type=int, + default=1, + ) + parser.add_argument( + "--top_p", + type=float, + default=0.0, + ) + parser.add_argument( + "--temperature", + type=float, + default=1.0, + ) + parser.add_argument( + "--run_accuracy", + type=str, + default="False", + ) + parser.add_argument("--streaming", default=False, action="store_true") + parser.add_argument( + "--test_deployment", + type=str, + default="False", + ) + parser.add_argument( + "--debug", + default=False, + action='store_true', 
+ ) + parser.add_argument( + "--ci_upload_test_results_to_cloud", + default=False, + action='store_true', + ) + parser.add_argument( + "--test_data_path", + type=str, + default=None, + ) + parser.add_argument( + "-b", + '--backend', + nargs='?', + const='TensorRT-LLM', + default='TensorRT-LLM', + choices=['TensorRT-LLM', 'vLLM', 'In-Framework'], + help="Different options to deploy nemo model.", + ) + parser.add_argument( + "--save_trt_engine", + type=str, + default="False", + ) + + return parser.parse_args() + + +def run_inference_tests(args): + if args.test_deployment == "True": + args.test_deployment = True + else: + args.test_deployment = False + + if args.save_trt_engine == "True": + args.save_trt_engine = True + else: + args.save_trt_engine = False + + if args.run_accuracy == "True": + args.run_accuracy = True + else: + args.run_accuracy = False + + if args.run_accuracy: + if args.test_data_path is None: + raise Exception("test_data_path param cannot be None.") + + result_dic = {} + + if args.existing_test_models: + n_gpus = args.min_gpus + if args.max_gpus is None: + args.max_gpus = args.min_gpus + + while n_gpus <= args.max_gpus: + result_dic[n_gpus] = run_existing_checkpoints( + model_name=args.model_name, + n_gpus=n_gpus, + ptuning=args.ptuning, + lora=args.lora, + tp_size=args.tp_size, + pp_size=args.pp_size, + streaming=args.streaming, + test_deployment=args.test_deployment, + run_accuracy=args.run_accuracy, + test_data_path=args.test_data_path, + backend=args.backend.lower(), + save_trt_engine=args.save_trt_engine, + ) + + n_gpus = n_gpus * 2 + else: + prompt_template = ["The capital of France is", "Largest animal in the sea is"] + n_gpus = args.min_gpus + if args.max_gpus is None: + args.max_gpus = args.min_gpus + + while n_gpus <= args.max_gpus: + if args.backend.lower() == "tensorrt-llm": + result_dic[n_gpus] = run_trt_llm_inference( + model_name=args.model_name, + model_type=args.model_type, + prompt=prompt_template, + checkpoint_path=args.checkpoint_dir, + 
trt_llm_model_dir=args.trt_llm_model_dir, + n_gpu=n_gpus, + max_batch_size=args.max_batch_size, + max_input_len=args.max_input_len, + max_output_len=args.max_output_len, + ptuning=args.ptuning, + p_tuning_checkpoint=args.p_tuning_checkpoint, + lora=args.lora, + lora_checkpoint=args.lora_checkpoint, + tp_size=args.tp_size, + pp_size=args.pp_size, + top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + run_accuracy=args.run_accuracy, + debug=args.debug, + streaming=args.streaming, + test_deployment=args.test_deployment, + test_data_path=args.test_data_path, + save_trt_engine=args.save_trt_engine, + ) + else: + result_dic[n_gpus] = run_in_framework_inference( + model_name=args.model_name, + prompt=prompt_template, + checkpoint_path=args.checkpoint_dir, + n_gpu=n_gpus, + max_batch_size=args.max_batch_size, + max_input_len=args.max_input_len, + max_output_len=args.max_output_len, + ) + + n_gpus = n_gpus * 2 + + test_result = "PASS" + print_separator = False + print("============= Test Summary ============") + for i, results in result_dic.items(): + if not results[0] is None and not results[1] is None: + if print_separator: + print("---------------------------------------") + print( + "Number of GPUS: {}\n" + "Model Accuracy: {:.4f}\n" + "Relaxed Model Accuracy: {:.4f}\n" + "Deployed Model Accuracy: {:.4f}\n" + "Deployed Relaxed Model Accuracy: {:.4f}\n" + "Evaluation Time [s]: {:.2f}".format(i, *results) + ) + print_separator = True + if results[1] < 0.5: + test_result = "FAIL" + + print("=======================================") + print("TEST: " + test_result) + if test_result == "FAIL": + raise Exception("Model accuracy is below 0.5") + + +if __name__ == '__main__': + args = get_args() + run_inference_tests(args) diff --git a/tests/deploy/pytriton_deploy.py b/tests/deploy/pytriton_deploy.py deleted file mode 100644 index 3b722d2d7fec..000000000000 --- a/tests/deploy/pytriton_deploy.py +++ /dev/null @@ -1,136 +0,0 @@ -import argparse - -import numpy 
as np -from pytriton.client import ModelClient - -from nemo.deploy.deploy_pytriton import DeployPyTriton -from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable -from nemo.deploy.nlp.query_llm import NemoTritonQueryLLMPyTorch - - -def test_triton_deployable(args): - megatron_deployable = MegatronLLMDeployable(args.nemo_checkpoint, args.num_gpus) - - prompts = ["What is the biggest planet in the solar system?", "What is the fastest steam locomotive in history?"] - url = "localhost:8000" - model_name = args.model_name - init_timeout = 600.0 - - nm = DeployPyTriton( - model=megatron_deployable, - triton_model_name=model_name, - triton_model_version=1, - max_batch_size=8, - port=8000, - address="0.0.0.0", - streaming=False, - ) - nm.deploy() - nm.run() - - # run once with NemoTritonQueryLLMPyTorch - nemo_triton_query = NemoTritonQueryLLMPyTorch(url, model_name) - - result_dict = nemo_triton_query.query_llm( - prompts, - top_k=args.top_k, - top_p=args.top_p, - temperature=args.temperature, - max_length=args.max_output_token, - init_timeout=init_timeout, - ) - print("NemoTritonQueryLLMPyTriton result:") - print(result_dict) - - # run once with ModelClient, the results should be identical - str_ndarray = np.array(prompts)[..., np.newaxis] - prompts = np.char.encode(str_ndarray, "utf-8") - max_output_token = np.full(prompts.shape, args.max_output_token, dtype=np.int_) - top_k = np.full(prompts.shape, args.top_k, dtype=np.int_) - top_p = np.full(prompts.shape, args.top_p, dtype=np.single) - temperature = np.full(prompts.shape, args.temperature, dtype=np.single) - - with ModelClient(url, model_name, init_timeout_s=init_timeout) as client: - result_dict = client.infer_batch( - prompts=prompts, - max_length=max_output_token, - top_k=top_k, - top_p=top_p, - temperature=temperature, - ) - print("ModelClient result:") - print(result_dict) - - # test logprobs generation - # right now we don't support batches where output data is inconsistent in size, so 
submitting each prompt individually - all_probs = np.full(prompts.shape, True, dtype=np.bool_) - compute_logprob = np.full(prompts.shape, True, dtype=np.bool_) - with ModelClient(url, model_name, init_timeout_s=init_timeout) as client: - logprob_results = client.infer_batch( - prompts=prompts, - max_length=max_output_token, - top_k=top_k, - top_p=top_p, - temperature=temperature, - all_probs=all_probs, - compute_logprob=compute_logprob, - ) - print("Logprob results:") - print(logprob_results) - - nm.stop() - - -def get_args(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description=f"Deploy nemo models to Triton and benchmark the models", - ) - - parser.add_argument( - "--model_name", - type=str, - required=True, - ) - parser.add_argument( - "--num_gpus", - type=int, - default=1, - ) - parser.add_argument( - "--nemo_checkpoint", - type=str, - required=True, - ) - parser.add_argument( - "--max_batch_size", - type=int, - default=8, - ) - parser.add_argument( - "--max_output_token", - type=int, - default=128, - ) - parser.add_argument( - "--top_k", - type=int, - default=1, - ) - parser.add_argument( - "--top_p", - type=float, - default=0.0, - ) - parser.add_argument( - "--temperature", - type=float, - default=1.0, - ) - - return parser.parse_args() - - -if __name__ == '__main__': - args = get_args() - test_triton_deployable(args)