diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index f48a87cdc516..940a927c7a54 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -81,13 +81,20 @@ def query_llm( stop_words_list=None, bad_words_list=None, no_repeat_ngram_size=None, - max_output_len=512, - top_k=1, - top_p=0.0, - temperature=1.0, + min_output_len=None, + max_output_len=None, + top_k=None, + top_p=None, + temperature=None, random_seed=None, task_id=None, lora_uids=None, + use_greedy: bool = None, + repetition_penalty: float = None, + add_BOS: bool = None, + all_probs: bool = None, + compute_logprob: bool = None, + end_strings=None, init_timeout=60.0, ): """ @@ -110,6 +117,9 @@ def query_llm( prompts = str_list2numpy(prompts) inputs = {"prompts": prompts} + if min_output_len is not None: + inputs["min_output_len"] = np.full(prompts.shape, min_output_len, dtype=np.int_) + if max_output_len is not None: inputs["max_output_len"] = np.full(prompts.shape, max_output_len, dtype=np.int_) @@ -127,6 +137,7 @@ def query_llm( if stop_words_list is not None: inputs["stop_words_list"] = str_list2numpy(stop_words_list) + if bad_words_list is not None: inputs["bad_words_list"] = str_list2numpy(bad_words_list) @@ -141,12 +152,37 @@ def query_llm( lora_uids = np.char.encode(lora_uids, "utf-8") inputs["lora_uids"] = np.full((prompts.shape[0], len(lora_uids)), lora_uids) + if use_greedy is not None: + inputs["use_greedy"] = np.full(prompts.shape, use_greedy, dtype=np.bool_) + + if repetition_penalty is not None: + inputs["repetition_penalty"] = np.full(prompts.shape, repetition_penalty, dtype=np.single) + + if add_BOS is not None: + inputs["add_BOS"] = np.full(prompts.shape, add_BOS, dtype=np.bool_) + + if all_probs is not None: + inputs["all_probs"] = np.full(prompts.shape, all_probs, dtype=np.bool_) + + if compute_logprob is not None: + inputs["compute_logprob"] = np.full(prompts.shape, compute_logprob, dtype=np.bool_) + + if end_strings is not None: + 
inputs["end_strings"] = str_list2numpy(end_strings) + with ModelClient(self.url, self.model_name, init_timeout_s=init_timeout) as client: result_dict = client.infer_batch(**inputs) output_type = client.model_config.outputs[0].dtype if output_type == np.bytes_: - sentences = np.char.decode(result_dict["outputs"].astype("bytes"), "utf-8") + if "outputs" in result_dict.keys(): + output = result_dict["outputs"] + elif "sentences" in result_dict.keys(): + output = result_dict["sentences"] + else: + return "Unknown output keyword." + + sentences = np.char.decode(output.astype("bytes"), "utf-8") return sentences else: return result_dict["outputs"] diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py index 835ff46dd5fe..d0854916cd38 100755 --- a/scripts/deploy/nlp/deploy_triton.py +++ b/scripts/deploy/nlp/deploy_triton.py @@ -31,13 +31,6 @@ def get_args(argv): description=f"Deploy nemo models to Triton", ) parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file") - parser.add_argument( - "-dsn", - "--direct_serve_nemo", - default=False, - action='store_true', - help="Serve the nemo model directly instead of exporting to TRTLLM first. 
Will ignore other TRTLLM-specific arguments.", - ) parser.add_argument( "-ptnc", "--ptuning_nemo_checkpoint", @@ -147,6 +140,15 @@ def get_args(argv): action='store_true', help='Use TensorRT LLM C++ runtime', ) + parser.add_argument( + "-b", + '--backend', + nargs='?', + const='TensorRT-LLM', + default='TensorRT-LLM', + choices=['TensorRT-LLM', 'vLLM', 'In-Framework'], + help="Different options to deploy nemo model.", + ) parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode") args = parser.parse_args(argv) @@ -261,7 +263,8 @@ def get_trtllm_deployable(args): def get_nemo_deployable(args): if args.nemo_checkpoint is None: - raise ValueError("Direct serve requires a .nemo checkpoint") + raise ValueError("In-Framework deployment requires a .nemo checkpoint") + return MegatronLLMDeployable(args.nemo_checkpoint, args.num_gpus) @@ -277,7 +280,15 @@ def nemo_deploy(argv): LOGGER.info("Logging level set to {}".format(loglevel)) LOGGER.info(args) - triton_deployable = get_nemo_deployable(args) if args.direct_serve_nemo else get_trtllm_deployable(args) + backend = args.backend.lower() + if backend == 'tensorrt-llm': + triton_deployable = get_trtllm_deployable(args) + elif backend == 'in-framework': + triton_deployable = get_nemo_deployable(args) + elif backend == 'vllm': + raise ValueError("vLLM will be supported in the next release.") + else: + raise ValueError("Backend: {0} is not supported.".format(backend)) try: nm = DeployPyTriton( diff --git a/tests/deploy/nemo_deploy.py b/tests/deploy/nemo_deploy.py new file mode 100644 index 000000000000..f188b6e2bac8 --- /dev/null +++ b/tests/deploy/nemo_deploy.py @@ -0,0 +1,706 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import argparse +import json +import shutil +import time +from pathlib import Path + +import torch + +from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable +from tests.infer_data_path import get_infer_test_data + +run_export_tests = True +try: + from nemo.deploy import DeployPyTriton + from nemo.deploy.nlp import NemoQueryLLM + from nemo.export import TensorRTLLM +except Exception as e: + run_export_tests = False + + +def get_accuracy_with_lambada(model, nq, task_ids, lora_uids, test_data_path=None): + # lambada dataset based accuracy test, which includes more than 5000 sentences. + # Use generated last token with original text's last token for accuracy comparison. + # If the generated last token start with the original token, trtllm_correct make an increment. + # It generates a CSV file for text comparison detail. 
+ + if test_data_path is None: + raise Exception("test_data_path cannot be None.") + + trtllm_correct = 0 + trtllm_deployed_correct = 0 + trtllm_correct_relaxed = 0 + trtllm_deployed_correct_relaxed = 0 + all_expected_outputs = [] + all_trtllm_outputs = [] + + with open(test_data_path, 'r') as file: + records = json.load(file) + + eval_start = time.perf_counter() + for record in records: + prompt = record["text_before_last_word"] + expected_output = record["last_word"].strip().lower() + trtllm_output = model.forward( + input_texts=[prompt], + max_output_len=1, + top_k=1, + top_p=0, + temperature=0.1, + task_ids=task_ids, + lora_uids=lora_uids, + ) + trtllm_output = trtllm_output[0][0].strip().lower() + + all_expected_outputs.append(expected_output) + all_trtllm_outputs.append(trtllm_output) + + if expected_output == trtllm_output: + trtllm_correct += 1 + + if ( + expected_output == trtllm_output + or trtllm_output.startswith(expected_output) + or expected_output.startswith(trtllm_output) + ): + if len(trtllm_output) == 1 and len(expected_output) > 1: + continue + trtllm_correct_relaxed += 1 + + if nq is not None: + trtllm_deployed_output = nq.query_llm( + prompts=[prompt], + max_output_len=1, + top_k=1, + top_p=0, + temperature=0.1, + task_id=task_ids, + ) + trtllm_deployed_output = trtllm_deployed_output[0][0].strip().lower() + + if expected_output == trtllm_deployed_output: + trtllm_deployed_correct += 1 + + if ( + expected_output == trtllm_deployed_output + or trtllm_deployed_output.startswith(expected_output) + or expected_output.startswith(trtllm_deployed_output) + ): + if len(trtllm_deployed_output) == 1 and len(expected_output) > 1: + continue + trtllm_deployed_correct_relaxed += 1 + eval_end = time.perf_counter() + + trtllm_accuracy = trtllm_correct / len(all_expected_outputs) + trtllm_accuracy_relaxed = trtllm_correct_relaxed / len(all_expected_outputs) + + trtllm_deployed_accuracy = trtllm_deployed_correct / len(all_expected_outputs) + 
trtllm_deployed_accuracy_relaxed = trtllm_deployed_correct_relaxed / len(all_expected_outputs) + + evaluation_time = eval_end - eval_start + + return ( + trtllm_accuracy, + trtllm_accuracy_relaxed, + trtllm_deployed_accuracy, + trtllm_deployed_accuracy_relaxed, + evaluation_time, + ) + + +def run_in_framework_inference( + model_name, + prompt, + checkpoint_path, + n_gpu=1, + max_batch_size=None, + max_input_len=None, + max_output_len=None, +): + model = MegatronLLMDeployable(checkpoint_path, n_gpu) + nm = DeployPyTriton( + model=model, + triton_model_name=model_name, + port=8000, + ) + nm.deploy() + nm.run() + nq = NemoQueryLLM(url="localhost:8000", model_name=model_name) + + output_deployed = nq.query_llm( + prompts=prompt, + ) + + print("Output: ", output_deployed) + + nm.stop() + + return None, None, None, None, None + + +def run_trt_llm_inference( + model_name, + model_type, + prompt, + checkpoint_path, + trt_llm_model_dir, + n_gpu=1, + max_batch_size=8, + use_embedding_sharing=False, + max_input_len=128, + max_output_len=128, + ptuning=False, + p_tuning_checkpoint=None, + lora=False, + lora_checkpoint=None, + tp_size=None, + pp_size=None, + top_k=1, + top_p=0.0, + temperature=1.0, + run_accuracy=False, + debug=True, + streaming=False, + stop_words_list=None, + test_deployment=False, + test_data_path=None, + backend="TensorRT-LLM", + save_trt_engine=False, +): + if Path(checkpoint_path).exists(): + if n_gpu > torch.cuda.device_count(): + print( + "Path: {0} and model: {1} with {2} gpus won't be tested since available # of gpus = {3}".format( + checkpoint_path, model_name, n_gpu, torch.cuda.device_count() + ) + ) + return None, None, None, None, None + + Path(trt_llm_model_dir).mkdir(parents=True, exist_ok=True) + + if debug: + print("") + print("") + print( + "################################################## NEW TEST ##################################################" + ) + print("") + + print("Path: {0} and model: {1} with {2} gpus will be 
tested".format(checkpoint_path, model_name, n_gpu)) + + prompt_embeddings_checkpoint_path = None + task_ids = None + max_prompt_embedding_table_size = 0 + + if ptuning: + if Path(p_tuning_checkpoint).exists(): + prompt_embeddings_checkpoint_path = p_tuning_checkpoint + max_prompt_embedding_table_size = 8192 + task_ids = ["0"] + if debug: + print("---- PTuning enabled.") + else: + print("---- PTuning could not be enabled and skipping the test.") + return None, None, None, None, None + + lora_ckpt_list = None + lora_uids = None + use_lora_plugin = None + lora_target_modules = None + + if lora: + if Path(lora_checkpoint).exists(): + lora_ckpt_list = [lora_checkpoint] + lora_uids = ["0", "-1", "0"] + use_lora_plugin = "bfloat16" + lora_target_modules = ["attn_qkv"] + if debug: + print("---- LoRA enabled.") + else: + print("---- LoRA could not be enabled and skipping the test.") + return None, None, None, None, None + + trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list, load_model=False) + + trt_llm_exporter.export( + nemo_checkpoint_path=checkpoint_path, + model_type=model_type, + n_gpus=n_gpu, + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + use_lora_plugin=use_lora_plugin, + lora_target_modules=lora_target_modules, + max_num_tokens=int(max_input_len * max_batch_size * 0.2), + opt_num_tokens=60, + use_embedding_sharing=use_embedding_sharing, + save_nemo_model_config=True, + ) + + if ptuning: + trt_llm_exporter.add_prompt_table( + task_name="0", + prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path, + ) + + output = trt_llm_exporter.forward( + input_texts=prompt, + max_output_len=max_output_len, + top_k=top_k, + top_p=top_p, + temperature=temperature, + task_ids=task_ids, + lora_uids=lora_uids, + streaming=streaming, + stop_words_list=stop_words_list, + ) + + 
if not use_lora_plugin and not ptuning: + test_cpp_runtime( + engine_path=trt_llm_model_dir, + prompt=prompt, + max_output_len=max_output_len, + debug=True, + ) + + nq = None + nm = None + output_deployed = "" + if test_deployment: + nm = DeployPyTriton( + model=trt_llm_exporter, + triton_model_name=model_name, + port=8000, + ) + nm.deploy() + nm.run() + nq = NemoQueryLLM(url="localhost:8000", model_name=model_name) + + output_deployed = nq.query_llm( + prompts=prompt, + max_output_len=max_output_len, + top_k=1, + top_p=0.0, + temperature=1.0, + lora_uids=lora_uids, + ) + + if debug: + print("") + print("--- Prompt: ", prompt) + print("") + print("--- Output: ", output) + print("") + print("") + print("--- Output deployed: ", output_deployed) + print("") + + if run_accuracy: + print("Start model accuracy testing ...") + result = get_accuracy_with_lambada(trt_llm_exporter, nq, task_ids, lora_uids, test_data_path) + if test_deployment: + nm.stop() + + if not save_trt_engine: + shutil.rmtree(trt_llm_model_dir) + return result + + if test_deployment: + nm.stop() + + if not save_trt_engine: + shutil.rmtree(trt_llm_model_dir) + + return None, None, None, None, None + else: + raise Exception("Checkpoint {0} could not be found.".format(checkpoint_path)) + + +def test_cpp_runtime( + engine_path, + prompt, + max_output_len, + debug, +): + trt_llm_exporter = TensorRTLLM(engine_path, load_model=True) + output = trt_llm_exporter.forward( + input_texts=prompt, + max_output_len=max_output_len, + top_k=1, + top_p=0.0, + temperature=1.0, + ) + + if debug: + print("") + print("--- Output deployed with cpp runtime: ", output) + print("") + + +def run_existing_checkpoints( + model_name, + n_gpus, + tp_size=None, + pp_size=None, + ptuning=False, + lora=False, + streaming=False, + run_accuracy=False, + test_deployment=False, + stop_words_list=None, + test_data_path=None, + backend="tensorrt-llm", + save_trt_engine=False, +): + if n_gpus > torch.cuda.device_count(): + print("Skipping the 
test due to not enough number of GPUs") + return None, None, None, None, None + + test_data = get_infer_test_data() + if not (model_name in test_data.keys()): + raise Exception("Model {0} is not supported.".format(model_name)) + + model_info = test_data[model_name] + + if n_gpus < model_info["min_gpus"]: + print("Min n_gpus for this model is {0}".format(model_info["min_gpus"])) + return None, None, None, None, None + + p_tuning_checkpoint = None + if ptuning: + if "p_tuning_checkpoint" in model_info.keys(): + p_tuning_checkpoint = model_info["p_tuning_checkpoint"] + else: + raise Exception("There is no ptuning checkpoint path defined.") + + lora_checkpoint = None + if lora: + if "lora_checkpoint" in model_info.keys(): + lora_checkpoint = model_info["lora_checkpoint"] + else: + raise Exception("There is no lora checkpoint path defined.") + + if model_info["model_type"] == "gemma": + print("*********************") + use_embedding_sharing = True + else: + use_embedding_sharing = False + + if backend == "in-framework": + return run_in_framework_inference( + model_name=model_name, + prompt=model_info["prompt_template"], + checkpoint_path=model_info["checkpoint"], + max_batch_size=model_info["max_batch_size"], + max_input_len=None, + max_output_len=model_info["max_output_len"], + ) + else: + return run_trt_llm_inference( + model_name=model_name, + model_type=model_info["model_type"], + prompt=model_info["prompt_template"], + checkpoint_path=model_info["checkpoint"], + trt_llm_model_dir=model_info["trt_llm_model_dir"], + n_gpu=n_gpus, + max_batch_size=model_info["max_batch_size"], + use_embedding_sharing=use_embedding_sharing, + max_input_len=512, + max_output_len=model_info["max_output_len"], + ptuning=ptuning, + p_tuning_checkpoint=p_tuning_checkpoint, + lora=lora, + lora_checkpoint=lora_checkpoint, + tp_size=tp_size, + pp_size=pp_size, + top_k=1, + top_p=0.0, + temperature=1.0, + run_accuracy=run_accuracy, + debug=True, + streaming=streaming, + stop_words_list=stop_words_list, + 
test_deployment=test_deployment, + test_data_path=test_data_path, + save_trt_engine=save_trt_engine, + ) + + +def get_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=f"Deploy nemo models to Triton and benchmark the models", + ) + + parser.add_argument( + "--model_name", + type=str, + required=True, + ) + parser.add_argument( + "--existing_test_models", + default=False, + action='store_true', + ) + parser.add_argument( + "--model_type", + type=str, + required=False, + ) + parser.add_argument( + "--min_gpus", + type=int, + default=1, + required=True, + ) + parser.add_argument( + "--max_gpus", + type=int, + ) + parser.add_argument( + "--checkpoint_dir", + type=str, + default="/tmp/nemo_checkpoint/", + required=False, + ) + parser.add_argument( + "--trt_llm_model_dir", + type=str, + ) + parser.add_argument( + "--max_batch_size", + type=int, + default=8, + ) + parser.add_argument( + "--max_input_len", + type=int, + default=256, + ) + parser.add_argument( + "--max_output_len", + type=int, + default=128, + ) + parser.add_argument( + "--p_tuning_checkpoint", + type=str, + ) + parser.add_argument( + "--ptuning", + default=False, + action='store_true', + ) + parser.add_argument( + "--lora_checkpoint", + type=str, + ) + parser.add_argument( + "--lora", + default=False, + action='store_true', + ) + parser.add_argument( + "--tp_size", + type=int, + ) + parser.add_argument( + "--pp_size", + type=int, + ) + parser.add_argument( + "--top_k", + type=int, + default=1, + ) + parser.add_argument( + "--top_p", + type=float, + default=0.0, + ) + parser.add_argument( + "--temperature", + type=float, + default=1.0, + ) + parser.add_argument( + "--run_accuracy", + type=str, + default="False", + ) + parser.add_argument("--streaming", default=False, action="store_true") + parser.add_argument( + "--test_deployment", + type=str, + default="False", + ) + parser.add_argument( + "--debug", + default=False, + action='store_true', 
+ ) + parser.add_argument( + "--ci_upload_test_results_to_cloud", + default=False, + action='store_true', + ) + parser.add_argument( + "--test_data_path", + type=str, + default=None, + ) + parser.add_argument( + "-b", + '--backend', + nargs='?', + const='TensorRT-LLM', + default='TensorRT-LLM', + choices=['TensorRT-LLM', 'vLLM', 'In-Framework'], + help="Different options to deploy nemo model.", + ) + parser.add_argument( + "--save_trt_engine", + type=str, + default="False", + ) + + return parser.parse_args() + + +def run_inference_tests(args): + if args.test_deployment == "True": + args.test_deployment = True + else: + args.test_deployment = False + + if args.save_trt_engine == "True": + args.save_trt_engine = True + else: + args.save_trt_engine = False + + if args.run_accuracy == "True": + args.run_accuracy = True + else: + args.run_accuracy = False + + if args.run_accuracy: + if args.test_data_path is None: + raise Exception("test_data_path param cannot be None.") + + result_dic = {} + + if args.existing_test_models: + n_gpus = args.min_gpus + if args.max_gpus is None: + args.max_gpus = args.min_gpus + + while n_gpus <= args.max_gpus: + result_dic[n_gpus] = run_existing_checkpoints( + model_name=args.model_name, + n_gpus=n_gpus, + ptuning=args.ptuning, + lora=args.lora, + tp_size=args.tp_size, + pp_size=args.pp_size, + streaming=args.streaming, + test_deployment=args.test_deployment, + run_accuracy=args.run_accuracy, + test_data_path=args.test_data_path, + backend=args.backend.lower(), + save_trt_engine=args.save_trt_engine, + ) + + n_gpus = n_gpus * 2 + else: + prompt_template = ["The capital of France is", "Largest animal in the sea is"] + n_gpus = args.min_gpus + if args.max_gpus is None: + args.max_gpus = args.min_gpus + + while n_gpus <= args.max_gpus: + if args.backend.lower() == "tensorrt-llm": + result_dic[n_gpus] = run_trt_llm_inference( + model_name=args.model_name, + model_type=args.model_type, + prompt=prompt_template, + checkpoint_path=args.checkpoint_dir, + 
trt_llm_model_dir=args.trt_llm_model_dir, + n_gpu=n_gpus, + max_batch_size=args.max_batch_size, + max_input_len=args.max_input_len, + max_output_len=args.max_output_len, + ptuning=args.ptuning, + p_tuning_checkpoint=args.p_tuning_checkpoint, + lora=args.lora, + lora_checkpoint=args.lora_checkpoint, + tp_size=args.tp_size, + pp_size=args.pp_size, + top_k=args.top_k, + top_p=args.top_p, + temperature=args.temperature, + run_accuracy=args.run_accuracy, + debug=args.debug, + streaming=args.streaming, + test_deployment=args.test_deployment, + test_data_path=args.test_data_path, + save_trt_engine=args.save_trt_engine, + ) + else: + result_dic[n_gpus] = run_in_framework_inference( + model_name=args.model_name, + prompt=prompt_template, + checkpoint_path=args.checkpoint_dir, + n_gpu=n_gpus, + max_batch_size=args.max_batch_size, + max_input_len=args.max_input_len, + max_output_len=args.max_output_len, + ) + + n_gpus = n_gpus * 2 + + test_result = "PASS" + print_separator = False + print("============= Test Summary ============") + for i, results in result_dic.items(): + if not results[0] is None and not results[1] is None: + if print_separator: + print("---------------------------------------") + print( + "Number of GPUS: {}\n" + "Model Accuracy: {:.4f}\n" + "Relaxed Model Accuracy: {:.4f}\n" + "Deployed Model Accuracy: {:.4f}\n" + "Deployed Relaxed Model Accuracy: {:.4f}\n" + "Evaluation Time [s]: {:.2f}".format(i, *results) + ) + print_separator = True + if results[1] < 0.5: + test_result = "FAIL" + + print("=======================================") + print("TEST: " + test_result) + if test_result == "FAIL": + raise Exception("Model accuracy is below 0.5") + + +if __name__ == '__main__': + args = get_args() + run_inference_tests(args) diff --git a/tests/deploy/pytriton_deploy.py b/tests/deploy/pytriton_deploy.py deleted file mode 100644 index 3b722d2d7fec..000000000000 --- a/tests/deploy/pytriton_deploy.py +++ /dev/null @@ -1,136 +0,0 @@ -import argparse - -import numpy 
as np -from pytriton.client import ModelClient - -from nemo.deploy.deploy_pytriton import DeployPyTriton -from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable -from nemo.deploy.nlp.query_llm import NemoTritonQueryLLMPyTorch - - -def test_triton_deployable(args): - megatron_deployable = MegatronLLMDeployable(args.nemo_checkpoint, args.num_gpus) - - prompts = ["What is the biggest planet in the solar system?", "What is the fastest steam locomotive in history?"] - url = "localhost:8000" - model_name = args.model_name - init_timeout = 600.0 - - nm = DeployPyTriton( - model=megatron_deployable, - triton_model_name=model_name, - triton_model_version=1, - max_batch_size=8, - port=8000, - address="0.0.0.0", - streaming=False, - ) - nm.deploy() - nm.run() - - # run once with NemoTritonQueryLLMPyTorch - nemo_triton_query = NemoTritonQueryLLMPyTorch(url, model_name) - - result_dict = nemo_triton_query.query_llm( - prompts, - top_k=args.top_k, - top_p=args.top_p, - temperature=args.temperature, - max_length=args.max_output_token, - init_timeout=init_timeout, - ) - print("NemoTritonQueryLLMPyTriton result:") - print(result_dict) - - # run once with ModelClient, the results should be identical - str_ndarray = np.array(prompts)[..., np.newaxis] - prompts = np.char.encode(str_ndarray, "utf-8") - max_output_token = np.full(prompts.shape, args.max_output_token, dtype=np.int_) - top_k = np.full(prompts.shape, args.top_k, dtype=np.int_) - top_p = np.full(prompts.shape, args.top_p, dtype=np.single) - temperature = np.full(prompts.shape, args.temperature, dtype=np.single) - - with ModelClient(url, model_name, init_timeout_s=init_timeout) as client: - result_dict = client.infer_batch( - prompts=prompts, - max_length=max_output_token, - top_k=top_k, - top_p=top_p, - temperature=temperature, - ) - print("ModelClient result:") - print(result_dict) - - # test logprobs generation - # right now we don't support batches where output data is inconsistent in size, so 
submitting each prompt individually - all_probs = np.full(prompts.shape, True, dtype=np.bool_) - compute_logprob = np.full(prompts.shape, True, dtype=np.bool_) - with ModelClient(url, model_name, init_timeout_s=init_timeout) as client: - logprob_results = client.infer_batch( - prompts=prompts, - max_length=max_output_token, - top_k=top_k, - top_p=top_p, - temperature=temperature, - all_probs=all_probs, - compute_logprob=compute_logprob, - ) - print("Logprob results:") - print(logprob_results) - - nm.stop() - - -def get_args(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description=f"Deploy nemo models to Triton and benchmark the models", - ) - - parser.add_argument( - "--model_name", - type=str, - required=True, - ) - parser.add_argument( - "--num_gpus", - type=int, - default=1, - ) - parser.add_argument( - "--nemo_checkpoint", - type=str, - required=True, - ) - parser.add_argument( - "--max_batch_size", - type=int, - default=8, - ) - parser.add_argument( - "--max_output_token", - type=int, - default=128, - ) - parser.add_argument( - "--top_k", - type=int, - default=1, - ) - parser.add_argument( - "--top_p", - type=float, - default=0.0, - ) - parser.add_argument( - "--temperature", - type=float, - default=1.0, - ) - - return parser.parse_args() - - -if __name__ == '__main__': - args = get_args() - test_triton_deployable(args)