Skip to content

Commit ba1968f

Browse files
Add tps and pps params to the export script (#9558)
* fix minor import bug — Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
* fix export test — Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
* Apply isort and black reformatting — Signed-off-by: oyilmaz-nvidia <oyilmaz-nvidia@users.noreply.github.com>
* remove n_gpus param — Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
* add and fix parameters — Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
* fix deploy script — Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
* Apply isort and black reformatting — Signed-off-by: oyilmaz-nvidia <oyilmaz-nvidia@users.noreply.github.com>
* rename tps and pps params — Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
---------
Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
Signed-off-by: oyilmaz-nvidia <oyilmaz-nvidia@users.noreply.github.com>
Co-authored-by: oyilmaz-nvidia <oyilmaz-nvidia@users.noreply.github.com>
1 parent b7e254e commit ba1968f

File tree

7 files changed

+283
-186
lines changed

7 files changed

+283
-186
lines changed

nemo/export/tensorrt_llm.py

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,8 @@ def export(
119119
model_type: str,
120120
delete_existing_files: bool = True,
121121
n_gpus: int = 1,
122-
tensor_parallel_size: int = None,
123-
pipeline_parallel_size: int = None,
122+
tensor_parallelism_size: int = 1,
123+
pipeline_parallelism_size: int = 1,
124124
gpus_per_node: int = None,
125125
max_input_len: int = 256,
126126
max_output_len: int = 256,
@@ -151,8 +151,8 @@ def export(
151151
model_type (str): type of the model. Currently, "llama", "gptnext", "falcon", and "starcoder" are supported.
152152
delete_existing_files (bool): if Truen, deletes all the files in model_dir.
153153
n_gpus (int): number of GPUs to use for inference.
154-
tensor_parallel_size (int): tensor parallelism.
155-
pipeline_parallel_size (int): pipeline parallelism.
154+
tensor_parallelism_size (int): tensor parallelism.
155+
pipeline_parallelism_size (int): pipeline parallelism.
156156
gpus_per_node (int): number of gpus per node.
157157
max_input_len (int): max input length.
158158
max_output_len (int): max output length.
@@ -176,6 +176,15 @@ def export(
176176
save_nemo_model_config (bool):
177177
"""
178178

179+
if n_gpus is not None:
180+
warnings.warn(
181+
"Parameter n_gpus is deprecated and will be removed in the next release. "
182+
"Please use tensor_parallelism_size and pipeline_parallelism_size parameters instead.",
183+
DeprecationWarning,
184+
stacklevel=2,
185+
)
186+
tensor_parallelism_size = n_gpus
187+
179188
if model_type not in self.get_supported_models_list:
180189
raise Exception(
181190
"Model {0} is not currently a supported model type. "
@@ -188,14 +197,7 @@ def export(
188197
if model_type == "mixtral":
189198
model_type = "llama"
190199

191-
if pipeline_parallel_size is None:
192-
tensor_parallel_size = n_gpus
193-
pipeline_parallel_size = 1
194-
elif tensor_parallel_size is None:
195-
tensor_parallel_size = 1
196-
pipeline_parallel_size = n_gpus
197-
198-
gpus_per_node = tensor_parallel_size if gpus_per_node is None else gpus_per_node
200+
gpus_per_node = tensor_parallelism_size if gpus_per_node is None else gpus_per_node
199201

200202
if Path(self.model_dir).exists():
201203
if delete_existing_files and len(os.listdir(self.model_dir)) > 0:
@@ -253,8 +255,8 @@ def export(
253255
max_output_len=max_output_len,
254256
max_batch_size=max_batch_size,
255257
max_prompt_embedding_table_size=max_prompt_embedding_table_size,
256-
tensor_parallel_size=tensor_parallel_size,
257-
pipeline_parallel_size=pipeline_parallel_size,
258+
tensor_parallel_size=tensor_parallelism_size,
259+
pipeline_parallel_size=pipeline_parallelism_size,
258260
use_parallel_embedding=use_parallel_embedding,
259261
paged_kv_cache=paged_kv_cache,
260262
remove_input_padding=remove_input_padding,
@@ -273,8 +275,8 @@ def export(
273275
nemo_export_dir=nemo_export_dir,
274276
decoder_type=model_type,
275277
dtype=dtype,
276-
tensor_parallel_size=tensor_parallel_size,
277-
pipeline_parallel_size=pipeline_parallel_size,
278+
tensor_parallel_size=tensor_parallelism_size,
279+
pipeline_parallel_size=pipeline_parallelism_size,
278280
gpus_per_node=gpus_per_node,
279281
use_parallel_embedding=use_parallel_embedding,
280282
use_embedding_sharing=use_embedding_sharing,

scripts/deploy/nlp/deploy_triton.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ def get_args(argv):
8383
"-tmr", "--triton_model_repository", default=None, type=str, help="Folder for the trt-llm conversion"
8484
)
8585
parser.add_argument("-ng", "--num_gpus", default=1, type=int, help="Number of GPUs for the deployment")
86+
parser.add_argument("-tps", "--tensor_parallelism_size", default=1, type=int, help="Tensor parallelism size")
87+
parser.add_argument("-pps", "--pipeline_parallelism_size", default=1, type=int, help="Pipeline parallelism size")
8688
parser.add_argument(
8789
"-dt",
8890
"--dtype",
@@ -109,6 +111,13 @@ def get_args(argv):
109111
action='store_true',
110112
help="Disables the remove input padding option.",
111113
)
114+
parser.add_argument(
115+
"-upe",
116+
"--use_parallel_embedding",
117+
default=False,
118+
action='store_true',
119+
help='Use parallel embedding feature of TensorRT-LLM.',
120+
)
112121
parser.add_argument(
113122
"-mbm",
114123
'--multi_block_mode',
@@ -254,13 +263,14 @@ def get_trtllm_deployable(args):
254263
nemo_checkpoint_path=args.nemo_checkpoint,
255264
model_type=args.model_type,
256265
n_gpus=args.num_gpus,
257-
tensor_parallel_size=args.num_gpus,
258-
pipeline_parallel_size=1,
266+
tensor_parallelism_size=args.tensor_parallelism_size,
267+
pipeline_parallelism_size=args.pipeline_parallelism_size,
259268
max_input_len=args.max_input_len,
260269
max_output_len=args.max_output_len,
261270
max_batch_size=args.max_batch_size,
262271
max_num_tokens=args.max_num_tokens,
263272
opt_num_tokens=args.opt_num_tokens,
273+
use_parallel_embedding=args.use_parallel_embedding,
264274
max_prompt_embedding_table_size=args.max_prompt_embedding_table_size,
265275
paged_kv_cache=(not args.no_paged_kv_cache),
266276
remove_input_padding=(not args.disable_remove_input_padding),

scripts/export/export_to_trt_llm.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ def get_args(argv):
4040
"-mr", "--model_repository", required=True, default=None, type=str, help="Folder for the trt-llm model files"
4141
)
4242
parser.add_argument("-ng", "--num_gpus", default=1, type=int, help="Number of GPUs for the deployment")
43-
parser.add_argument("-tps", "--tensor_parallelism_size", type=int, help="Tensor parallelism size")
44-
parser.add_argument("-pps", "--pipeline_parallelism_size", type=int, help="Pipeline parallelism size")
43+
parser.add_argument("-tps", "--tensor_parallelism_size", default=1, type=int, help="Tensor parallelism size")
44+
parser.add_argument("-pps", "--pipeline_parallelism_size", default=1, type=int, help="Pipeline parallelism size")
4545
parser.add_argument(
4646
"-dt",
4747
"--dtype",
@@ -138,8 +138,8 @@ def nemo_export_trt_llm(argv):
138138
nemo_checkpoint_path=args.nemo_checkpoint,
139139
model_type=args.model_type,
140140
n_gpus=args.num_gpus,
141-
tensor_parallel_size=args.tensor_parallelism_size,
142-
pipeline_parallel_size=args.pipeline_parallelism_size,
141+
tensor_parallelism_size=args.tensor_parallelism_size,
142+
pipeline_parallelism_size=args.pipeline_parallelism_size,
143143
max_input_len=args.max_input_len,
144144
max_output_len=args.max_output_len,
145145
max_batch_size=args.max_batch_size,

tests/deploy/nemo_deploy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -241,8 +241,8 @@ def run_trt_llm_inference(
241241
nemo_checkpoint_path=checkpoint_path,
242242
model_type=model_type,
243243
n_gpus=n_gpu,
244-
tensor_parallel_size=tp_size,
245-
pipeline_parallel_size=pp_size,
244+
tensor_parallelism_size=tp_size,
245+
pipeline_parallelism_size=pp_size,
246246
max_input_len=max_input_len,
247247
max_output_len=max_output_len,
248248
max_batch_size=max_batch_size,

0 commit comments

Comments (0)