Merged
2 changes: 1 addition & 1 deletion docs/source/index.rst
Original file line number Diff line number Diff line change
@@ -12,7 +12,7 @@ NVIDIA NeMo Framework is an end-to-end, cloud-native framework designed to build
- Flash Attention
- Activation Recomputation
- Positional Embeddings and Positional Interpolation
-- Post-Training Quantization (PTQ) with Ammo
+- Post-Training Quantization (PTQ) with ModelOpt
- Sequence Packing

`NVIDIA NeMo Framework <https://github.com/NVIDIA/NeMo>`_ has separate collections for:
40 changes: 20 additions & 20 deletions docs/source/multimodal/text2img/sdxl_quantization.rst
@@ -1,11 +1,11 @@
Stable Diffusion XL Int8 Quantization
=======================================

-This example shows how to use Ammo to calibrate and quantize the UNet part of the SDXL. The UNet part typically consumes
+This example shows how to use ModelOpt to calibrate and quantize the UNet part of the SDXL. The UNet part typically consumes
>95% of the e2e Stable Diffusion latency.

We also provide instructions on deploying and running E2E SDXL pipeline
-with Ammo quantized int8 UNet to generate images and measure latency on target GPUs.
+with ModelOpt quantized int8 UNet to generate images and measure latency on target GPUs.

To get started, it is required to have a pretrained SDXL checkpoint in ``nemo`` format. The example training configs are provided in NeMo,
which is located in ``NeMo/examples/multimodal/text2img/stable_diffusion``.
@@ -104,31 +104,31 @@ GPU: H100
TRT int8 vs Framework fp16
^^^^^^^^^^^^^^^^^^^^^^^^^^^

-+---------------------+------------+-------------+----------------+------------+---------+------------+
-| Pipeline | Batch Size | Latency (ms)| Pipeline | Batch Size | Latency | Speedup |
-+=====================+============+=============+================+============+=========+============+
-| Framework fp16 base | 1 | 3056.01 | Ammo TRT Int8 | 1 | 1406.68 | 2.172498365|
-+---------------------+------------+-------------+----------------+------------+---------+------------+
-| Framework fp16 base | 2 | 4832.24 | Ammo TRT Int8 | 2 | 2403.29 | 2.01067703 |
-+---------------------+------------+-------------+----------------+------------+---------+------------+
-| Framework fp16 base | 4 | 8433.71 | Ammo TRT Int8 | 4 | 4252.6 | 1.983189108|
-+---------------------+------------+-------------+----------------+------------+---------+------------+
++---------------------+------------+-------------+--------------------+------------+---------+------------+
+| Pipeline | Batch Size | Latency (ms)| Pipeline | Batch Size | Latency | Speedup |
++=====================+============+=============+====================+============+=========+============+
+| Framework fp16 base | 1 | 3056.01 | ModelOpt TRT Int8 | 1 | 1406.68 | 2.172498365|
++---------------------+------------+-------------+--------------------+------------+---------+------------+
+| Framework fp16 base | 2 | 4832.24 | ModelOpt TRT Int8 | 2 | 2403.29 | 2.01067703 |
++---------------------+------------+-------------+--------------------+------------+---------+------------+
+| Framework fp16 base | 4 | 8433.71 | ModelOpt TRT Int8 | 4 | 4252.6 | 1.983189108|
++---------------------+------------+-------------+--------------------+------------+---------+------------+



TRT int8 vs TRT fp16
^^^^^^^^^^^^^^^^^^^^^^^


-+-------------+------------+--------------+-----------+------------+------------+-------------+
-| Pipeline | Batch Size | Latency (ms) | Precision | Batch Size | Latency | Speedup |
-+=============+============+==============+===========+============+============+=============+
-| fp16 base | 1 | 1723.97 | Ammo Int8 | 1 | 1406.68 | 1.225559473 |
-+-------------+------------+--------------+-----------+------------+------------+-------------+
-| fp16 base | 2 | 3004.47 | Ammo Int8 | 2 | 2403.29 | 1.250148754 |
-+-------------+------------+--------------+-----------+------------+------------+-------------+
-| fp16 base | 4 | 5657.19 | Ammo Int8 | 4 | 4252.6 | 1.330289705 |
-+-------------+------------+--------------+-----------+------------+------------+-------------+
++-------------+------------+--------------+---------------+------------+------------+-------------+
+| Pipeline | Batch Size | Latency (ms) | Precision | Batch Size | Latency | Speedup |
++=============+============+==============+===============+============+============+=============+
+| fp16 base | 1 | 1723.97 | ModelOpt Int8 | 1 | 1406.68 | 1.225559473 |
++-------------+------------+--------------+---------------+------------+------------+-------------+
+| fp16 base | 2 | 3004.47 | ModelOpt Int8 | 2 | 2403.29 | 1.250148754 |
++-------------+------------+--------------+---------------+------------+------------+-------------+
+| fp16 base | 4 | 5657.19 | ModelOpt Int8 | 4 | 4252.6 | 1.330289705 |
++-------------+------------+--------------+---------------+------------+------------+-------------+
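The Speedup columns in both tables above are simply latency ratios. A quick sanity check, using the latencies copied from the tables (numbers only; nothing here touches the actual pipelines):

```python
# Recompute the Speedup columns: speedup = baseline latency / int8 latency.
# Latencies in milliseconds, copied from the two tables above.
framework_fp16 = {1: 3056.01, 2: 4832.24, 4: 8433.71}  # Framework fp16 base
trt_fp16 = {1: 1723.97, 2: 3004.47, 4: 5657.19}        # TRT fp16 base
trt_int8 = {1: 1406.68, 2: 2403.29, 4: 4252.6}         # ModelOpt TRT int8

speedup_vs_framework = {bs: framework_fp16[bs] / trt_int8[bs] for bs in trt_int8}
speedup_vs_trt_fp16 = {bs: trt_fp16[bs] / trt_int8[bs] for bs in trt_int8}

for bs in (1, 2, 4):
    print(bs, round(speedup_vs_framework[bs], 3), round(speedup_vs_trt_fp16[bs], 3))
# speedups vs framework ≈ 2.172, 2.011, 1.983; vs TRT fp16 ≈ 1.226, 1.250, 1.330
```

Note that int8 buys roughly a further 1.2–1.3x on top of TRT fp16, while most of the 2x headline speedup over the framework baseline comes from TRT itself.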


FP16 inference vs Int8 inference
20 changes: 11 additions & 9 deletions examples/multimodal/multimodal_llm/neva/neva_evaluation.py
@@ -24,13 +24,13 @@


try:
-    import ammo.torch.quantization as atq
+    import modelopt.torch.quantization as mtq

-    HAVE_AMMO = True
+    HAVE_MODELOPT = True

except (ImportError, ModuleNotFoundError):

-    HAVE_AMMO = False
+    HAVE_MODELOPT = False

if not torch.cuda.is_available():
raise EnvironmentError("GPU is needed for the inference")
@@ -41,7 +41,9 @@ def __init__(self, sentences):
super().__init__()
self.sentences = sentences

-    def __len__(self,):
+    def __len__(
+        self,
+    ):
return len(self.sentences)

def __getitem__(self, idx):
@@ -99,14 +101,14 @@ def main(cfg) -> None:
)

# =================== Start Quantization ====================
-    if HAVE_AMMO and cfg.quantization.enable == True:
+    if HAVE_MODELOPT and cfg.quantization.enable == True:
print(f"Using quantization algorithm: {cfg.quantization.algorithm}")
if cfg.quantization.algorithm == "int8_sq":
-            atq_config = atq.INT8_SMOOTHQUANT_CFG
+            mtq_config = mtq.INT8_SMOOTHQUANT_CFG
elif cfg.quantization.algorithm == "fp8":
-            atq_config = atq.FP8_DEFAULT_CFG
+            mtq_config = mtq.FP8_DEFAULT_CFG
elif cfg.quantization.algorithm == "awq":
-            atq_config = atq.INT4_AWQ_CFG
+            mtq_config = mtq.INT4_AWQ_CFG
else:
raise ValueError(f"Unsupported quantization algorithm: {cfg.quantization.algorithm}")

@@ -118,7 +120,7 @@ def forward_loop():
inference_config=cfg,
)

-        atq.quantize(model, atq_config, forward_loop)
+        mtq.quantize(model, mtq_config, forward_loop)

responses = model.generate(
input_prompts=final_prompts,
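The if/elif chain in this hunk maps the `cfg.quantization.algorithm` string to a ModelOpt config preset. The same dispatch can be written as a table lookup; in this sketch the values are the `modelopt.torch.quantization` attribute names as strings (so it runs without ModelOpt installed), whereas the real script assigns the config objects themselves:

```python
# Hypothetical table-driven version of the algorithm -> config dispatch above.
# Values name modelopt.torch.quantization (mtq) preset attributes.
ALGORITHM_TO_CFG = {
    "int8_sq": "INT8_SMOOTHQUANT_CFG",
    "fp8": "FP8_DEFAULT_CFG",
    "awq": "INT4_AWQ_CFG",
}

def resolve_quant_cfg(algorithm: str) -> str:
    """Return the mtq config name for an algorithm, mirroring the if/elif chain."""
    if algorithm not in ALGORITHM_TO_CFG:
        raise ValueError(f"Unsupported quantization algorithm: {algorithm}")
    return ALGORITHM_TO_CFG[algorithm]

print(resolve_quant_cfg("fp8"))  # FP8_DEFAULT_CFG
```

Keeping the mapping in one dict makes adding a new algorithm a one-line change and leaves the unsupported-algorithm error in a single place.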
@@ -15,10 +15,10 @@
import os
from pathlib import Path

-import ammo.torch.opt as ato
-import ammo.torch.quantization as atq
+import modelopt.torch.opt as mto
+import modelopt.torch.quantization as mtq
import torch
-from ammo.torch.quantization.nn import QuantModuleRegistry
+from modelopt.torch.quantization.nn import QuantModuleRegistry
from torch.onnx import export as onnx_export

from nemo.collections.multimodal.models.text_to_image.stable_diffusion.diffusion_engine import MegatronDiffusionEngine
@@ -92,7 +92,7 @@ def model_cfg_modifier(model_cfg):
QuantModuleRegistry.register({LinearWrapper: "nemo_linear_wrapper"})(_QuantNeMoLinearWrapper)

if cfg.run_quantization:
-        # Start quantization with ammo
+        # Start quantization with ModelOpt

cali_prompts = load_calib_prompts(
cfg.quantize.batch_size,
@@ -124,15 +124,15 @@ def forward_loop():
num_samples=cfg.infer.num_samples,
)

-        atq.quantize(base.model.model.diffusion_model, quant_config, forward_loop)
-        ato.save(base.model.model.diffusion_model, cfg.quantize.quantized_ckpt)
+        mtq.quantize(base.model.model.diffusion_model, quant_config, forward_loop)
+        mto.save(base.model.model.diffusion_model, cfg.quantize.quantized_ckpt)

if cfg.run_onnx_export:
os.makedirs(cfg.onnx_export.onnx_dir, exist_ok=True)
output = Path(f"{cfg.onnx_export.onnx_dir}/unet.onnx")
# Export quantized model to ONNX
if not cfg.run_quantization:
-            ato.restore(base.model.model.diffusion_model, cfg.onnx_export.quantized_ckpt)
+            mto.restore(base.model.model.diffusion_model, cfg.onnx_export.quantized_ckpt)
quantize_lvl(base.model.model.diffusion_model, cfg.quantize.quant_level)

# QDQ needs to be in FP32
@@ -14,8 +14,8 @@

import torch

-from ammo.torch.quantization import utils as quant_utils
-from ammo.torch.quantization.calib.max import MaxCalibrator
+from modelopt.torch.quantization import utils as quant_utils
+from modelopt.torch.quantization.calib.max import MaxCalibrator


class PercentileCalibrator(MaxCalibrator):
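`MaxCalibrator` tracks the running maximum of absolute activation values; a percentile calibrator instead takes a high percentile of the observed magnitudes so that rare outliers do not inflate amax and waste int8 range. A library-agnostic sketch of the idea (this is not the ModelOpt implementation, just the underlying statistic):

```python
# Percentile-based amax over collected activation magnitudes.
def percentile_amax(values, percentile=99.9):
    """Nearest-rank percentile of |values|; clips outliers a plain max would keep."""
    mags = sorted(abs(v) for v in values)
    if not mags:
        raise ValueError("no calibration data collected")
    k = max(0, min(len(mags) - 1, round(percentile / 100 * len(mags)) - 1))
    return mags[k]

data = [0.1, -0.5, 0.3, 12.0] + [0.2] * 96   # one large outlier among 100 samples
print(max(abs(v) for v in data))  # 12.0 (plain max keeps the outlier)
print(percentile_amax(data, 99))  # 0.5  (outlier clipped)
```

Clipping amax this way trades a tiny amount of saturation on outliers for much finer int8 resolution on the bulk of the distribution.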
@@ -14,7 +14,7 @@

import re
import torch
-from ammo.torch.quantization.nn import QuantLinear, QuantLinearConvBase
+from modelopt.torch.quantization.nn import QuantLinear, QuantLinearConvBase

from nemo.collections.multimodal.modules.stable_diffusion.attention import LinearWrapper
from .plugin_calib import PercentileCalibrator
@@ -110,7 +110,7 @@ def get_int8_config(model, quant_level=3, alpha=0.8, percentile=1.0, num_inferen
def quantize_lvl(unet, quant_level=2.5):
"""
We should disable the unwanted quantizer when exporting the onnx
-    Because in the current ammo setting, it will load the quantizer amax for all the layers even
+    Because in the current ModelOpt setting, it will load the quantizer amax for all the layers even
if we didn't add that unwanted layer into the config during the calibration
"""
for name, module in unet.named_modules():
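As the docstring explains, `quantize_lvl` walks `unet.named_modules()` and disables quantizers that were not part of the calibration config, because ModelOpt otherwise loads amax for every layer at ONNX export. A minimal stand-in for that filtering walk (toy `Quantizer` class and made-up `skip_patterns`; the real code matches ModelOpt quantizer modules and NeMo layer names):

```python
import re

class Quantizer:
    """Toy stand-in for a ModelOpt quantizer module."""
    def __init__(self):
        self.enabled = True
    def disable(self):
        self.enabled = False

def disable_unwanted(named_modules, skip_patterns=(r"\.out_proj$", r"time_embed")):
    """Disable quantizers whose module name matches any skip pattern."""
    disabled = []
    for name, module in named_modules:
        if isinstance(module, Quantizer) and any(re.search(p, name) for p in skip_patterns):
            module.disable()
            disabled.append(name)
    return disabled

mods = [("input_blocks.0.attn.to_q", Quantizer()), ("time_embed.0", Quantizer())]
print(disable_unwanted(mods))  # ['time_embed.0']
```

Disabling (rather than deleting) the quantizer keeps the module graph intact, so the exported ONNX simply omits Q/DQ nodes for the skipped layers.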
48 changes: 24 additions & 24 deletions tutorials/multimodal/SDXL Quantization.ipynb

Large diffs are not rendered by default.