Commit e57a462

Merge branch 'main' of https://github.com/NVIDIA/NeMo into hillst-bn2/add-pipelineparallel-dtype

2 parents: a507623 + 3f7e828

File tree: 7 files changed, +154 −160 lines


.github/workflows/cicd-main.yml

Lines changed: 14 additions & 14 deletions
@@ -213,10 +213,10 @@ jobs:
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
-        python examples/nlp/language_modeling/megatron_quantization.py \
-            model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+        python examples/nlp/language_modeling/megatron_gpt_quantization.py \
+            model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
             quantization.algorithm=null \
-            model_save=/home/TestData/nlp/megatron_llama/ci_baseline
+            export.save_path=/home/TestData/nlp/megatron_llama/ci_baseline
       AFTER_SCRIPT: |
         rm -rf /home/TestData/nlp/megatron_llama/ci_baseline
@@ -226,16 +226,16 @@ jobs:
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
-        python examples/nlp/language_modeling/megatron_quantization.py \
-            model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
-            tensor_model_parallel_size=2 \
+        python examples/nlp/language_modeling/megatron_gpt_quantization.py \
+            model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+            model.tensor_model_parallel_size=2 \
             trainer.devices=2 \
             quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
             quantization.algorithm=fp8 \
             quantization.num_calib_size=8 \
             inference.batch_size=2 \
             export.inference_tensor_parallel=2 \
-            model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
+            export.save_path=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
       AFTER_SCRIPT: |
         rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo
@@ -245,13 +245,13 @@ jobs:
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
-        python examples/nlp/language_modeling/megatron_quantization.py \
-            model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+        python examples/nlp/language_modeling/megatron_gpt_quantization.py \
+            model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
             quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
             quantization.algorithm=int8_sq \
             quantization.num_calib_size=8 \
             inference.batch_size=2 \
-            model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
+            export.save_path=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
       AFTER_SCRIPT: |
         rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
@@ -274,15 +274,15 @@ jobs:
 #      - name: Checkout repository
 #        uses: actions/checkout@v4
 #      - run: |
-#          python examples/nlp/language_modeling/megatron_quantization.py \
-#              model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
-#              tensor_model_parallel_size=1 \
+#          python examples/nlp/language_modeling/megatron_gpt_quantization.py \
+#              model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+#              model.tensor_model_parallel_size=1 \
 #              trainer.devices=1 \
 #              quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
 #              quantization.algorithm=int4_awq \
 #              quantization.num_calib_size=8 \
 #              inference.batch_size=2 \
-#              model_save=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
+#              export.save_path=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
 #
 #          rm -rf /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
 #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
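The overrides in these CI jobs moved from flat keys (`model_file=...`, `model_save=...`) to nested, Hydra-style dotted keys (`model.restore_from_path=...`, `export.save_path=...`). As a rough illustration of what dotted overrides do to a nested config, here is a minimal stdlib sketch (the `apply_overrides` helper is hypothetical; Hydra's real override grammar is much richer):

```python
def apply_overrides(config, overrides):
    """Apply Hydra-style dotted key=value overrides to a nested dict (illustrative sketch only)."""
    for item in overrides:
        key, _, value = item.partition("=")  # split on the first '=' only
        node = config
        parts = key.split(".")
        for part in parts[:-1]:
            node = node.setdefault(part, {})  # create intermediate sections as needed
        node[parts[-1]] = value
    return config

cfg = {"model": {"tensor_model_parallel_size": 1}}
apply_overrides(cfg, ["model.restore_from_path=llama_ci.nemo", "quantization.algorithm=null"])
```

Note that the sketch keeps every value as a string; Hydra additionally type-converts values against the config schema.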

docs/source/nlp/quantization.rst

Lines changed: 5 additions & 5 deletions
@@ -73,17 +73,17 @@ The script must be launched correctly with the number of processes equal to tens

 .. code-block:: bash

-    torchrun --nproc-per-node 8 examples/nlp/language_modeling/megatron_quantization.py \
-        model_file=llama2-70b-base-bf16.nemo \
-        tensor_model_parallel_size=8 \
-        pipeline_model_parallel_size=1 \
+    torchrun --nproc-per-node 8 examples/nlp/language_modeling/megatron_gpt_quantization.py \
+        model.restore_from_path=llama2-70b-base-bf16.nemo \
+        model.tensor_model_parallel_size=8 \
+        model.pipeline_model_parallel_size=1 \
         trainer.num_nodes=1 \
         trainer.devices=8 \
         trainer.precision=bf16 \
         quantization.algorithm=fp8 \
         export.decoder_type=llama \
         export.inference_tensor_parallel=2 \
-        model_save=llama2-70b-base-fp8-qnemo
+        export.save_path=llama2-70b-base-fp8-qnemo
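The docs hunk keeps the requirement that the script be launched with a process count matching the model-parallel layout (here `torchrun --nproc-per-node 8` for TP=8, PP=1). Assuming no data parallelism during calibration, that count is simply TP x PP; a tiny sketch (the helper name is hypothetical):

```python
def required_world_size(tensor_model_parallel_size, pipeline_model_parallel_size):
    """Processes torchrun must launch: one rank per model-parallel shard (assumes data-parallel size 1)."""
    return tensor_model_parallel_size * pipeline_model_parallel_size

# The documented llama2-70b example: TP=8, PP=1 -> 8 processes on one node.
world_size = required_world_size(8, 1)
```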

examples/nlp/language_modeling/conf/megatron_quantization.yaml renamed to examples/nlp/language_modeling/conf/megatron_gpt_quantization.yaml

Lines changed: 15 additions & 10 deletions
@@ -20,21 +20,26 @@ trainer:
   precision: bf16 # 16, 32, or bf16
   enable_checkpointing: false

+model:
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  restore_from_path: llama2-7b-fp16.nemo # Nemo file path
+
+  ## Activation Checkpoint
+  activations_checkpoint_granularity: null # 'selective' or 'full'
+  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
+
 quantization:
-  quantize_bmm1: false
-  algorithm: fp8 # int8_sq, fp8, int8, int4_awq, null
+  decoder_type: ${export.decoder_type} # gptnext, gpt2, llama
+  algorithm: fp8 # null, int8_sq, fp8, int4_awq
   calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset
   num_calib_size: 512 # number of samples used for calibration
-  awq_block_size: 128 # block size for scaling factors in AWQ algorithm
-  alpha: 1.0 # alpha parameter in SmoothQuant algorithm
+  awq_block_size: 128 # block size for scaling factors (only used in AWQ algorithms)
+  sq_alpha: 1.0 # alpha parameter (only used in SmoothQuant algorithms)

 export:
   decoder_type: llama # gptnext, gpt2, llama
   inference_tensor_parallel: 1 # Default using 1 TP for inference
   inference_pipeline_parallel: 1 # Default using 1 PP for inference
-  dtype: bf16 # Default precision data type
-
-model_file: llama2-7b-fp16.nemo # Nemo file path
-model_save: llama2-7b-fp8.qnemo # Path where the quantized model will be saved
-tensor_model_parallel_size: 1
-pipeline_model_parallel_size: 1
+  dtype: ${trainer.precision} # Default precision data type
+  save_path: llama2-7b-${quantization.algorithm}.qnemo # Path where the quantized model will be saved
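The reworked config leans on OmegaConf interpolation: `export.dtype` now tracks `${trainer.precision}` and the save path embeds `${quantization.algorithm}`. A toy stdlib resolver sketches how such references expand (this is not OmegaConf's implementation; `resolve` is a hypothetical helper, and real interpolation also supports nesting, custom resolvers, and escapes):

```python
import re

def resolve(cfg):
    """Expand simple ${section.key} references in string values (toy model of OmegaConf interpolation)."""
    def lookup(match):
        node = cfg
        for part in match.group(1).split("."):  # walk the dotted path
            node = node[part]
        return str(node)
    return {
        section: {k: re.sub(r"\$\{([^}]+)\}", lookup, v) if isinstance(v, str) else v
                  for k, v in values.items()}
        for section, values in cfg.items()
    }

cfg = {
    "trainer": {"precision": "bf16"},
    "quantization": {"algorithm": "fp8"},
    "export": {"dtype": "${trainer.precision}",
               "save_path": "llama2-7b-${quantization.algorithm}.qnemo"},
}
resolved = resolve(cfg)
```

With this, changing `trainer.precision` or `quantization.algorithm` once updates the export dtype and save path automatically, which is the point of the config change.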

examples/nlp/language_modeling/megatron_quantization.py renamed to examples/nlp/language_modeling/megatron_gpt_quantization.py

Lines changed: 37 additions & 16 deletions
@@ -15,32 +15,38 @@
 import torch
 import torch.multiprocessing as mp
 from datasets import load_dataset
+from omegaconf import OmegaConf
+from pytorch_lightning.trainer.trainer import Trainer
+from tqdm import tqdm

+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
+from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
 from nemo.core.config import hydra_runner
 from nemo.export.quantize import Quantizer
+from nemo.utils.model_utils import load_config

 mp.set_start_method("spawn", force=True)

 """
 Nemo quantization example script.

 Please consult nemo.export.quantize.Quantizer class
-and examples/nlp/language_modeling/conf/megatron_quantization.yaml config on available quantization methods,
+and examples/nlp/language_modeling/conf/megatron_gpt_quantization.yaml config on available quantization methods,
 models supported as well as how to set up data and inference for calibration (with defaults recommended).

 Example usage:
 ```
-python examples/nlp/language_modeling/megatron_quantization.py \
-    model_file=llama2-7b-fp16.nemo \
-    model_save=llama2-7b-fp8.qnemo \
+python examples/nlp/language_modeling/megatron_gpt_quantization.py \
+    model.restore_from_path=llama2-7b-fp16.nemo \
     quantization.algorithm=fp8 \
     export.decoder_type=llama \
     export.inference_tensor_parallel=1
+    export.save_path=llama2-7b-fp8.qnemo \
 ```
 """


-def get_calib_dataloader(data="cnn_dailymail", batch_size=64, calib_size=512, max_sequence_length=512):
+def get_calib_data_iter(data="cnn_dailymail", batch_size=64, calib_size=512, max_sequence_length=512):
     if data == "wikitext":
         dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
         text_column = "text"
@@ -59,31 +65,46 @@ def get_calib_dataloader(data="cnn_dailymail", batch_size=64, calib_size=512, ma
         yield batch


-@hydra_runner(config_path="conf", config_name="megatron_quantization")
+@hydra_runner(config_path="conf", config_name="megatron_gpt_quantization")
 def main(cfg) -> None:
     if not torch.cuda.is_available():
-        raise EnvironmentError("GPU is required for the inference.")
+        raise EnvironmentError("GPU is required for the quantization.")

-    quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.export, cfg.trainer)
+    # Initialize quantizer
+    quantizer = Quantizer(cfg.quantization, cfg.export)
+
+    # Overwrite model config with the one from the model checkpoint and apply quantization modifications
+    model_cfg = load_config(cfg.model.restore_from_path)
+    model_cfg.update(cfg.model)
+    model_cfg = quantizer.modify_model_config(model_cfg)
+
+    trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer)
+    model = MegatronGPTModel.restore_from(
+        restore_path=cfg.model.restore_from_path, override_config_path=model_cfg, trainer=trainer
+    )
+    model.freeze()

     # Quantization algorithm can be set to None. This is useful for baseline precision
     # accuracy validation. In this case only weights export step will be performed:
     if cfg.quantization.algorithm is not None:
-        dataloader = get_calib_dataloader(
+        data_iter = get_calib_data_iter(
             cfg.quantization.calib_dataset,
             cfg.inference.batch_size,
             cfg.quantization.num_calib_size,
             cfg.inference.max_context_length,
         )
-        dataloader = [data for data in dataloader]
-    else:
-        dataloader = None
+        dataloader = [data for data in data_iter]

-    model = quantizer.quantize(
-        cfg.model_file, dataloader, cfg.tensor_model_parallel_size, cfg.pipeline_model_parallel_size
-    )
+        def forward_loop(model):
+            # NOTE: Alternatively you can also use `model.forward_bwd_step(data_iter, forward_only=True)`
+            # if your model is setup for training.
+            model.set_inference_config(OmegaConf.to_container(cfg.inference))
+            for i, batch in enumerate(tqdm(dataloader, desc="Calibrating")):
+                model.predict_step(batch, i)
+
+        model = quantizer.quantize(model, forward_loop)

-    quantizer.export(model, cfg.model_save)
+    quantizer.export(model)


 if __name__ == '__main__':
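The renamed `get_calib_data_iter` yields fixed-size batches of truncated text until the calibration budget (`calib_size`) is exhausted; the script then materializes them into a list for the calibration `forward_loop`. The batching logic can be sketched against a toy in-memory list instead of `datasets.load_dataset` (same parameter names, hypothetical data source):

```python
def get_calib_data_iter(samples, batch_size=2, calib_size=6, max_sequence_length=8):
    """Yield calib_size // batch_size batches of samples truncated to max_sequence_length chars."""
    for i in range(calib_size // batch_size):
        batch = samples[i * batch_size : (i + 1) * batch_size]
        yield [text[:max_sequence_length] for text in batch]

texts = ["a" * 20, "b" * 5, "c" * 20, "d" * 20, "e" * 3, "f" * 20, "g" * 20]
batches = list(get_calib_data_iter(texts))  # materialized, as the script does with `dataloader`
```

Materializing the iterator up front keeps the calibration loop simple at the cost of holding all calibration text in memory, which is acceptable for the few hundred samples used here.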

nemo/collections/asr/modules/audio_preprocessing.py

Lines changed: 1 addition & 1 deletion
@@ -100,7 +100,7 @@ def __init__(self, win_length, hop_length):
     @torch.no_grad()
     def forward(self, input_signal, length):
         if input_signal.dtype != torch.float32:
-            logging.warn(
+            logging.warning(
                 f"AudioPreprocessor received an input signal of dtype {input_signal.dtype}, rather than torch.float32. In sweeps across multiple datasets, we have found that the preprocessor is not robust to low precision mathematics. As such, it runs in float32. Your input will be cast to float32, but this is not necessarily enough to recovery full accuracy. For example, simply casting input_signal from torch.float32 to torch.bfloat16, then back to torch.float32 before running AudioPreprocessor causes drops in absolute WER of up to 0.1%. torch.bfloat16 simply does not have enough mantissa bits to represent enough values in the range [-1.0,+1.0] correctly.",
                 mode=logging_mode.ONCE,
             )
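The final hunk swaps `logging.warn` for `logging.warning`. NeMo's logger (which also accepts the NeMo-specific `mode=logging_mode.ONCE` argument) mirrors the stdlib naming, where `warn` has long been a deprecated alias of `warning`. A minimal stdlib demo of the supported spelling, capturing the emitted message (plain `logging`, not NeMo's wrapper):

```python
import io
import logging

logger = logging.getLogger("demo")
logger.setLevel(logging.WARNING)
stream = io.StringIO()
logger.addHandler(logging.StreamHandler(stream))  # capture output for inspection

# `warning` is the supported spelling; `warn` is a deprecated stdlib alias.
logger.warning("input was %s, casting to float32", "bfloat16")

output = stream.getvalue()
```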
