Commit 3f7e828

Refactor Quantizer for reusing in QAT (#9276)
* Refactor Quantizer for reusing in QAT
* Address more reviewer comments
* update yaml config

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
1 parent 67bc846 commit 3f7e828

File tree

6 files changed: +153 additions, −159 deletions

.github/workflows/cicd-main.yml

Lines changed: 14 additions & 14 deletions
@@ -213,10 +213,10 @@ jobs:
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
-        python examples/nlp/language_modeling/megatron_quantization.py \
-            model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+        python examples/nlp/language_modeling/megatron_gpt_quantization.py \
+            model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
             quantization.algorithm=null \
-            model_save=/home/TestData/nlp/megatron_llama/ci_baseline
+            export.save_path=/home/TestData/nlp/megatron_llama/ci_baseline
       AFTER_SCRIPT: |
         rm -rf /home/TestData/nlp/megatron_llama/ci_baseline
@@ -226,16 +226,16 @@ jobs:
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
-        python examples/nlp/language_modeling/megatron_quantization.py \
-            model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
-            tensor_model_parallel_size=2 \
+        python examples/nlp/language_modeling/megatron_gpt_quantization.py \
+            model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+            model.tensor_model_parallel_size=2 \
             trainer.devices=2 \
             quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
             quantization.algorithm=fp8 \
             quantization.num_calib_size=8 \
             inference.batch_size=2 \
             export.inference_tensor_parallel=2 \
-            model_save=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
+            export.save_path=/home/TestData/nlp/megatron_llama/ci_fp8.qnemo
       AFTER_SCRIPT: |
         rm -rf /home/TestData/nlp/megatron_llama/ci_fp8.qnemo
@@ -245,13 +245,13 @@ jobs:
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
-        python examples/nlp/language_modeling/megatron_quantization.py \
-            model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+        python examples/nlp/language_modeling/megatron_gpt_quantization.py \
+            model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
             quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
             quantization.algorithm=int8_sq \
             quantization.num_calib_size=8 \
             inference.batch_size=2 \
-            model_save=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
+            export.save_path=/home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
       AFTER_SCRIPT: |
         rm -rf /home/TestData/nlp/megatron_llama/ci_int8_sq.qnemo
@@ -274,15 +274,15 @@ jobs:
     # - name: Checkout repository
     #   uses: actions/checkout@v4
     # - run: |
-    #     python examples/nlp/language_modeling/megatron_quantization.py \
-    #         model_file=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
-    #         tensor_model_parallel_size=1 \
+    #     python examples/nlp/language_modeling/megatron_gpt_quantization.py \
+    #         model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+    #         model.tensor_model_parallel_size=1 \
     #         trainer.devices=1 \
     #         quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
    #         quantization.algorithm=int4_awq \
    #         quantization.num_calib_size=8 \
    #         inference.batch_size=2 \
-    #         model_save=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
+    #         export.save_path=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
     #
     #     rm -rf /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
     #- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"

docs/source/nlp/quantization.rst

Lines changed: 5 additions & 5 deletions
@@ -73,17 +73,17 @@ The script must be launched correctly with the number of processes equal to tens
 
 .. code-block:: bash
 
-    torchrun --nproc-per-node 8 examples/nlp/language_modeling/megatron_quantization.py \
-        model_file=llama2-70b-base-bf16.nemo \
-        tensor_model_parallel_size=8 \
-        pipeline_model_parallel_size=1 \
+    torchrun --nproc-per-node 8 examples/nlp/language_modeling/megatron_gpt_quantization.py \
+        model.restore_from_path=llama2-70b-base-bf16.nemo \
+        model.tensor_model_parallel_size=8 \
+        model.pipeline_model_parallel_size=1 \
         trainer.num_nodes=1 \
         trainer.devices=8 \
         trainer.precision=bf16 \
         quantization.algorithm=fp8 \
         export.decoder_type=llama \
         export.inference_tensor_parallel=2 \
-        model_save=llama2-70b-base-fp8-qnemo
+        export.save_path=llama2-70b-base-fp8-qnemo

examples/nlp/language_modeling/conf/megatron_quantization.yaml renamed to examples/nlp/language_modeling/conf/megatron_gpt_quantization.yaml

Lines changed: 15 additions & 10 deletions
@@ -20,21 +20,26 @@ trainer:
   precision: bf16 # 16, 32, or bf16
   enable_checkpointing: false
 
+model:
+  tensor_model_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  restore_from_path: llama2-7b-fp16.nemo # Nemo file path
+
+  ## Activation Checkpoint
+  activations_checkpoint_granularity: null # 'selective' or 'full'
+  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
+
 quantization:
-  quantize_bmm1: false
-  algorithm: fp8 # int8_sq, fp8, int8, int4_awq, null
+  decoder_type: ${export.decoder_type} # gptnext, gpt2, llama
+  algorithm: fp8 # null, int8_sq, fp8, int4_awq
   calib_dataset: cnn_dailymail # wikitext, cnn_dailymail, or a local dataset
   num_calib_size: 512 # number of samples used for calibration
-  awq_block_size: 128 # block size for scaling factors in AWQ algorithm
-  alpha: 1.0 # alpha parameter in SmoothQuant algorithm
+  awq_block_size: 128 # block size for scaling factors (only used in AWQ algorithms)
+  sq_alpha: 1.0 # alpha parameter (only used in SmoothQuant algorithms)
 
 export:
   decoder_type: llama # gptnext, gpt2, llama
   inference_tensor_parallel: 1 # Default using 1 TP for inference
   inference_pipeline_parallel: 1 # Default using 1 PP for inference
-  dtype: bf16 # Default precision data type
-
-model_file: llama2-7b-fp16.nemo # Nemo file path
-model_save: llama2-7b-fp8.qnemo # Path where the quantized model will be saved
-tensor_model_parallel_size: 1
-pipeline_model_parallel_size: 1
+  dtype: ${trainer.precision} # Default precision data type
+  save_path: llama2-7b-${quantization.algorithm}.qnemo # Path where the quantized model will be saved
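The reworked config leans on OmegaConf-style interpolation (`dtype: ${trainer.precision}`, `save_path: llama2-7b-${quantization.algorithm}.qnemo`) so that export defaults track the rest of the file instead of being hard-coded. A hand-rolled sketch of how such `${a.b}` references resolve, purely illustrative (the real resolution is performed by OmegaConf/Hydra, not by this helper):

```python
import re

def resolve(cfg, value):
    """Replace ${a.b} references in value with lookups into the nested dict cfg."""
    def lookup(match):
        node = cfg
        for key in match.group(1).split("."):
            node = node[key]  # walk the dotted path
        return str(node)
    return re.sub(r"\$\{([^}]+)\}", lookup, value)

# Mirrors the relevant keys of megatron_gpt_quantization.yaml
cfg = {
    "trainer": {"precision": "bf16"},
    "quantization": {"algorithm": "fp8"},
    "export": {
        "dtype": "${trainer.precision}",
        "save_path": "llama2-7b-${quantization.algorithm}.qnemo",
    },
}

print(resolve(cfg, cfg["export"]["dtype"]))      # bf16
print(resolve(cfg, cfg["export"]["save_path"]))  # llama2-7b-fp8.qnemo
```

A consequence of the interpolation is that overriding `trainer.precision` or `quantization.algorithm` on the command line also changes the export dtype and default save path, which is why the CI jobs above only set `export.save_path` explicitly.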

examples/nlp/language_modeling/megatron_quantization.py renamed to examples/nlp/language_modeling/megatron_gpt_quantization.py

Lines changed: 37 additions & 16 deletions
@@ -15,32 +15,38 @@
 import torch
 import torch.multiprocessing as mp
 from datasets import load_dataset
+from omegaconf import OmegaConf
+from pytorch_lightning.trainer.trainer import Trainer
+from tqdm import tqdm
 
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
+from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
 from nemo.core.config import hydra_runner
 from nemo.export.quantize import Quantizer
+from nemo.utils.model_utils import load_config
 
 mp.set_start_method("spawn", force=True)
 
 """
 Nemo quantization example script.
 
 Please consult nemo.export.quantize.Quantizer class
-and examples/nlp/language_modeling/conf/megatron_quantization.yaml config on available quantization methods,
+and examples/nlp/language_modeling/conf/megatron_gpt_quantization.yaml config on available quantization methods,
 models supported as well as how to set up data and inference for calibration (with defaults recommended).
 
 Example usage:
 ```
-python examples/nlp/language_modeling/megatron_quantization.py \
-    model_file=llama2-7b-fp16.nemo \
-    model_save=llama2-7b-fp8.qnemo \
+python examples/nlp/language_modeling/megatron_gpt_quantization.py \
+    model.restore_from_path=llama2-7b-fp16.nemo \
     quantization.algorithm=fp8 \
     export.decoder_type=llama \
     export.inference_tensor_parallel=1
+    export.save_path=llama2-7b-fp8.qnemo \
 ```
 """
 
 
-def get_calib_dataloader(data="cnn_dailymail", batch_size=64, calib_size=512, max_sequence_length=512):
+def get_calib_data_iter(data="cnn_dailymail", batch_size=64, calib_size=512, max_sequence_length=512):
     if data == "wikitext":
         dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
         text_column = "text"
@@ -59,31 +65,46 @@ def get_calib_dataloader(data="cnn_dailymail", batch_size=64, calib_size=512, ma
         yield batch
 
 
-@hydra_runner(config_path="conf", config_name="megatron_quantization")
+@hydra_runner(config_path="conf", config_name="megatron_gpt_quantization")
 def main(cfg) -> None:
     if not torch.cuda.is_available():
-        raise EnvironmentError("GPU is required for the inference.")
+        raise EnvironmentError("GPU is required for the quantization.")
 
-    quantizer = Quantizer(cfg.quantization, cfg.inference, cfg.export, cfg.trainer)
+    # Initialize quantizer
+    quantizer = Quantizer(cfg.quantization, cfg.export)
+
+    # Overwrite model config with the one from the model checkpoint and apply quantization modifications
+    model_cfg = load_config(cfg.model.restore_from_path)
+    model_cfg.update(cfg.model)
+    model_cfg = quantizer.modify_model_config(model_cfg)
+
+    trainer = Trainer(strategy=NLPDDPStrategy(), **cfg.trainer)
+    model = MegatronGPTModel.restore_from(
+        restore_path=cfg.model.restore_from_path, override_config_path=model_cfg, trainer=trainer
+    )
+    model.freeze()
 
     # Quantization algorithm can be set to None. This is useful for baseline precision
     # accuracy validation. In this case only weights export step will be performed:
     if cfg.quantization.algorithm is not None:
-        dataloader = get_calib_dataloader(
+        data_iter = get_calib_data_iter(
             cfg.quantization.calib_dataset,
             cfg.inference.batch_size,
             cfg.quantization.num_calib_size,
             cfg.inference.max_context_length,
         )
-        dataloader = [data for data in dataloader]
-    else:
-        dataloader = None
+        dataloader = [data for data in data_iter]
 
-    model = quantizer.quantize(
-        cfg.model_file, dataloader, cfg.tensor_model_parallel_size, cfg.pipeline_model_parallel_size
-    )
+        def forward_loop(model):
+            # NOTE: Alternatively you can also use `model.forward_bwd_step(data_iter, forward_only=True)`
+            # if your model is setup for training.
+            model.set_inference_config(OmegaConf.to_container(cfg.inference))
+            for i, batch in enumerate(tqdm(dataloader, desc="Calibrating")):
+                model.predict_step(batch, i)
+
+        model = quantizer.quantize(model, forward_loop)
 
-    quantizer.export(model, cfg.model_save)
+    quantizer.export(model)
 
 
 if __name__ == '__main__':
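The central change in this file is the new calling convention: `Quantizer` no longer restores the checkpoint or drives calibration itself; the script builds the model and passes a `forward_loop` callback into `quantize(model, forward_loop)`, which is what makes the class reusable for QAT. The control flow can be sketched with a toy stand-in (all names hypothetical; this is not the real NeMo API):

```python
# Toy sketch of the inversion-of-control pattern: the quantizer hands the
# model back to a caller-supplied loop instead of owning data loading.

class ToyQuantizer:
    """Stands in for nemo.export.quantize.Quantizer; counts calibration batches."""

    def __init__(self):
        self.calibrated_batches = 0

    def quantize(self, model, forward_loop):
        # The caller decides how forward passes are run (PTQ inference here,
        # a training step in a QAT setting), so this class stays agnostic.
        forward_loop(model)
        return model


def toy_model(batch):
    # Hypothetical model: doubles each element of a batch.
    return [x * 2 for x in batch]


calib_data = [[1, 2], [3, 4]]
quantizer = ToyQuantizer()


def forward_loop(model):
    # Analogue of the script's forward_loop: iterate calibration batches
    # and run the model on each one.
    for batch in calib_data:
        model(batch)
        quantizer.calibrated_batches += 1


quantized_model = quantizer.quantize(toy_model, forward_loop)
print(quantizer.calibrated_batches)  # 2
```

In the real script the same shape appears with `MegatronGPTModel.restore_from(...)` producing the model and `model.predict_step(batch, i)` inside the loop; a QAT pipeline can substitute its own training loop without touching the quantizer.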
