Skip to content

Commit cca043d

Browse files
authored
Merge branch 'NVIDIA:main' into mcore_interface
2 parents bea3dbe + 7f16668 commit cca043d

File tree

113 files changed

+4710
-1258
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

113 files changed

+4710
-1258
lines changed

.github/workflows/cicd-main.yml

Lines changed: 203 additions & 77 deletions
Large diffs are not rendered by default.

Dockerfile.ci

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ WORKDIR /workspace
3434
# Install NeMo requirements
3535
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
3636
ARG MODELOPT_VERSION=0.13.0
37-
ARG MCORE_TAG=2bbe55be32e2d478c4b2ce575af1cccb8fc3d9b9
37+
ARG MCORE_TAG=0b4c4cfced47cffad4cec8c4047986bfa60e7f10
3838
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
3939
RUN \
4040
--mount=type=bind,source=requirements,target=requirements \

examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,6 @@ init_from_nemo_model: null # path to nemo model
66

77
model:
88
sample_rate: 16000
9-
compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag.
10-
log_prediction: true # enables logging sample predictions in the output during training
11-
rnnt_reduction: 'mean_volume'
12-
skip_nan_grad: false
139

1410
train_ds:
1511
manifest_filepath: ???

examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,6 @@ init_from_pretrained_model: null # name of pretrained NeMo model, e.g., `stt_en
77

88
model:
99
sample_rate: 16000
10-
compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag.
11-
log_prediction: true # enables logging sample predictions in the output during training
12-
rnnt_reduction: 'mean_volume'
13-
skip_nan_grad: false
1410

1511
# configs for huggingface load_dataset function
1612
data_path: "librispeech_asr"

examples/asr/speech_to_text_finetune.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,11 @@
1919
1) `init_from_nemo_model` or
2020
2) `init_from_pretrained_model` in the configuration.
2121
22-
To update the model architecture in conjunction with other modifications, it is advisable to use the primary 'speech_to_text_rnnt/ctc_*.py' script.
22+
****************************************************************************************
23+
This script is mainly intended for changing the dataset, optim, spec_augment, vocabulary/tokenizer of the model.
24+
To update the model architecture in conjunction with other modifications,
25+
it is advisable to use the primary 'speech_to_text_rnnt/ctc_*.py' script.
26+
****************************************************************************************
2327
2428
Note: To create a single script for all model types, we currently only support two types of
2529
initializations:
@@ -135,7 +139,7 @@ def check_vocabulary(asr_model, cfg):
135139

136140
def update_tokenizer(asr_model, tokenizer_dir, tokenizer_type):
137141
"""
138-
Updates the tokenizer of the model and also reinitializes the decoder if the vocabulary size
142+
Updates the tokenizer of the model and also reinitializes the decoder if the vocabulary size
139143
of the new tokenizer differs from that of the loaded model.
140144
Args:
141145
asr_model: ASRModel instance

examples/audio/audio_to_audio_train.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from nemo.collections.audio.models.enhancement import (
3636
EncMaskDecAudioToAudioModel,
3737
PredictiveAudioToAudioModel,
38+
SchroedingerBridgeAudioToAudioModel,
3839
ScoreBasedGenerativeAudioToAudioModel,
3940
)
4041
from nemo.core.config import hydra_runner
@@ -48,6 +49,7 @@ class ModelType(str, Enum):
4849
MaskBased = 'mask_based'
4950
Predictive = 'predictive'
5051
ScoreBased = 'score_based'
52+
SchroedingerBridge = 'schroedinger_bridge'
5153

5254

5355
def get_model_class(model_type: ModelType):
@@ -58,6 +60,8 @@ def get_model_class(model_type: ModelType):
5860
return PredictiveAudioToAudioModel
5961
elif model_type == ModelType.ScoreBased:
6062
return ScoreBasedGenerativeAudioToAudioModel
63+
elif model_type == ModelType.SchroedingerBridge:
64+
return SchroedingerBridgeAudioToAudioModel
6165
else:
6266
raise ValueError(f'Unknown model type: {model_type}')
6367

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
name: schroedinger_bridge
2+
3+
model:
4+
type: schroedinger_bridge
5+
sample_rate: 16000
6+
skip_nan_grad: false
7+
num_outputs: 1
8+
normalize_input: true
9+
max_utts_evaluation_metrics: 50 # metric calculation needs full inference and is slow, so we limit to first few files
10+
11+
train_ds:
12+
manifest_filepath: ???
13+
input_key: noisy_filepath
14+
target_key: clean_filepath
15+
audio_duration: 2.04 # 256 frames
16+
random_offset: true
17+
normalize_input: ${model.normalize_input}
18+
batch_size: 8 # batch size may be increased based on the available memory
19+
shuffle: true
20+
num_workers: 8
21+
pin_memory: true
22+
23+
validation_ds:
24+
manifest_filepath: ???
25+
input_key: noisy_filepath
26+
target_key: clean_filepath
27+
normalize_input: false # load data as is for validation, the model will normalize it for inference
28+
batch_size: 4
29+
shuffle: false
30+
num_workers: 4
31+
pin_memory: true
32+
33+
encoder:
34+
_target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram
35+
fft_length: 510
36+
hop_length: 128
37+
magnitude_power: 0.5
38+
scale: 0.33
39+
40+
decoder:
41+
_target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
42+
fft_length: ${model.encoder.fft_length}
43+
hop_length: ${model.encoder.hop_length}
44+
magnitude_power: ${model.encoder.magnitude_power}
45+
scale: ${model.encoder.scale}
46+
47+
estimator:
48+
_target_: nemo.collections.audio.parts.submodules.ncsnpp.SpectrogramNoiseConditionalScoreNetworkPlusPlus
49+
in_channels: 2 # concatenation of single-channel perturbed and noisy
50+
out_channels: 1 # single-channel estimate
51+
conditioned_on_time: true
52+
num_res_blocks: 3 # increased number of res blocks
53+
pad_time_to: 64 # pad to 64 frames for the time dimension
54+
pad_dimension_to: 0 # no padding in the frequency dimension
55+
56+
estimator_output: data_prediction
57+
58+
noise_schedule:
59+
_target_: nemo.collections.audio.parts.submodules.schroedinger_bridge.SBNoiseScheduleVE
60+
k: 2.6
61+
c: 0.4
62+
time_min: 1e-4
63+
time_max: 1.0
64+
num_steps: 1000 # num steps for the forward process
65+
66+
sampler:
67+
_target_: nemo.collections.audio.parts.submodules.schroedinger_bridge.SBSampler
68+
time_min: 1e-4
69+
time_max: 1.0
70+
num_steps: 50 # num steps for the reverse process
71+
72+
# Loss in the encoded domain
73+
loss_encoded:
74+
_target_: nemo.collections.audio.losses.MSELoss
75+
ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time)
76+
77+
# Loss in the time domain
78+
loss_time:
79+
_target_: nemo.collections.audio.losses.MAELoss
80+
loss_time_weight: 0.001
81+
82+
metrics:
83+
val:
84+
sisdr: # output SI-SDR
85+
_target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio
86+
estoi: # output ESTOI
87+
_target_: torchmetrics.audio.ShortTimeObjectiveIntelligibility
88+
fs: ${model.sample_rate}
89+
extended: true
90+
pesq: # output PESQ
91+
_target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality
92+
fs: ${model.sample_rate}
93+
mode: wb
94+
95+
optim:
96+
name: adam
97+
lr: 1e-4
98+
# optimizer arguments
99+
betas: [0.9, 0.999]
100+
weight_decay: 0.0
101+
102+
trainer:
103+
devices: -1 # number of GPUs, -1 would use all available GPUs
104+
num_nodes: 1
105+
max_epochs: -1
106+
max_steps: -1 # computed at runtime if not set
107+
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
108+
accelerator: auto
109+
strategy: ddp
110+
accumulate_grad_batches: 1
111+
gradient_clip_val: null
112+
precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
113+
log_every_n_steps: 25 # Interval of logging.
114+
enable_progress_bar: true
115+
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it
116+
check_val_every_n_epoch: 5 # number of evaluations on validation every n epochs
117+
sync_batchnorm: true
118+
enable_checkpointing: false # Provided by exp_manager
119+
logger: false # Provided by exp_manager
120+
121+
exp_manager:
122+
exp_dir: null
123+
name: ${name}
124+
125+
# use exponential moving average for model parameters
126+
ema:
127+
enable: true
128+
decay: 0.999 # decay rate
129+
cpu_offload: false # offload EMA parameters to CPU to save GPU memory
130+
every_n_steps: 1 # how often to update EMA weights
131+
validate_original_weights: false # use original weights for validation calculation?
132+
133+
# logging
134+
create_tensorboard_logger: true
135+
136+
# checkpointing
137+
create_checkpoint_callback: true
138+
checkpoint_callback_params:
139+
# in case of multiple validation sets, first one is used
140+
monitor: val_pesq
141+
mode: max
142+
save_top_k: 5
143+
always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints
144+
145+
# early stopping
146+
create_early_stopping_callback: true
147+
early_stopping_callback_params:
148+
monitor: val_sisdr
149+
mode: max
150+
min_delta: 0.0
151+
patience: 20 # patience in terms of check_val_every_n_epoch
152+
verbose: true
153+
strict: false # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training.
154+
155+
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
156+
# you need to set these two to true to continue the training
157+
resume_if_exists: false
158+
resume_ignore_no_checkpoint: false
159+
160+
# You may use this section to create a W&B logger
161+
create_wandb_logger: false
162+
wandb_logger_kwargs:
163+
name: null
164+
project: null

examples/audio/process_audio.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import glob
1717
import json
1818
import os
19-
from dataclasses import dataclass, is_dataclass
19+
from dataclasses import dataclass, field, is_dataclass
2020
from pathlib import Path
2121
from typing import List, Optional
2222

@@ -96,6 +96,10 @@ class ProcessConfig:
9696
# Override model config
9797
override_config_path: Optional[str] = None # path to a yaml config that will override the internal config file
9898

99+
# Override sampler config
100+
# For example, to set number of steps, use `++sampler.num_samples=42`
101+
sampler: dict = field(default_factory=dict)
102+
99103
# Set `cuda` to int to define CUDA device. If 'None', will look for CUDA
100104
# device anyway, and do inference on CPU only if CUDA device is not found.
101105
# If `cuda` is a negative number, inference will be on CPU only.
@@ -155,6 +159,22 @@ def main(cfg: ProcessConfig) -> ProcessConfig:
155159
audio_to_audio_model.set_trainer(trainer)
156160
audio_to_audio_model = audio_to_audio_model.eval()
157161

162+
# override sampler
163+
if cfg.sampler is not None:
164+
logging.info('Overriding sampler with %s', cfg.sampler)
165+
166+
if hasattr(audio_to_audio_model, 'sampler'):
167+
for key, value in cfg.sampler.items():
168+
if not hasattr(audio_to_audio_model.sampler, key):
169+
raise RuntimeError(f'Model sampler does not have attribute {key}')
170+
logging.debug('Try to set model.sampler.%s to %s', key, value)
171+
setattr(audio_to_audio_model.sampler, key, value)
172+
if getattr(audio_to_audio_model.sampler, key) != value:
173+
raise RuntimeError(f'Failed to set model sampler attribute {key} to {value}')
174+
logging.info('model.sampler.%s was set to %s', key, value)
175+
else:
176+
raise RuntimeError('Model does not have a sampler')
177+
158178
if cfg.audio_dir is not None:
159179
filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True))
160180
else:

examples/multimodal/text_to_image/stable_diffusion/conf/sd2_train.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ model:
153153
resume_from_checkpoint: null # manually set the checkpoint file to load from
154154
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
155155
gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
156+
ddp_overlap: False # True for using PyTorch DDP overlap.
156157

157158
optim:
158159
name: fused_adam

examples/multimodal/text_to_image/stable_diffusion/sd_xl_infer.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@ def model_cfg_modifier(model_cfg):
2626
model_cfg.precision = cfg.trainer.precision
2727
model_cfg.ckpt_path = None
2828
model_cfg.inductor = False
29-
model_cfg.unet_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/unet_nemo.ckpt"
30-
model_cfg.unet_config.from_NeMo = True
31-
model_cfg.first_stage_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/vae_nemo.ckpt"
32-
model_cfg.first_stage_config.from_NeMo = True
29+
# model_cfg.unet_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/unet_nemo.ckpt"
30+
# model_cfg.unet_config.from_NeMo = True
31+
# model_cfg.first_stage_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/vae_nemo.ckpt"
32+
# model_cfg.first_stage_config.from_NeMo = True
3333
model_cfg.first_stage_config._target_ = 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper'
3434
# model_cfg.fsdp = True
3535

0 commit comments

Comments (0)