Skip to content

Commit cca043d

Browse files
authored
Merge branch 'NVIDIA:main' into mcore_interface
2 parents bea3dbe + 7f16668 commit cca043d

File tree

113 files changed

+4710
-1258
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

113 files changed

+4710
-1258
lines changed

.github/workflows/cicd-main.yml

Lines changed: 203 additions & 77 deletions
Large diffs are not rendered by default.

Dockerfile.ci

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ WORKDIR /workspace
3434
# Install NeMo requirements
3535
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
3636
ARG MODELOPT_VERSION=0.13.0
37-
ARG MCORE_TAG=2bbe55be32e2d478c4b2ce575af1cccb8fc3d9b9
37+
ARG MCORE_TAG=0b4c4cfced47cffad4cec8c4047986bfa60e7f10
3838
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
3939
RUN \
4040
--mount=type=bind,source=requirements,target=requirements \

examples/asr/conf/asr_finetune/speech_to_text_finetune.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,6 @@ init_from_nemo_model: null # path to nemo model
66

77
model:
88
sample_rate: 16000
9-
compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag.
10-
log_prediction: true # enables logging sample predictions in the output during training
11-
rnnt_reduction: 'mean_volume'
12-
skip_nan_grad: false
139

1410
train_ds:
1511
manifest_filepath: ???

examples/asr/conf/asr_finetune/speech_to_text_hf_finetune.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,6 @@ init_from_pretrained_model: null # name of pretrained NeMo model, e.g., `stt_en
77

88
model:
99
sample_rate: 16000
10-
compute_eval_loss: false # eval samples can be very long and exhaust memory. Disable computation of transducer loss during validation/testing with this flag.
11-
log_prediction: true # enables logging sample predictions in the output during training
12-
rnnt_reduction: 'mean_volume'
13-
skip_nan_grad: false
1410

1511
# configs for huggingface load_dataset function
1612
data_path: "librispeech_asr"

examples/asr/speech_to_text_finetune.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,11 @@
1919
1) `init_from_nemo_model` or
2020
2) `init_from_pretrained_model` in the configuration.
2121
22-
To update the model architecture in conjunction with other modifications, it is advisable to use the primary 'speech_to_text_rnnt/ctc_*.py' script.
22+
****************************************************************************************
23+
This script is mainly intended for changing the dataset, optim, spec_augment, vocabulary/tokenizer of the model.
24+
To update the model architecture in conjunction with other modifications,
25+
it is advisable to use the primary 'speech_to_text_rnnt/ctc_*.py' script.
26+
****************************************************************************************
2327
2428
Note: To create a single script for all model types, we currently only support two types of
2529
initializations:
@@ -135,7 +139,7 @@ def check_vocabulary(asr_model, cfg):
135139

136140
def update_tokenizer(asr_model, tokenizer_dir, tokenizer_type):
137141
"""
138-
Updates the tokenizer of the model and also reinitializes the decoder if the vocabulary size
142+
Updates the tokenizer of the model and also reinitializes the decoder if the vocabulary size
139143
of the new tokenizer differs from that of the loaded model.
140144
Args:
141145
asr_model: ASRModel instance

examples/audio/audio_to_audio_train.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from nemo.collections.audio.models.enhancement import (
3636
EncMaskDecAudioToAudioModel,
3737
PredictiveAudioToAudioModel,
38+
SchroedingerBridgeAudioToAudioModel,
3839
ScoreBasedGenerativeAudioToAudioModel,
3940
)
4041
from nemo.core.config import hydra_runner
@@ -48,6 +49,7 @@ class ModelType(str, Enum):
4849
MaskBased = 'mask_based'
4950
Predictive = 'predictive'
5051
ScoreBased = 'score_based'
52+
SchroedingerBridge = 'schroedinger_bridge'
5153

5254

5355
def get_model_class(model_type: ModelType):
@@ -58,6 +60,8 @@ def get_model_class(model_type: ModelType):
5860
return PredictiveAudioToAudioModel
5961
elif model_type == ModelType.ScoreBased:
6062
return ScoreBasedGenerativeAudioToAudioModel
63+
elif model_type == ModelType.SchroedingerBridge:
64+
return SchroedingerBridgeAudioToAudioModel
6165
else:
6266
raise ValueError(f'Unknown model type: {model_type}')
6367

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
name: schroedinger_bridge
2+
3+
model:
4+
type: schroedinger_bridge
5+
sample_rate: 16000
6+
skip_nan_grad: false
7+
num_outputs: 1
8+
normalize_input: true
9+
max_utts_evaluation_metrics: 50 # metric calculation needs full inference and is slow, so we limit to first few files
10+
11+
train_ds:
12+
manifest_filepath: ???
13+
input_key: noisy_filepath
14+
target_key: clean_filepath
15+
audio_duration: 2.04 # 256 frames
16+
random_offset: true
17+
normalize_input: ${model.normalize_input}
18+
batch_size: 8 # batch size may be increased based on the available memory
19+
shuffle: true
20+
num_workers: 8
21+
pin_memory: true
22+
23+
validation_ds:
24+
manifest_filepath: ???
25+
input_key: noisy_filepath
26+
target_key: clean_filepath
27+
normalize_input: false # load data as is for validation, the model will normalize it for inference
28+
batch_size: 4
29+
shuffle: false
30+
num_workers: 4
31+
pin_memory: true
32+
33+
encoder:
34+
_target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram
35+
fft_length: 510
36+
hop_length: 128
37+
magnitude_power: 0.5
38+
scale: 0.33
39+
40+
decoder:
41+
_target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
42+
fft_length: ${model.encoder.fft_length}
43+
hop_length: ${model.encoder.hop_length}
44+
magnitude_power: ${model.encoder.magnitude_power}
45+
scale: ${model.encoder.scale}
46+
47+
estimator:
48+
_target_: nemo.collections.audio.parts.submodules.ncsnpp.SpectrogramNoiseConditionalScoreNetworkPlusPlus
49+
in_channels: 2 # concatenation of single-channel perturbed and noisy
50+
out_channels: 1 # single-channel estimate
51+
conditioned_on_time: true
52+
num_res_blocks: 3 # increased number of res blocks
53+
pad_time_to: 64 # pad to 64 frames for the time dimension
54+
pad_dimension_to: 0 # no padding in the frequency dimension
55+
56+
estimator_output: data_prediction
57+
58+
noise_schedule:
59+
_target_: nemo.collections.audio.parts.submodules.schroedinger_bridge.SBNoiseScheduleVE
60+
k: 2.6
61+
c: 0.4
62+
time_min: 1e-4
63+
time_max: 1.0
64+
num_steps: 1000 # num steps for the forward process
65+
66+
sampler:
67+
_target_: nemo.collections.audio.parts.submodules.schroedinger_bridge.SBSampler
68+
time_min: 1e-4
69+
time_max: 1.0
70+
num_steps: 50 # num steps for the reverse process
71+
72+
# Loss in the encoded domain
73+
loss_encoded:
74+
_target_: nemo.collections.audio.losses.MSELoss
75+
ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time)
76+
77+
# Loss in the time domain
78+
loss_time:
79+
_target_: nemo.collections.audio.losses.MAELoss
80+
loss_time_weight: 0.001
81+
82+
metrics:
83+
val:
84+
sisdr: # output SI-SDR
85+
_target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio
86+
estoi: # output ESTOI
87+
_target_: torchmetrics.audio.ShortTimeObjectiveIntelligibility
88+
fs: ${model.sample_rate}
89+
extended: true
90+
pesq: # output PESQ
91+
_target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality
92+
fs: ${model.sample_rate}
93+
mode: wb
94+
95+
optim:
96+
name: adam
97+
lr: 1e-4
98+
# optimizer arguments
99+
betas: [0.9, 0.999]
100+
weight_decay: 0.0
101+
102+
trainer:
103+
devices: -1 # number of GPUs, -1 would use all available GPUs
104+
num_nodes: 1
105+
max_epochs: -1
106+
max_steps: -1 # computed at runtime if not set
107+
val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
108+
accelerator: auto
109+
strategy: ddp
110+
accumulate_grad_batches: 1
111+
gradient_clip_val: null
112+
precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP.
113+
log_every_n_steps: 25 # Interval of logging.
114+
enable_progress_bar: true
115+
num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it
116+
check_val_every_n_epoch: 5 # number of evaluations on validation every n epochs
117+
sync_batchnorm: true
118+
enable_checkpointing: false # Provided by exp_manager
119+
logger: false # Provided by exp_manager
120+
121+
exp_manager:
122+
exp_dir: null
123+
name: ${name}
124+
125+
# use exponential moving average for model parameters
126+
ema:
127+
enable: true
128+
decay: 0.999 # decay rate
129+
cpu_offload: false # offload EMA parameters to CPU to save GPU memory
130+
every_n_steps: 1 # how often to update EMA weights
131+
validate_original_weights: false # use original weights for validation calculation?
132+
133+
# logging
134+
create_tensorboard_logger: true
135+
136+
# checkpointing
137+
create_checkpoint_callback: true
138+
checkpoint_callback_params:
139+
# in case of multiple validation sets, first one is used
140+
monitor: val_pesq
141+
mode: max
142+
save_top_k: 5
143+
always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints
144+
145+
# early stopping
146+
create_early_stopping_callback: true
147+
early_stopping_callback_params:
148+
monitor: val_sisdr
149+
mode: max
150+
min_delta: 0.0
151+
patience: 20 # patience in terms of check_val_every_n_epoch
152+
verbose: true
153+
strict: false # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training.
154+
155+
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
156+
# you need to set these two to true to continue the training
157+
resume_if_exists: false
158+
resume_ignore_no_checkpoint: false
159+
160+
# You may use this section to create a W&B logger
161+
create_wandb_logger: false
162+
wandb_logger_kwargs:
163+
name: null
164+
project: null

examples/audio/process_audio.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import glob
1717
import json
1818
import os
19-
from dataclasses import dataclass, is_dataclass
19+
from dataclasses import dataclass, field, is_dataclass
2020
from pathlib import Path
2121
from typing import List, Optional
2222

@@ -96,6 +96,10 @@ class ProcessConfig:
9696
# Override model config
9797
override_config_path: Optional[str] = None # path to a yaml config that will override the internal config file
9898

99+
# Override sampler config
100+
# For example, to set number of steps, use `++sampler.num_samples=42`
101+
sampler: dict = field(default_factory=dict)
102+
99103
# Set `cuda` to int to define CUDA device. If 'None', will look for CUDA
100104
# device anyway, and do inference on CPU only if CUDA device is not found.
101105
# If `cuda` is a negative number, inference will be on CPU only.
@@ -155,6 +159,22 @@ def main(cfg: ProcessConfig) -> ProcessConfig:
155159
audio_to_audio_model.set_trainer(trainer)
156160
audio_to_audio_model = audio_to_audio_model.eval()
157161

162+
# override sampler
163+
if cfg.sampler is not None:
164+
logging.info('Overriding sampler with %s', cfg.sampler)
165+
166+
if hasattr(audio_to_audio_model, 'sampler'):
167+
for key, value in cfg.sampler.items():
168+
if not hasattr(audio_to_audio_model.sampler, key):
169+
raise RuntimeError(f'Model sampler does not have attribute {key}')
170+
logging.debug('Try to set model.sampler.%s to %s', key, value)
171+
setattr(audio_to_audio_model.sampler, key, value)
172+
if getattr(audio_to_audio_model.sampler, key) != value:
173+
raise RuntimeError(f'Failed to set model sampler attribute {key} to {value}')
174+
logging.info('model.sampler.%s was set to %s', key, value)
175+
else:
176+
raise RuntimeError('Model does not have a sampler')
177+
158178
if cfg.audio_dir is not None:
159179
filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True))
160180
else:

examples/multimodal/text_to_image/stable_diffusion/conf/sd2_train.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ model:
153153
resume_from_checkpoint: null # manually set the checkpoint file to load from
154154
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
155155
gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
156+
ddp_overlap: False # True for using PyTorch DDP overlap.
156157

157158
optim:
158159
name: fused_adam

examples/multimodal/text_to_image/stable_diffusion/sd_xl_infer.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@ def model_cfg_modifier(model_cfg):
2626
model_cfg.precision = cfg.trainer.precision
2727
model_cfg.ckpt_path = None
2828
model_cfg.inductor = False
29-
model_cfg.unet_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/unet_nemo.ckpt"
30-
model_cfg.unet_config.from_NeMo = True
31-
model_cfg.first_stage_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/vae_nemo.ckpt"
32-
model_cfg.first_stage_config.from_NeMo = True
29+
# model_cfg.unet_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/unet_nemo.ckpt"
30+
# model_cfg.unet_config.from_NeMo = True
31+
# model_cfg.first_stage_config.from_pretrained = "/opt/nemo-aligner/checkpoints/sdxl/vae_nemo.ckpt"
32+
# model_cfg.first_stage_config.from_NeMo = True
3333
model_cfg.first_stage_config._target_ = 'nemo.collections.multimodal.models.text_to_image.stable_diffusion.ldm.autoencoder.AutoencoderKLInferenceWrapper'
3434
# model_cfg.fsdp = True
3535

0 commit comments

Comments (0)