|
| 1 | +name: nemo_video_lita_neva |
| 2 | +restore_from_path: null # used when starting from a .nemo file |
| 3 | + |
| 4 | +trainer: |
| 5 | + devices: 1 |
| 6 | + num_nodes: 1 |
| 7 | + accelerator: gpu |
| 8 | + precision: bf16 |
| 9 | + logger: False # logger provided by exp_manager |
| 10 | + enable_checkpointing: False |
| 11 | + use_distributed_sampler: False |
| 12 | + max_epochs: -1 # PTL default. In practice, max_steps will be reached first. |
| 13 | + max_steps: 10000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches |
| 14 | + log_every_n_steps: 10 |
| 15 | + val_check_interval: 100 |
| 16 | + check_val_every_n_epoch: null |
| 17 | + limit_val_batches: 50 |
| 18 | + limit_test_batches: 500 |
| 19 | + accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models |
| 20 | + gradient_clip_val: 1.0 |
| 21 | + benchmark: False |
| 22 | + enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually |
| 23 | + |
| 24 | +exp_manager: |
| 25 | + explicit_log_dir: null |
| 26 | + exp_dir: null |
| 27 | + name: nemo_video_neva_lita |
| 28 | + create_wandb_logger: True |
| 29 | + wandb_logger_kwargs: |
| 30 | + project: null |
| 31 | + name: null |
| 32 | + resume_if_exists: True |
| 33 | + resume_ignore_no_checkpoint: True |
| 34 | + resume_from_checkpoint: ${model.resume_from_checkpoint} |
| 35 | + create_checkpoint_callback: True |
| 36 | + checkpoint_callback_params: |
| 37 | + monitor: val_loss |
| 38 | + save_top_k: 5 |
| 39 | + mode: min |
| 40 | + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel |
| 41 | + save_nemo_on_train_end: True # not recommended when training large models on clusters with short time limits |
| 42 | + filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}' |
| 43 | + model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}} |
| 44 | + ema: |
| 45 | + enable: False |
| 46 | + decay: 0.9999 |
| 47 | + validate_original_weights: False |
| 48 | + every_n_steps: 1 |
| 49 | + cpu_offload: False |
| 50 | + |
| 51 | +model: |
| 52 | + precision: ${trainer.precision} |
| 53 | + |
| 54 | + # specify micro_batch_size, global_batch_size, and model parallelism |
| 55 | + # gradient accumulation will be done automatically based on data_parallel_size |
| 56 | + |
| 57 | + # Batch size guideline for different types of dataset |
| 58 | + micro_batch_size: 1 # limited by GPU memory |
| 59 | + global_batch_size: 2 # will use more micro batches to reach global batch size |
| 60 | + |
| 61 | + tensor_model_parallel_size: 1 # intra-layer model parallelism |
| 62 | + pipeline_model_parallel_size: 1 # inter-layer model parallelism |
| 63 | +  context_parallel_size: 1 # context parallelism (splits the sequence dimension across GPUs)
| 64 | + virtual_pipeline_model_parallel_size: null # interleaved pipeline |
| 65 | + |
| 66 | + restore_from_path: null # used in fine-tuning |
| 67 | + |
| 68 | + # Multimodal configs |
| 69 | + mm_cfg: |
| 70 | + llm: |
| 71 | + from_pretrained: null #path to nemo checkpoint |
| 72 | + freeze: False |
| 73 | + model_type: llama_2 # `nvgpt` or `llama_2` supported |
| 74 | + vision_encoder: |
| 75 | + from_pretrained: "Lin-Chen/ShareGPT4V-13B_Pretrained_vit-large336-l12" # huggingface path or name |
| 76 | + from_hf: True |
| 77 | + crop_size: [336, 336] |
| 78 | + patch_dim: 14 |
| 79 | + hidden_size: 1024 # could be found from model but tricky in code |
| 80 | + vision_select_layer: -2 # default to the last layer |
| 81 | + class_token_length: 1 |
| 82 | + freeze: True |
| 83 | + lita: |
| 84 | + lita_video_arch: 'temporal_all_resolution' # ['temporal_spatial_pool', 'temporal_spatial', 'temporal_all_resolution'] 'temporal_spatial_pool' is used in lita1.0 |
| 85 | + visual_token_format: 'im_vid_start_end' # ["v1", "im_vid_start_end"] v1 means do nothing, im_vid_start_end means add image and video start and end tokens around spatial and temporal tokens |
| 86 | + sample_frames: 4 # for lita 1.5 sample_frames are used for spatial tokens, and spatial tokens will no longer do pooling and instead, it will use full tokens |
| 87 | + use_lita: True |
| 88 | + pretrain_mm_mlp_adapter: null # path to pretrained mm adapter |
| 89 | + mm_mlp_adapter_type: mlp2x_gelu # ['linear', 'mlp2x_gelu', 'mlp_downsample'] |
| 90 | + use_im_start_end: False |
| 91 | + |
| 92 | + # ========LORA configs start======= |
| 93 | + #peft: |
| 94 | + # peft_scheme: "lora" |
| 95 | + # restore_from_path: null |
| 96 | + # lora_tuning: |
| 97 | + # adapter_dim: 128 |
| 98 | + # alpha: 256 |
| 99 | + # target_modules: ['all'] |
| 100 | + # adapter_dropout: 0.0 |
| 101 | + # column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal |
| 102 | + # row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal |
| 103 | + # layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers |
| 104 | + # weight_tying: False |
| 105 | + # position_embedding_strategy: null # used only when weight_tying is True |
| 106 | + # =======LORA configs end======= |
| 107 | + |
| 108 | + # LLM configs |
| 109 | + # use GPTModel from megatron.core |
| 110 | + mcore_gpt: True |
| 111 | + |
| 112 | + # model architecture |
| 113 | + encoder_seq_length: 4096 |
| 114 | + max_position_embeddings: ${.encoder_seq_length} |
| 115 | + position_embedding_type: rope |
| 116 | + num_layers: 32 |
| 117 | + hidden_size: 4096 |
| 118 | + ffn_hidden_size: 11008 # Transformer FFN hidden size. Usually 4 * hidden_size. |
| 119 | + num_attention_heads: 32 |
| 120 | +  init_method_std: 0.014 # Standard deviation of the zero mean normal distribution used for weight initialization.
| 121 | + use_scaled_init_method: True # use scaled residuals initialization |
| 122 | + hidden_dropout: 0.0 # Dropout probability for hidden state transformer. |
| 123 | + attention_dropout: 0.0 # Dropout probability for attention |
| 124 | + ffn_dropout: 0.0 # Dropout probability in the feed-forward layer. |
| 125 | + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null |
| 126 | + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. |
| 127 | + normalization: 'rmsnorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm' |
| 128 | + layernorm_epsilon: 1e-5 |
| 129 | + do_layer_norm_weight_decay: False # True means weight decay on all params |
| 130 | + make_vocab_size_divisible_by: 16 # Pad the vocab size to be divisible by this value for computation efficiency. |
| 131 | + pre_process: True # add embedding |
| 132 | + post_process: True # add pooler |
| 133 | + persist_layer_norm: True # Use of persistent fused layer norm kernel. |
| 134 | + bias: False # Whether to use bias terms in all weight matrices. |
| 135 | + activation: 'fast-swiglu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu'] |
| 136 | +  headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
| 137 | + transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer'] |
| 138 | +  normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to set this to True.
| 139 | + rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this. |
| 140 | + attention_type: 'multihead' # Attention type. Options ['multihead'] |
| 141 | + share_embeddings_and_output_weights: False # Share embedding and output layer weights. |
| 142 | + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 |
| 143 | + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 |
| 144 | + seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595. |
| 145 | + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. |
| 146 | + |
| 147 | + ## Activation Checkpointing |
| 148 | + activations_checkpoint_granularity: null # 'selective' or 'full' |
| 149 | + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' |
| 150 | + activations_checkpoint_num_layers: null # not used with 'selective' |
| 151 | + num_micro_batches_with_partial_activation_checkpoints: null |
| 152 | + activations_checkpoint_layers_per_pipeline: null |
| 153 | + sequence_parallel: False |
| 154 | + |
| 155 | + # precision |
| 156 | + native_amp_init_scale: 4294967296 # 2 ** 32 |
| 157 | + native_amp_growth_interval: 1000 |
| 158 | + hysteresis: 2 # Gradient scale hysteresis |
| 159 | + fp32_residual_connection: False # Move residual connections to fp32 |
| 160 | + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 |
| 161 | + |
| 162 | + # model fusions |
| 163 | +  masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
| 164 | + bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition. |
| 165 | + |
| 166 | + use_cpu_initialization: False # Init weights on the CPU (slow for large models) |
| 167 | + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. |
| 168 | + gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism. |
| 169 | + openai_gelu: False |
| 170 | + bias_activation_fusion: False |
| 171 | + megatron_legacy: False |
| 172 | + |
| 173 | + transformer_engine: True |
| 174 | + fp8: False # enables fp8 in TransformerLayer forward |
| 175 | + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 |
| 176 | + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID |
| 177 | + fp8_margin: 0 # scaling margin |
| 178 | + fp8_interval: 1 # scaling update interval |
| 179 | + fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor |
| 180 | + fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history |
| 181 | + use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False. |
| 182 | + |
| 183 | + # Megatron O2-style half-precision |
| 184 | + megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters |
| 185 | + async_grad_allreduce: False |
| 186 | + grad_allreduce_chunk_size_mb: 125 |
| 187 | + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce |
| 188 | + |
| 189 | + # miscellaneous |
| 190 | + seed: 1234 |
| 191 | + resume_from_checkpoint: null # manually set the checkpoint file to load from |
| 192 | + apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this |
| 193 | + gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) |
| 194 | + |
| 195 | + tokenizer: |
| 196 | + library: 'sentencepiece' |
| 197 | + type: null |
| 198 | + model: /ws/converted_nemo_model/tokenizer_1_5.model |
| 199 | + vocab_file: null |
| 200 | + merge_file: null |
| 201 | + delimiter: null # only used for tabular tokenizer |
| 202 | + sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers. |
| 203 | + additional_special_tokens: null # ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>"] |
| 204 | + |
| 205 | + data: |
| 206 | + packed_sequence: False |
| 207 | + num_workers: 8 |
| 208 | + dataloader_type: cyclic |
| 209 | + data_path: null |
| 210 | + lazy_preprocess: True |
| 211 | + is_multimodal: True |
| 212 | + media_type: video # currently supported: image or video |
| 213 | + splice_single_frame: null # 'first', 'middle', 'last' will represent video as first / middle / last frame only, all other frames discarded. |
| 214 | + num_frames: 256 # selects the number of frames to use from the video |
| 215 | + sep_token_between_frames: False # TODO: allow usage of separator tokens between frames |
| 216 | + sep_image_conv_front: False |
| 217 | + image_token_len: 576 #lita 1.0 uses 256 |
| 218 | + conv_template: v1 # check `nemo/collections/multimodal/data/neva/conversation.py` |
| 219 | + image_folder: null |
| 220 | + video_folder: null |
| 221 | + image_aspect_ratio: 'pad' # lita 1.0 uses 'square' |
| 222 | + |
| 223 | + # Nsys profiling options |
| 224 | + nsys_profile: |
| 225 | + enabled: False |
| 226 | + start_step: 10 # Global batch to start profiling |
| 227 | + end_step: 10 # Global batch to end profiling |
| 228 | + ranks: [ 0 ] # Global rank IDs to profile |
| 229 | + gen_shape: False # Generate model and kernel details including input shapes |
| 230 | + |
| 231 | + optim: |
| 232 | + name: fused_adam |
| 233 | + lr: 2e-5 |
| 234 | + weight_decay: 0. |
| 235 | + betas: |
| 236 | + - 0.9 |
| 237 | + - 0.95 |
| 238 | + sched: |
| 239 | + name: CosineAnnealing |
| 240 | + warmup_steps: 140 |
| 241 | + constant_steps: 0 |
| 242 | + min_lr: 2e-7 |
0 commit comments