Skip to content

Commit 57eebf8

Browse files
SlyneSlyne Dengyaoyu-33
authored and committed
LITA integration (#9578)
* add lita Signed-off-by: Slyne Deng <slyned@nvidia.com> * Apply isort and black reformatting Signed-off-by: Slyne <Slyne@users.noreply.github.com> * add part of the tutorial and fix format Signed-off-by: slyne deng <slyned@nvidia.com> * add tutorial Signed-off-by: slyne deng <slyned@nvidia.com> * fix Tutorial ckpt conversion Signed-off-by: slyne deng <slyned@nvidia.com> * Apply isort and black reformatting Signed-off-by: Slyne <Slyne@users.noreply.github.com> * update cicd Signed-off-by: Slyne Deng <slyned@nvidia.com> * add to CIICD test Signed-off-by: Slyne Deng <slyned@nvidia.com> * changes based on review comments Signed-off-by: Slyne Deng <slyned@nvidia.com> * fix bot warning Signed-off-by: Slyne Deng <slyned@nvidia.com> * update cicd main Signed-off-by: Slyne Deng <slyned@nvidia.com> * fix cicd ckpt conversion Signed-off-by: Slyne Deng <slyned@nvidia.com> --------- Signed-off-by: Slyne Deng <slyned@nvidia.com> Signed-off-by: Slyne <Slyne@users.noreply.github.com> Signed-off-by: slyne deng <slyned@nvidia.com> Co-authored-by: Slyne Deng <slyned@nvidia.com> Co-authored-by: Slyne <Slyne@users.noreply.github.com> Co-authored-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com>
1 parent 633343e commit 57eebf8

File tree

22 files changed

+3547
-110
lines changed

22 files changed

+3547
-110
lines changed

.github/workflows/cicd-main.yml

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,28 @@ jobs:
179179
rm -f /home/TestData/nlp/megatron_gpt/falcon-ci-hf/falcon_ci.nemo
180180
AFTER_SCRIPT: |
181181
rm -rf /home/TestData/nlp/megatron_gpt/falcon-ci-hf/model_weights
182-
182+
183+
# L2: Community llava multimodal Checkpoints tests
184+
L2_Community_vita_Checkpoints_tests_Llama3:
185+
needs: [cicd-test-container-setup]
186+
uses: ./.github/workflows/_test_template.yml
187+
with:
188+
RUNNER: self-hosted-azure
189+
SCRIPT: |
190+
export PYTHONPATH=/home/TestData/multimodal/video_neva/LLaVA:$PYTHONPATH
191+
CUDA_VISIBLE_DEVICES=0 python examples/multimodal/multimodal_llm/neva/convert_llava_to_neva.py \
192+
--in-file /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/llm \
193+
--mm-projector-ckpt-dir /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/mm_projector \
194+
--mm-vision-tower /home/TestData/multimodal/video_neva/Llama-3-VILA1.5-8B/vision_tower \
195+
--tokenizer-model /home/TestData/multimodal/video_neva/vita-tokenizer/ \
196+
--config-file vita_config.yaml \
197+
--out-file=/home/TestData/multimodal/video_neva/llama3-ci-hf/llama3_ci.nemo \
198+
--model-type VITA \
199+
--conv-template llama_3
200+
AFTER_SCRIPT: |
201+
rm -f /home/TestData/multimodal/video_neva/llama3-ci-hf/llama3_ci.nemo
202+
rm -rf /home/TestData/multimodal/video_neva/llama3-ci-hf/model_weights
203+
183204
# this test is using a 7B model which is too large for GitHub CI
184205
# replace the model in this test with a toy model or move the test
185206
# to the nightly CI
@@ -4535,6 +4556,7 @@ jobs:
45354556
- L2_Community_LLM_Checkpoints_tests_Llama
45364557
- L2_Community_LLM_Checkpoints_tests_StarCoder
45374558
- L2_Community_LLM_Checkpoints_tests_Falcon
4559+
- L2_Community_vita_Checkpoints_tests_Llama3
45384560
#- OPTIONAL_L2_Community_LLM_Checkpoints_tests_Baichuan2
45394561
- ASR_dev_run_Speech_to_Text
45404562
- ASR_dev_run_Speech_to_Text_WPE_-_CitriNet
Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
name: nemo_video_lita_neva
2+
restore_from_path: null # used when starting from a .nemo file
3+
4+
trainer:
5+
devices: 1
6+
num_nodes: 1
7+
accelerator: gpu
8+
precision: bf16
9+
logger: False # logger provided by exp_manager
10+
enable_checkpointing: False
11+
use_distributed_sampler: False
12+
max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
13+
max_steps: 10000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
14+
log_every_n_steps: 10
15+
val_check_interval: 100
16+
check_val_every_n_epoch: null
17+
limit_val_batches: 50
18+
limit_test_batches: 500
19+
accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
20+
gradient_clip_val: 1.0
21+
benchmark: False
22+
enable_model_summary: False # default PTL callback for this does not support model parallelism, instead we log manually
23+
24+
exp_manager:
25+
explicit_log_dir: null
26+
exp_dir: null
27+
name: nemo_video_neva_lita
28+
create_wandb_logger: True
29+
wandb_logger_kwargs:
30+
project: null
31+
name: null
32+
resume_if_exists: True
33+
resume_ignore_no_checkpoint: True
34+
resume_from_checkpoint: ${model.resume_from_checkpoint}
35+
create_checkpoint_callback: True
36+
checkpoint_callback_params:
37+
monitor: val_loss
38+
save_top_k: 5
39+
mode: min
40+
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
41+
save_nemo_on_train_end: True # not recommended when training large models on clusters with short time limits
42+
filename: 'megatron_clip--{val_loss:.2f}-{step}-{consumed_samples}'
43+
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
44+
ema:
45+
enable: False
46+
decay: 0.9999
47+
validate_original_weights: False
48+
every_n_steps: 1
49+
cpu_offload: False
50+
51+
model:
52+
precision: ${trainer.precision}
53+
54+
# specify micro_batch_size, global_batch_size, and model parallelism
55+
# gradient accumulation will be done automatically based on data_parallel_size
56+
57+
# Batch size guideline for different types of dataset
58+
micro_batch_size: 1 # limited by GPU memory
59+
global_batch_size: 2 # will use more micro batches to reach global batch size
60+
61+
tensor_model_parallel_size: 1 # intra-layer model parallelism
62+
pipeline_model_parallel_size: 1 # inter-layer model parallelism
63+
context_parallel_size: 1 # kqv model parallelism
64+
virtual_pipeline_model_parallel_size: null # interleaved pipeline
65+
66+
restore_from_path: null # used in fine-tuning
67+
68+
# Multimodal configs
69+
mm_cfg:
70+
llm:
71+
from_pretrained: null #path to nemo checkpoint
72+
freeze: False
73+
model_type: llama_2 # `nvgpt` or `llama_2` supported
74+
vision_encoder:
75+
from_pretrained: "Lin-Chen/ShareGPT4V-13B_Pretrained_vit-large336-l12" # huggingface path or name
76+
from_hf: True
77+
crop_size: [336, 336]
78+
patch_dim: 14
79+
hidden_size: 1024 # could be found from model but tricky in code
80+
vision_select_layer: -2 # negative index into the encoder's hidden states; -2 selects the second-to-last layer
81+
class_token_length: 1
82+
freeze: True
83+
lita:
84+
lita_video_arch: 'temporal_all_resolution' # ['temporal_spatial_pool', 'temporal_spatial', 'temporal_all_resolution'] 'temporal_spatial_pool' is used in lita1.0
85+
visual_token_format: 'im_vid_start_end' # ["v1", "im_vid_start_end"] v1 means do nothing, im_vid_start_end means add image and video start and end tokens around spatial and temporal tokens
86+
sample_frames: 4 # for lita 1.5, sample_frames frames are used for spatial tokens; spatial tokens are no longer pooled and the full tokens are used instead
87+
use_lita: True
88+
pretrain_mm_mlp_adapter: null # path to pretrained mm adapter
89+
mm_mlp_adapter_type: mlp2x_gelu # ['linear', 'mlp2x_gelu', 'mlp_downsample']
90+
use_im_start_end: False
91+
92+
# ========LORA configs start=======
93+
#peft:
94+
# peft_scheme: "lora"
95+
# restore_from_path: null
96+
# lora_tuning:
97+
# adapter_dim: 128
98+
# alpha: 256
99+
# target_modules: ['all']
100+
# adapter_dropout: 0.0
101+
# column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
102+
# row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
103+
# layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers
104+
# weight_tying: False
105+
# position_embedding_strategy: null # used only when weight_tying is True
106+
# =======LORA configs end=======
107+
108+
# LLM configs
109+
# use GPTModel from megatron.core
110+
mcore_gpt: True
111+
112+
# model architecture
113+
encoder_seq_length: 4096
114+
max_position_embeddings: ${.encoder_seq_length}
115+
position_embedding_type: rope
116+
num_layers: 32
117+
hidden_size: 4096
118+
ffn_hidden_size: 11008 # Transformer FFN hidden size. Usually 4 * hidden_size.
119+
num_attention_heads: 32
120+
init_method_std: 0.014 # Standard deviation of the zero mean normal distribution used for weight initialization.
121+
use_scaled_init_method: True # use scaled residuals initialization
122+
hidden_dropout: 0.0 # Dropout probability for hidden state transformer.
123+
attention_dropout: 0.0 # Dropout probability for attention
124+
ffn_dropout: 0.0 # Dropout probability in the feed-forward layer.
125+
kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
126+
apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
127+
normalization: 'rmsnorm' # Normalization layer to use. Options are 'layernorm', 'rmsnorm'
128+
layernorm_epsilon: 1e-5
129+
do_layer_norm_weight_decay: False # True means weight decay on all params
130+
make_vocab_size_divisible_by: 16 # Pad the vocab size to be divisible by this value for computation efficiency.
131+
pre_process: True # add embedding
132+
post_process: True # add pooler
133+
persist_layer_norm: True # Use of persistent fused layer norm kernel.
134+
bias: False # Whether to use bias terms in all weight matrices.
135+
activation: 'fast-swiglu' # Options ['gelu', 'geglu', 'swiglu', 'reglu', 'squared-relu', 'fast-geglu', 'fast-swiglu', 'fast-reglu']
136+
headscale: False # Whether to learn extra parameters that scale the output of each self-attention head.
137+
transformer_block_type: 'pre_ln' # Options ['pre_ln', 'post_ln', 'normformer']
138+
normalize_attention_scores: True # Whether to scale the output Q * K^T by 1 / sqrt(hidden_size_per_head). This arg is provided as a configuration option mostly for compatibility with models that have been weight-converted from HF. You almost always want to set this to True.
139+
rotary_percentage: 1.0 # If using position_embedding_type=rope, then the per head dim is multiplied by this.
140+
attention_type: 'multihead' # Attention type. Options ['multihead']
141+
share_embeddings_and_output_weights: False # Share embedding and output layer weights.
142+
overlap_p2p_comm: False # Overlap p2p communication with compute. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
143+
batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
144+
seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595.
145+
num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.
146+
147+
## Activation Checkpointing
148+
activations_checkpoint_granularity: null # 'selective' or 'full'
149+
activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
150+
activations_checkpoint_num_layers: null # not used with 'selective'
151+
num_micro_batches_with_partial_activation_checkpoints: null
152+
activations_checkpoint_layers_per_pipeline: null
153+
sequence_parallel: False
154+
155+
# precision
156+
native_amp_init_scale: 4294967296 # 2 ** 32
157+
native_amp_growth_interval: 1000
158+
hysteresis: 2 # Gradient scale hysteresis
159+
fp32_residual_connection: False # Move residual connections to fp32
160+
fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16
161+
162+
# model fusions
163+
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
164+
bias_dropout_add_fusion: False # Use a kernel that fuses the bias addition, dropout and residual connection addition.
165+
166+
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
167+
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
168+
gradient_accumulation_fusion: False # Fuse weight gradient accumulation to GEMMs. Only used with pipeline parallelism.
169+
openai_gelu: False
170+
bias_activation_fusion: False
171+
megatron_legacy: False
172+
173+
transformer_engine: True
174+
fp8: False # enables fp8 in TransformerLayer forward
175+
fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
176+
fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID
177+
fp8_margin: 0 # scaling margin
178+
fp8_interval: 1 # scaling update interval
179+
fp8_amax_history_len: 1 # Number of steps for which amax history is recorded per tensor
180+
fp8_amax_compute_algo: most_recent # 'most_recent' or 'max'. Algorithm for computing amax from history
181+
use_emha: False # Use fused multi-head attention for large sequence-length. Note this is not yet supported. Please set to False.
182+
183+
# Megatron O2-style half-precision
184+
megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters
185+
async_grad_allreduce: False
186+
grad_allreduce_chunk_size_mb: 125
187+
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
188+
189+
# miscellaneous
190+
seed: 1234
191+
resume_from_checkpoint: null # manually set the checkpoint file to load from
192+
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
193+
gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
194+
195+
tokenizer:
196+
library: 'sentencepiece'
197+
type: null
198+
model: /ws/converted_nemo_model/tokenizer_1_5.model
199+
vocab_file: null
200+
merge_file: null
201+
delimiter: null # only used for tabular tokenizer
202+
sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers.
203+
additional_special_tokens: null # ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>"]
204+
205+
data:
206+
packed_sequence: False
207+
num_workers: 8
208+
dataloader_type: cyclic
209+
data_path: null
210+
lazy_preprocess: True
211+
is_multimodal: True
212+
media_type: video # currently supported: image or video
213+
splice_single_frame: null # 'first', 'middle', 'last' will represent video as first / middle / last frame only, all other frames discarded.
214+
num_frames: 256 # selects the number of frames to use from the video
215+
sep_token_between_frames: False # TODO: allow usage of separator tokens between frames
216+
sep_image_conv_front: False
217+
image_token_len: 576 #lita 1.0 uses 256
218+
conv_template: v1 # check `nemo/collections/multimodal/data/neva/conversation.py`
219+
image_folder: null
220+
video_folder: null
221+
image_aspect_ratio: 'pad' # lita 1.0 uses 'square'
222+
223+
# Nsys profiling options
224+
nsys_profile:
225+
enabled: False
226+
start_step: 10 # Global batch to start profiling
227+
end_step: 10 # Global batch to end profiling
228+
ranks: [ 0 ] # Global rank IDs to profile
229+
gen_shape: False # Generate model and kernel details including input shapes
230+
231+
optim:
232+
name: fused_adam
233+
lr: 2e-5
234+
weight_decay: 0.
235+
betas:
236+
- 0.9
237+
- 0.95
238+
sched:
239+
name: CosineAnnealing
240+
warmup_steps: 140
241+
constant_steps: 0
242+
min_lr: 2e-7

0 commit comments

Comments
 (0)