Merged
8 changes: 4 additions & 4 deletions .github/workflows/e2e_gsm8k_megatron.yml
@@ -1,4 +1,5 @@
name: e2e_gsm8k_megatron
# latest version: Megatron-LM core_r0.11.0 https://github.com/NVIDIA/Megatron-LM/tree/core_r0.11.0

on:
# Trigger the workflow on push or pull request,
@@ -33,7 +34,7 @@ jobs:
NO_PROXY: "localhost,127.0.0.1"
HF_HUB_ENABLE_HF_TRANSFER: 1
container:
image: verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
image: whatcanyousee/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te2.0-megatron0.11.0-v0.0.5
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -49,11 +50,10 @@ jobs:
- name: Running gsm8k e2e training tests on 8 L20 GPUs with Megatron (Deepseek)
run: |
ray stop --force
[ ! -d "$HOME/Megatron-LM" ] && git clone -b core_v0.4.0_verl https://github.com/eric-haibin-lin/Megatron-LM $HOME/Megatron-LM
export PYTHONPATH=$PYTHONPATH:$HOME/Megatron-LM
export PYTHONPATH=$PYTHONPATH:/opt/nvidia/Megatron-LM
bash tests/e2e/run_deepseek_megatron.sh
- name: Running gsm8k e2e training tests on 8 L20 GPUs with Megatron (Qwen)
run: |
ray stop --force
export PYTHONPATH=$PYTHONPATH:$HOME/Megatron-LM
export PYTHONPATH=$PYTHONPATH:/opt/nvidia/Megatron-LM
bash tests/e2e/run_qwen_megatron.sh
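With the new image, the workflow no longer clones Megatron-LM into $HOME; it imports the checkout baked into the container at /opt/nvidia/Megatron-LM via PYTHONPATH. A minimal sanity check along these lines could be run inside the container (a sketch under that assumption, not part of the workflow):

# Sketch: confirm that megatron.core resolves to the checkout shipped in the image.
# Assumes PYTHONPATH already includes /opt/nvidia/Megatron-LM, as exported above.
import megatron.core

assert "/opt/nvidia/Megatron-LM" in str(megatron.core.__file__), megatron.core.__file__
print("megatron.core imported from:", megatron.core.__file__)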
9 changes: 9 additions & 0 deletions docker/Dockerfile.megatron
@@ -0,0 +1,9 @@
FROM verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3

RUN pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable

RUN cd /opt/nvidia && git clone --single-branch --branch core_r0.11.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM

# only config pip index with https://pypi.tuna.tsinghua.edu.cn/simple if needed
# unset for now
RUN cd /opt/nvidia/Megatron-LM && pip3 install --no-deps -e .
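The image builds on the existing verl vLLM base, adds the stable branch of TransformerEngine, and installs the core_r0.11.0 checkout of Megatron-LM in editable mode with --no-deps, so its dependencies come from the base image. A quick hedged version check inside the container might look like this; the attribute and module names are assumptions about what the installed packages expose:

# Sketch: report the Transformer Engine and Megatron-LM core versions in the image.
# transformer_engine.__version__ and megatron/core/package_info.py are assumed to exist.
import transformer_engine as te
from megatron.core import package_info

print("Transformer Engine:", te.__version__)        # expect a build from the stable branch
print("Megatron core:", package_info.__version__)   # expect 0.11.x for core_r0.11.0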
4 changes: 3 additions & 1 deletion verl/models/llama/megatron/checkpoint_utils/llama_loader.py
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
Collaborator
Could you keep the v0.4 patch file for now, in case others want to run v0.4 for comparison? Thanks!

Collaborator
We can remove the v0.4 patch after the next stable release of verl.

Collaborator (Author)
OK, I will add that back.

# limitations under the License.

import importlib
from packaging.version import Version
import torch
import time
from typing import Dict, Any, Callable, Optional
@@ -53,7 +55,7 @@ def load_state_dict_to_megatron_llama(state_dict, wrapped_models, config, params
"""
import megatron
from megatron.core import mpu
from megatron.utils import print_rank_0, unwrap_model
from megatron.training.utils import print_rank_0, unwrap_model
from megatron.core.transformer.module import Float16Module
from megatron.core import DistributedDataParallel as LocalDDP
from torch.nn.parallel import DistributedDataParallel as torchDDP
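The loader now takes print_rank_0 and unwrap_model from megatron.training.utils, which is where Megatron-LM core_r0.11.0 keeps them. If the older core_v0.4 layout also has to keep working (see the discussion above about keeping the v0.4 patch), a guarded import is one option; this is only a sketch, not what the PR does:

# Sketch of a version-tolerant import; the PR itself targets core_r0.11.0 only.
try:
    # Megatron-LM core_r0.11.0 layout
    from megatron.training.utils import print_rank_0, unwrap_model
except ImportError:
    # older layout used with the core_v0.4 patch
    from megatron.utils import print_rank_0, unwrap_model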
16 changes: 10 additions & 6 deletions verl/models/llama/megatron/checkpoint_utils/llama_saver.py
@@ -12,17 +12,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import megatron
from megatron.core import mpu
from megatron.utils import print_rank_0, unwrap_model
from megatron.model import Float16Module
from megatron.model import DistributedDataParallel as LocalDDP
import importlib
from packaging.version import Version
from torch.nn.parallel import DistributedDataParallel as torchDDP
import torch
import time
from typing import Optional
import torch.distributed as dist

import megatron
from megatron import get_args
from megatron.core import mpu
from megatron.core.transformer.module import Float16Module
from megatron.core.distributed import DistributedDataParallel as LocalDDP

from megatron.training.utils import print_rank_0, unwrap_model
Collaborator
We copied several module and util functions from the Megatron-LM package into megatron_utils.py. It would be better if we could avoid importing from outside megatron.core.

Collaborator (Author)
Sure, fixing this.
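For reference, removing the dependency on megatron.training could mean keeping small local copies of the two helpers in verl's megatron_utils.py. Roughly (a sketch under that assumption, mirroring Megatron's behavior rather than showing verl's actual code):

# Sketch: local equivalents of print_rank_0 and unwrap_model so that the
# savers/loaders only depend on megatron.core. Names and location are assumptions.
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as torchDDP

def print_rank_0(message):
    # Print only on global rank 0 to avoid duplicated logs across workers.
    if dist.is_initialized():
        if dist.get_rank() == 0:
            print(message, flush=True)
    else:
        print(message, flush=True)

def unwrap_model(model, module_instances=(torchDDP,)):
    # Strip wrapper modules (e.g. DDP, Float16Module) to reach the underlying model.
    return_list = True
    if not isinstance(model, list):
        model = [model]
        return_list = False
    unwrapped = []
    for module in model:
        while isinstance(module, module_instances):
            module = module.module
        unwrapped.append(module)
    return unwrapped if return_list else unwrapped[0]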



def _megatron_calc_global_rank(tp_rank: int = 0, dp_rank: int = 0, pp_rank: int = 0):
@@ -77,7 +81,7 @@ def merge_megatron_ckpt_llama(wrapped_models, config, is_value_model=False, dtyp
"""Merge sharded parameters of a Megatron module into a merged checkpoint.

Args:
wrapped_models (list of megatron.model.DistributedDataParallel):
wrapped_models (list of megatron.core.distributed.DistributedDataParallel):
The local DDP wrapped megatron modules.
dtype (str or None):
The data type of state_dict. if None, the data type of the original parameters
verl/models/qwen2/megatron/checkpoint_utils/qwen2_loader.py
@@ -53,7 +53,7 @@ def load_state_dict_to_megatron_qwen2(state_dict, wrapped_models, config, params
"""
import megatron
from megatron.core import mpu
from megatron.utils import print_rank_0, unwrap_model
from megatron.training.utils import print_rank_0, unwrap_model
from megatron.core.transformer.module import Float16Module
from megatron.core import DistributedDataParallel as LocalDDP
from torch.nn.parallel import DistributedDataParallel as torchDDP
8 changes: 4 additions & 4 deletions verl/models/qwen2/megatron/checkpoint_utils/qwen2_saver.py
@@ -14,9 +14,9 @@

import megatron
from megatron.core import mpu
from megatron.utils import print_rank_0, unwrap_model
from megatron.model import Float16Module
from megatron.model import DistributedDataParallel as LocalDDP
from megatron.training.utils import print_rank_0, unwrap_model
from megatron.core.transformer.module import Float16Module
from megatron.core.distributed import DistributedDataParallel as LocalDDP
from torch.nn.parallel import DistributedDataParallel as torchDDP
import torch
import time
@@ -77,7 +77,7 @@ def merge_megatron_ckpt_llama(wrapped_models, config, is_value_model=False, dtyp
"""Merge sharded parameters of a Megatron module into a merged checkpoint.

Args:
wrapped_modelss (list of megatron.model.DistributedDataParallel):
wrapped_modelss (list of megatron.core.distributed.DistributedDataParallel):
The local DDP wrapped megatron modules.
dtype (str or None):
The data type of state_dict. if None, the data type of the original parameters
73 changes: 11 additions & 62 deletions verl/utils/megatron/optimizer.py
@@ -13,14 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
from packaging.version import Version

from apex.optimizers import FusedAdam as Adam
from apex.optimizers import FusedSGD as SGD
from megatron.optimizer.distrib_optimizer import DistributedOptimizer
from megatron.optimizer.grad_scaler import ConstantGradScaler, DynamicGradScaler
from megatron.optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer
from megatron.optimizer import get_param_groups

from verl.utils.megatron.optimizer_config import OptimizerConfig
from megatron.core.optimizer import OptimizerConfig

from megatron.core.optimizer import get_megatron_optimizer as get_megatron_optimizer_native


def get_megatron_optimizer(
@@ -33,60 +34,8 @@
overlap_param_gather=False # add for verl
):
# Base optimizer.
param_groups = get_param_groups(model, no_weight_decay_cond, scale_lr_cond, lr_mult)

if config.optimizer == 'adam':
optimizer = Adam(param_groups,
lr=config.lr,
weight_decay=config.weight_decay,
betas=(config.adam_beta1, config.adam_beta2),
eps=config.adam_eps)
elif config.optimizer == 'sgd':
optimizer = SGD(param_groups, lr=config.lr, weight_decay=config.weight_decay, momentum=config.sgd_momentum)
else:
raise Exception('{} optimizer is not supported.'.format(config.optimizer))

# Determine whether the params have main-grad field.
params_have_main_grad = True

# Mixed precision optimizer.
# - Note: both the Float16Optimizer and the DistributedOptimizer inherit
# from the MixedPrecisionOptimizer, which manages any optimizer where
# the model params and main params are distinct.
if config.fp16 or config.bf16 or config.use_distributed_optimizer:

# Grad scaler:
# if loss-scale is provided, instantiate the constant scaler.
# if we are using fp16 and loss-scale is not present, use a
# dynamic scaler.
# otherwise we are running in bf16 with no loss-scale so
# leave it as None.
grad_scaler = None

# Constant loss scale.
if config.loss_scale:
grad_scaler = ConstantGradScaler(config.loss_scale)

# Dynamic loss scale.
else:
if config.fp16:
grad_scaler = DynamicGradScaler(initial_scale=config.initial_loss_scale,
min_scale=config.min_loss_scale,
growth_factor=2.0,
backoff_factor=0.5,
growth_interval=config.loss_scale_window,
hysteresis=config.hysteresis)

# Megatron optimizer.
if config.use_distributed_optimizer:
return DistributedOptimizer(optimizer, config.clip_grad, config.log_num_zeros_in_grad,
check_for_nan_in_loss_and_grad, params_have_main_grad, config.fp16, config.bf16,
config.params_dtype, grad_scaler, model, overlap_param_gather)
else:
return Float16OptimizerWithFloat16Params(optimizer, config.clip_grad, config.log_num_zeros_in_grad,
check_for_nan_in_loss_and_grad, params_have_main_grad, config.fp16,
config.bf16, config.params_dtype, grad_scaler, model)

# FP32.
return FP32Optimizer(optimizer, config.clip_grad, config.log_num_zeros_in_grad, check_for_nan_in_loss_and_grad,
params_have_main_grad, model)
return get_megatron_optimizer_native(config=config,
model_chunks=model,
no_weight_decay_cond=no_weight_decay_cond,
scale_lr_cond=scale_lr_cond,
lr_mult=lr_mult)
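After the refactor, the wrapper simply forwards to megatron.core.optimizer.get_megatron_optimizer, so every hyperparameter travels through megatron.core's OptimizerConfig instead of the deleted verl copy. A hedged usage sketch follows, assuming the wrapper keeps its model/config parameters; the field values and the actor_module name are illustrative only:

# Sketch: building an optimizer through the slimmed-down verl wrapper.
# `actor_module` stands in for the DDP-wrapped Megatron model chunks.
import torch
from megatron.core.optimizer import OptimizerConfig
from verl.utils.megatron.optimizer import get_megatron_optimizer

opt_config = OptimizerConfig(
    optimizer="adam",
    lr=1e-6,
    weight_decay=0.01,
    bf16=True,
    params_dtype=torch.bfloat16,
    use_distributed_optimizer=True,
    clip_grad=1.0,
)
optimizer = get_megatron_optimizer(model=actor_module, config=opt_config)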
129 changes: 0 additions & 129 deletions verl/utils/megatron/optimizer_config.py

This file was deleted.
