Skip to content

Commit e9523c2

Browse files
committed
Merge branch 'main' into zhanda/debug-accuracy
2 parents 300d1e2 + b721703 commit e9523c2

29 files changed

Lines changed: 3294 additions & 98 deletions

.github/actions/test-template/action.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ runs:
162162
--shm-size=64g \
163163
--env TRANSFORMERS_OFFLINE=0 \
164164
--env HYDRA_FULL_ERROR=1 \
165+
--env HF_HUB_OFFLINE=1 \
165166
--env HF_HOME=/home/TestData/nemo-rl/hf_home \
166167
--env HF_DATASETS_CACHE=/home/TestData/nemo-rl/hf_datasets_cache \
167168
--env NEMO_RL_REPO_DIR=/opt/nemo-rl \

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ hf_datasets_cache/
3434
datasets/
3535
docker/*
3636
!docker/Dockerfile
37+
!docker/Dockerfile.ngc_pytorch
3738
!docker/README.md
3839
wandb/
3940
checkpoints/

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[submodule "3rdparty/NeMo"]
22
path = 3rdparty/NeMo-workspace/NeMo
33
url = https://github.com/NVIDIA/NeMo.git
4-
branch = https://github.com/NVIDIA/NeMo/tree/ashors/rl-qwen3-export
4+
branch = pjin/ashors/rl-qwen3-export
55
shallow = true
66
[submodule "3rdparty/Megatron-LM"]
77
path = 3rdparty/Megatron-LM-workspace/Megatron-LM

docker/Dockerfile.ngc_pytorch

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
# syntax=docker/dockerfile:1
# This Dockerfile is used to build a Docker image for NeMo RL with the NGC PyTorch base image.
# However, it is still a work in progress and is not yet ready for production use.
#
# Usage:
#   Self-contained build (default: builds from main): docker buildx build -f docker/Dockerfile.ngc_pytorch --tag <registry>/nemo-rl:latest --push .
#   Self-contained build (specific git ref): docker buildx build -f docker/Dockerfile.ngc_pytorch --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push .
#   Self-contained build (remote NeMo RL source; no need for a local clone of NeMo RL): docker buildx build -f docker/Dockerfile.ngc_pytorch --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push https://github.com/NVIDIA-NeMo/RL.git
#   Local NeMo RL source override: docker buildx build --build-context nemo-rl=. -f docker/Dockerfile.ngc_pytorch --tag <registry>/nemo-rl:latest --push .
#
# If installing new dependencies in the container, then use "uv pip install new-dependency"
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.06-py3

# Stage that materializes the NeMo RL source tree. By default it is cloned from
# GitHub at NRL_GIT_REF; a local checkout can be substituted via
# `--build-context nemo-rl=.` without touching this stage.
FROM scratch AS nemo-rl
ARG NRL_GIT_REF=main
ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} /

FROM ${BASE_IMAGE} AS base

# It is more convenient for users to run as root
USER root

RUN <<"EOF" bash -exu -o pipefail
export DEBIAN_FRONTEND=noninteractive
export TZ=America/Los_Angeles

apt-get update
# Packages sorted alphabetically; no trailing backslash after the last entry
# (the original left a dangling `\` continuation before a blank line).
apt-get install -y --no-install-recommends \
    curl \
    git \
    jq \
    less \
    rsync \
    vim \
    wget

apt-get clean
rm -rf /var/lib/apt/lists/*
EOF

# Install uv at /usr/local/bin in case the root home directory is bind mounted
ARG UV_VERSION=0.7.2
RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | XDG_BIN_HOME=/usr/local/bin sh

# Disable usage stats by default for users who are sensitive to sharing usage.
# Users are encouraged to enable if they wish.
ENV RAY_USAGE_STATS_ENABLED=0
ENV NEMO_RL_VENV_DIR=/opt/ray_venvs

# Build vLLM from source to use with the NVIDIA PyTorch base image
FROM base AS build_vllm

ARG MAX_JOBS=32
WORKDIR /opt
COPY --from=nemo-rl uv.lock /tmp/uv.lock

RUN <<"EOF" bash -exu -o pipefail
echo "Building vLLM from source for PyTorch base image"
# Read the pinned vLLM version out of the uv lockfile so the wheel we build
# matches exactly what `uv sync` expects later.
VLLM_VERSION=$(grep -A 1 'name = "vllm"' /tmp/uv.lock | grep 'version =' | sed 's/version = "\(.*\)"/\1/')
echo "Building vLLM version: $VLLM_VERSION"
git clone https://github.com/vllm-project/vllm.git
cd vllm
git checkout v$VLLM_VERSION
# Reuse the torch shipped in the NGC base image instead of pulling a new one.
python use_existing_torch.py
pip install --no-cache-dir -r requirements/build.txt
pip wheel --no-deps --no-build-isolation -v .
EOF

FROM base AS hermetic

WORKDIR /opt/nemo-rl

# Variables to control the build of TE. If there are issues with parallelization, consider
# setting these to 1.
ARG MAX_JOBS
ARG NVTE_BUILD_THREADS_PER_JOB

ENV UV_PROJECT_ENVIRONMENT=/opt/nemo_rl_venv
ENV UV_CACHE_DIR=/opt/uv_cache
ENV UV_LINK_MODE=copy

# Define the no-install-package arguments for PyTorch base images: these
# packages already ship in the NGC image (or are built above) and must not be
# reinstalled from PyPI by uv.
ARG BASE_IMAGE
ARG UV_NO_INSTALL_PACKAGES="--no-install-package torch --no-install-package torchvision --no-install-package triton --no-install-package nvidia-cublas-cu12 --no-install-package nvidia-cuda-cupti-cu12 --no-install-package nvidia-cuda-nvrtc-cu12 --no-install-package nvidia-cuda-runtime-cu12 --no-install-package nvidia-cudnn-cu12 --no-install-package nvidia-cufft-cu12 --no-install-package nvidia-cufile-cu12 --no-install-package nvidia-curand-cu12 --no-install-package nvidia-cusolver-cu12 --no-install-package nvidia-cusparse-cu12 --no-install-package nvidia-cusparselt-cu12 --no-install-package nvidia-nccl-cu12 --no-install-package vllm --no-install-package flash-attn --no-install-package transformer-engine --no-install-package transformer-engine-cu12 --no-install-package transformer-engine-torch --no-install-package numpy"
ENV UV_NO_INSTALL_PACKAGES=${UV_NO_INSTALL_PACKAGES}
ENV PATH="/opt/nemo_rl_venv/bin:$PATH"

# First copy only the dependency files so this expensive layer stays cached
# when only application source changes.
COPY --from=nemo-rl pyproject.toml uv.lock ./
COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/

RUN --mount=type=bind,from=build_vllm,source=/opt/,target=/tmp/build_vllm/ <<"EOF" bash -exu -o pipefail
# uv sync has a more reliable resolver than simple uv pip install which can fail
# The venv is symlinked to avoid bloating the layer size
uv venv --system-site-packages ${UV_PROJECT_ENVIRONMENT}
uv pip install --no-cache-dir --no-deps /tmp/build_vllm/vllm/vllm*.whl
uv sync --link-mode symlink --locked --inexact --extra vllm --extra mcore --extra automodel --all-groups --no-install-project $UV_NO_INSTALL_PACKAGES
EOF

ENV NEMO_RL_VENV_DIR=/opt/ray_venvs

WORKDIR /opt/nemo-rl

FROM hermetic AS release

ARG NEMO_RL_COMMIT
ARG NVIDIA_BUILD_ID
ARG NVIDIA_BUILD_REF
ENV UV_NO_SYNC=1
ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-<unknown>}
ENV NVIDIA_BUILD_ID=${NVIDIA_BUILD_ID:-<unknown>}
ENV NVIDIA_BUILD_REF=${NVIDIA_BUILD_REF:-<unknown>}
ENV NEMO_RL_PY_EXECUTABLES_SYSTEM=1
# The 25.06 PyTorch container is not compatible with vllm standalone compile so we disable it
ENV VLLM_USE_STANDALONE_COMPILE=0
LABEL com.nvidia.build.id="${NVIDIA_BUILD_ID}"
LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}"

ENV NEMO_RL_VENV_DIR=/opt/ray_venvs

# Copy in source from build context (defaults to cloned repo, can be overridden)
COPY --from=nemo-rl . /opt/nemo-rl
# Unshallow the repo to get the full history (in the case it was from the scratch layer).
# Potentially not necessary if the repo is passed in as a complete repository (w/ full git history),
# so do a quick check before trying to unshallow.
RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow || true
RUN UV_LINK_MODE=symlink uv sync --locked --inexact $UV_NO_INSTALL_PACKAGES

docs/adding-new-models.md

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,3 +152,42 @@ uv run --extra vllm tools/model_diagnostics/2.long_generation_decode_vs_prefill.
152152
# ...
153153
# [Qwen/Qwen2.5-1.5B] ALL GOOD!
154154
```
155+
156+
## [3.check_hf_model_embeddings_untrained.py](https://github.com/NVIDIA-NeMo/RL/blob/main/tools/model_diagnostics/3.check_hf_model_embeddings_untrained.py)
157+
158+
Detects untrained or improperly initialized Hugging Face model embeddings by scanning for near-zero rows and rows with near-identical values in both input and output embeddings. The script also reports whether word embeddings are tied and summarizes basic statistics.
159+
160+
```sh
161+
# Example run
162+
uv run --extra mcore tools/model_diagnostics/3.check_hf_model_embeddings_untrained.py --model nvidia/Nemotron-H-8B-Base-8K
163+
164+
# ....
165+
#================================================================================
166+
#EMBEDDING SUMMARIES
167+
#================================================================================
168+
#
169+
#--- Input Embeddings Summary ---
170+
#Shape: torch.Size([131072, 4096]), Dtype: torch.bfloat16
171+
#Near-zero embeddings (abs < 1.00e-10): 1039/131072 (0.8%)
172+
# Indices: 0-1,3-999,1192-1193,1245-1255,55014,77579,81772,81819,82312,82500,82725,82737,82977,84020,84121,84521,84794,85015,86409,87411,89412,90320,91368,94485,96385,104097,108262,112147,112327,112497,114755
173+
#Identical embeddings (std < 1.00e-08): 1041/131072 (0.8%)
174+
# Indices: 0-1,3-999,1192-1193,1245-1255,55014,77579,81772,81819,82312,82500,82725,82737,82977,83855,84020,84121,84521,84794,85015,86409,87411,89412,90320,91368,94485,96385,101707,104097,108262,112147,112327,112497,114755
175+
#Statistics: mean_abs=0.007874, max_abs=0.196289, std_range=[0.000000, 0.015442]
176+
#⚠️ POTENTIAL ISSUES: 1039 near-zero embeddings, 1041 identical embeddings
177+
#
178+
#--- Output Embeddings Summary (Tied: False) ---
179+
#Shape: torch.Size([131072, 4096]), Dtype: torch.bfloat16
180+
#Near-zero embeddings (abs < 1.00e-10): 0/131072 (0.0%)
181+
#Identical embeddings (std < 1.00e-08): 0/131072 (0.0%)
182+
#Statistics: mean_abs=0.006775, max_abs=0.200195, std_range=[0.004089, 0.021240]
183+
#✅ No obvious untrained patterns detected
184+
#
185+
#=== Final Summary ===
186+
#Model: nvidia/Nemotron-H-8B-Base-8K
187+
#Analysis complete.
188+
```
189+
190+
- Thresholds can be adjusted via flags:
191+
- `--near-zero-threshold` (default: `1e-10`)
192+
- `--identical-threshold` (default: `1e-8`)
193+
- If any near-zero or identical rows are reported, the model may have issues of numerical instability (e.g., inf grad norms) during post-training if any of these problematic tokens are encountered. We have observed this happening when special tokens are reserved in the tokenizer and embedding, but none are encountered during pre-training. It may help to initialize these embeddings similarly to how they were initialized during pre-training.

docs/design-docs/generation.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ A key design principle for generation backends is that they process tokens direc
6262

6363
## VLLM Backend
6464

65-
The VLLM backend (`models/generation/vllm.py`) implements the {py:class}`GenerationInterface <nemo_rl.models.generation.interfaces.GenerationInterface>` to provide efficient text generation using the VLLM library, which is optimized for large language models.
65+
The VLLM backend (`models/generation/vllm/vllm_generation.py`) implements the {py:class}`GenerationInterface <nemo_rl.models.generation.interfaces.GenerationInterface>` to provide efficient text generation using the VLLM library, which is optimized for large language models.
6666

6767
### VllmGeneration Class
6868

docs/guides/grpo.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ This Policy object holds a [RayWorkerGroup](../../nemo_rl/distributed/worker_gro
107107

108108
## Fast Generation
109109

110-
We support vLLM through the [VllmGeneration](../../nemo_rl/models/generation/vllm.py) class right now.
110+
We support vLLM through the [VllmGeneration](../../nemo_rl/models/generation/vllm/vllm_generation.py) class right now.
111111

112112
The function [grpo_train](../../nemo_rl/algorithms/grpo.py) contains the core GRPO training loop.
113113

examples/configs/grpo_math_1B.yaml

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ policy:
4141
logprob_batch_size: 4
4242
max_total_sequence_length: 512
4343
precision: "bfloat16"
44+
logprob_chunk_size: null
4445

4546
dtensor_cfg:
4647
enabled: true
@@ -53,6 +54,65 @@ policy:
5354

5455
megatron_cfg:
5556
enabled: false
57+
empty_unused_memory_level: 0
58+
activation_checkpointing: false
59+
converter_type: "Qwen2ForCausalLM"
60+
tensor_model_parallel_size: 1
61+
expert_tensor_parallel_size: 1
62+
expert_model_parallel_size: 1
63+
pipeline_model_parallel_size: 1
64+
num_layers_in_first_pipeline_stage: null
65+
num_layers_in_last_pipeline_stage: null
66+
context_parallel_size: 1
67+
pipeline_dtype: ${policy.precision}
68+
sequence_parallel: false
69+
freeze_moe_router: true
70+
moe_router_dtype: "fp64"
71+
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
72+
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
73+
#gives ~20% training perf speedup with sequence packing
74+
apply_rope_fusion: True
75+
defer_fp32_logits: null
76+
77+
optimizer:
78+
optimizer: "adam"
79+
lr: 5.0e-6
80+
min_lr: 5.0e-7
81+
weight_decay: 0.01
82+
bf16: true
83+
fp16: false
84+
params_dtype: "float32"
85+
86+
#adam
87+
adam_beta1: 0.9
88+
adam_beta2: 0.999
89+
adam_eps: 1e-8
90+
91+
#sgd
92+
sgd_momentum: 0.9
93+
94+
#distributed optimizer
95+
use_distributed_optimizer: true
96+
use_precision_aware_optimizer: true
97+
98+
clip_grad: ${policy.max_grad_norm}
99+
100+
scheduler:
101+
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
102+
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
103+
weight_decay_incr_style: "constant"
104+
lr_decay_style: "constant"
105+
lr_decay_iters: null
106+
lr_warmup_iters: 13
107+
lr_warmup_init: 5.0e-7
108+
109+
distributed_data_parallel_config:
110+
grad_reduce_in_fp32: false
111+
overlap_grad_reduce: true
112+
overlap_param_gather: true
113+
average_in_collective: true
114+
use_custom_fsdp: false
115+
data_parallel_sharding_strategy: "optim_grads_params"
56116

57117
# See docs/design-docs/sequence-packing-and-dynamic-batching.md
58118
# for more details on dynamic batching and sequence packing.

examples/configs/grpo_math_qwen30ba3b_megatron.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,6 @@ policy:
5656
lr_warmup_iters: 13
5757
lr_warmup_init: 3.0e-8
5858

59-
env_vars:
60-
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"
61-
6259
generation:
6360
backend: "vllm"
6461
max_new_tokens: ${policy.max_total_sequence_length}

0 commit comments

Comments
 (0)