Skip to content

Commit e9523c2

Browse files
committed
Merge branch 'main' into zhanda/debug-accuracy
2 parents 300d1e2 + b721703 commit e9523c2

29 files changed

Lines changed: 3294 additions & 98 deletions

.github/actions/test-template/action.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ runs:
162162
--shm-size=64g \
163163
--env TRANSFORMERS_OFFLINE=0 \
164164
--env HYDRA_FULL_ERROR=1 \
165+
--env HF_HUB_OFFLINE=1 \
165166
--env HF_HOME=/home/TestData/nemo-rl/hf_home \
166167
--env HF_DATASETS_CACHE=/home/TestData/nemo-rl/hf_datasets_cache \
167168
--env NEMO_RL_REPO_DIR=/opt/nemo-rl \

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ hf_datasets_cache/
3434
datasets/
3535
docker/*
3636
!docker/Dockerfile
37+
!docker/Dockerfile.ngc_pytorch
3738
!docker/README.md
3839
wandb/
3940
checkpoints/

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[submodule "3rdparty/NeMo"]
22
path = 3rdparty/NeMo-workspace/NeMo
33
url = https://github.com/NVIDIA/NeMo.git
4-
branch = https://github.com/NVIDIA/NeMo/tree/ashors/rl-qwen3-export
4+
branch = pjin/ashors/rl-qwen3-export
55
shallow = true
66
[submodule "3rdparty/Megatron-LM"]
77
path = 3rdparty/Megatron-LM-workspace/Megatron-LM

docker/Dockerfile.ngc_pytorch

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
# syntax=docker/dockerfile:1
# This Dockerfile is used to build a Docker image for NeMo RL with the NGC PyTorch base image.
# However, it is still a work in progress and is not yet ready for production use.
#
# Usage:
#   Self-contained build (default: builds from main): docker buildx build -f docker/Dockerfile.ngc_pytorch --tag <registry>/nemo-rl:latest --push .
#   Self-contained build (specific git ref): docker buildx build -f docker/Dockerfile.ngc_pytorch --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push .
#   Self-contained build (remote NeMo RL source; no need for a local clone of NeMo RL): docker buildx build -f docker/Dockerfile.ngc_pytorch --build-arg NRL_GIT_REF=r0.3.0 --tag <registry>/nemo-rl:r0.3.0 --push https://github.com/NVIDIA-NeMo/RL.git
#   Local NeMo RL source override: docker buildx build --build-context nemo-rl=. -f docker/Dockerfile.ngc_pytorch --tag <registry>/nemo-rl:latest --push .
#
# If installing new dependencies in the container, then use "uv pip install new-dependency"
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.06-py3

# Stage that materializes the NeMo RL source tree. By default it is cloned from
# GitHub at NRL_GIT_REF; a local checkout can be substituted via
# `--build-context nemo-rl=.` without touching this stage.
FROM scratch AS nemo-rl
ARG NRL_GIT_REF=main
ADD --keep-git-dir=true https://github.com/NVIDIA-NeMo/RL.git#${NRL_GIT_REF} /

FROM ${BASE_IMAGE} AS base

# It is more convenient for users to run as root
USER root

RUN <<"EOF" bash -exu -o pipefail
export DEBIAN_FRONTEND=noninteractive
export TZ=America/Los_Angeles

apt-get update
# Packages sorted alphabetically; no trailing backslash after the last entry
# (the original left a dangling `\` continuation before a blank line).
apt-get install -y --no-install-recommends \
    curl \
    git \
    jq \
    less \
    rsync \
    vim \
    wget

apt-get clean
rm -rf /var/lib/apt/lists/*
EOF

# Install uv at /usr/local/bin in case the root home directory is bind mounted
ARG UV_VERSION=0.7.2
RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | XDG_BIN_HOME=/usr/local/bin sh

# Disable usage stats by default for users who are sensitive to sharing usage.
# Users are encouraged to enable if they wish.
ENV RAY_USAGE_STATS_ENABLED=0
ENV NEMO_RL_VENV_DIR=/opt/ray_venvs

# Build vLLM from source to use with the NVIDIA PyTorch base image
FROM base AS build_vllm

ARG MAX_JOBS=32
WORKDIR /opt
COPY --from=nemo-rl uv.lock /tmp/uv.lock

RUN <<"EOF" bash -exu -o pipefail
echo "Building vLLM from source for PyTorch base image"
# Read the pinned vLLM version out of the uv lockfile so the wheel we build
# matches exactly what `uv sync` expects later.
VLLM_VERSION=$(grep -A 1 'name = "vllm"' /tmp/uv.lock | grep 'version =' | sed 's/version = "\(.*\)"/\1/')
echo "Building vLLM version: $VLLM_VERSION"
git clone https://github.com/vllm-project/vllm.git
cd vllm
git checkout v$VLLM_VERSION
# Reuse the torch shipped in the NGC base image instead of pulling a new one.
python use_existing_torch.py
pip install --no-cache-dir -r requirements/build.txt
pip wheel --no-deps --no-build-isolation -v .
EOF

FROM base AS hermetic

WORKDIR /opt/nemo-rl

# Variables to control the build of TE. If there are issues with parallelization, consider
# setting these to 1.
ARG MAX_JOBS
ARG NVTE_BUILD_THREADS_PER_JOB

ENV UV_PROJECT_ENVIRONMENT=/opt/nemo_rl_venv
ENV UV_CACHE_DIR=/opt/uv_cache
ENV UV_LINK_MODE=copy

# Define the no-install-package arguments for PyTorch base images: these
# packages already ship in the NGC image (or are built above) and must not be
# reinstalled from PyPI by uv.
ARG BASE_IMAGE
ARG UV_NO_INSTALL_PACKAGES="--no-install-package torch --no-install-package torchvision --no-install-package triton --no-install-package nvidia-cublas-cu12 --no-install-package nvidia-cuda-cupti-cu12 --no-install-package nvidia-cuda-nvrtc-cu12 --no-install-package nvidia-cuda-runtime-cu12 --no-install-package nvidia-cudnn-cu12 --no-install-package nvidia-cufft-cu12 --no-install-package nvidia-cufile-cu12 --no-install-package nvidia-curand-cu12 --no-install-package nvidia-cusolver-cu12 --no-install-package nvidia-cusparse-cu12 --no-install-package nvidia-cusparselt-cu12 --no-install-package nvidia-nccl-cu12 --no-install-package vllm --no-install-package flash-attn --no-install-package transformer-engine --no-install-package transformer-engine-cu12 --no-install-package transformer-engine-torch --no-install-package numpy"
ENV UV_NO_INSTALL_PACKAGES=${UV_NO_INSTALL_PACKAGES}
ENV PATH="/opt/nemo_rl_venv/bin:$PATH"

# First copy only the dependency files so this expensive layer stays cached
# when only application source changes.
COPY --from=nemo-rl pyproject.toml uv.lock ./
COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/

RUN --mount=type=bind,from=build_vllm,source=/opt/,target=/tmp/build_vllm/ <<"EOF" bash -exu -o pipefail
# uv sync has a more reliable resolver than simple uv pip install which can fail
# The venv is symlinked to avoid bloating the layer size
uv venv --system-site-packages ${UV_PROJECT_ENVIRONMENT}
uv pip install --no-cache-dir --no-deps /tmp/build_vllm/vllm/vllm*.whl
uv sync --link-mode symlink --locked --inexact --extra vllm --extra mcore --extra automodel --all-groups --no-install-project $UV_NO_INSTALL_PACKAGES
EOF

ENV NEMO_RL_VENV_DIR=/opt/ray_venvs

WORKDIR /opt/nemo-rl

FROM hermetic AS release

ARG NEMO_RL_COMMIT
ARG NVIDIA_BUILD_ID
ARG NVIDIA_BUILD_REF
ENV UV_NO_SYNC=1
ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-<unknown>}
ENV NVIDIA_BUILD_ID=${NVIDIA_BUILD_ID:-<unknown>}
ENV NVIDIA_BUILD_REF=${NVIDIA_BUILD_REF:-<unknown>}
ENV NEMO_RL_PY_EXECUTABLES_SYSTEM=1
# The 25.06 PyTorch container is not compatible with vllm standalone compile so we disable it
ENV VLLM_USE_STANDALONE_COMPILE=0
LABEL com.nvidia.build.id="${NVIDIA_BUILD_ID}"
LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}"

ENV NEMO_RL_VENV_DIR=/opt/ray_venvs

# Copy in source from build context (defaults to cloned repo, can be overridden)
COPY --from=nemo-rl . /opt/nemo-rl
# Unshallow the repo to get the full history (in the case it was from the scratch layer).
# Potentially not necessary if the repo is passed in as a complete repository (w/ full git history),
# so do a quick check before trying to unshallow.
RUN git rev-parse --is-shallow-repository | grep -q true && git fetch --unshallow || true
RUN UV_LINK_MODE=symlink uv sync --locked --inexact $UV_NO_INSTALL_PACKAGES

docs/adding-new-models.md

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,3 +152,42 @@ uv run --extra vllm tools/model_diagnostics/2.long_generation_decode_vs_prefill.
152152
# ...
153153
# [Qwen/Qwen2.5-1.5B] ALL GOOD!
154154
```
155+
156+
## [3.check_hf_model_embeddings_untrained.py](https://github.com/NVIDIA-NeMo/RL/blob/main/tools/model_diagnostics/3.check_hf_model_embeddings_untrained.py)
157+
158+
Detects untrained or improperly initialized Hugging Face model embeddings by scanning for near-zero rows and rows with near-identical values in both input and output embeddings. The script also reports whether word embeddings are tied and summarizes basic statistics.
159+
160+
```sh
161+
# Example run
162+
uv run --extra mcore tools/model_diagnostics/3.check_hf_model_embeddings_untrained.py --model nvidia/Nemotron-H-8B-Base-8K
163+
164+
# ....
165+
#================================================================================
166+
#EMBEDDING SUMMARIES
167+
#================================================================================
168+
#
169+
#--- Input Embeddings Summary ---
170+
#Shape: torch.Size([131072, 4096]), Dtype: torch.bfloat16
171+
#Near-zero embeddings (abs < 1.00e-10): 1039/131072 (0.8%)
172+
# Indices: 0-1,3-999,1192-1193,1245-1255,55014,77579,81772,81819,82312,82500,82725,82737,82977,84020,84121,84521,84794,85015,86409,87411,89412,90320,91368,94485,96385,104097,108262,112147,112327,112497,114755
173+
#Identical embeddings (std < 1.00e-08): 1041/131072 (0.8%)
174+
# Indices: 0-1,3-999,1192-1193,1245-1255,55014,77579,81772,81819,82312,82500,82725,82737,82977,83855,84020,84121,84521,84794,85015,86409,87411,89412,90320,91368,94485,96385,101707,104097,108262,112147,112327,112497,114755
175+
#Statistics: mean_abs=0.007874, max_abs=0.196289, std_range=[0.000000, 0.015442]
176+
#⚠️ POTENTIAL ISSUES: 1039 near-zero embeddings, 1041 identical embeddings
177+
#
178+
#--- Output Embeddings Summary (Tied: False) ---
179+
#Shape: torch.Size([131072, 4096]), Dtype: torch.bfloat16
180+
#Near-zero embeddings (abs < 1.00e-10): 0/131072 (0.0%)
181+
#Identical embeddings (std < 1.00e-08): 0/131072 (0.0%)
182+
#Statistics: mean_abs=0.006775, max_abs=0.200195, std_range=[0.004089, 0.021240]
183+
#✅ No obvious untrained patterns detected
184+
#
185+
#=== Final Summary ===
186+
#Model: nvidia/Nemotron-H-8B-Base-8K
187+
#Analysis complete.
188+
```
189+
190+
- Thresholds can be adjusted via flags:
191+
- `--near-zero-threshold` (default: `1e-10`)
192+
- `--identical-threshold` (default: `1e-8`)
193+
- If any near-zero or identical rows are reported, the model may have issues of numerical instability (e.g., inf grad norms) during post-training if any of these problematic tokens are encountered. We have observed this happening when special tokens are reserved in the tokenizer and embedding, but none are encountered during pre-training. It may help to initialize these embeddings similarly to how they were initialized during pre-training.

docs/design-docs/generation.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ A key design principle for generation backends is that they process tokens direc
6262

6363
## VLLM Backend
6464

65-
The VLLM backend (`models/generation/vllm.py`) implements the {py:class}`GenerationInterface <nemo_rl.models.generation.interfaces.GenerationInterface>` to provide efficient text generation using the VLLM library, which is optimized for large language models.
65+
The VLLM backend (`models/generation/vllm/vllm_generation.py`) implements the {py:class}`GenerationInterface <nemo_rl.models.generation.interfaces.GenerationInterface>` to provide efficient text generation using the VLLM library, which is optimized for large language models.
6666

6767
### VllmGeneration Class
6868

docs/guides/grpo.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ This Policy object holds a [RayWorkerGroup](../../nemo_rl/distributed/worker_gro
107107

108108
## Fast Generation
109109

110-
We support vLLM through the [VllmGeneration](../../nemo_rl/models/generation/vllm.py) class right now.
110+
We support vLLM through the [VllmGeneration](../../nemo_rl/models/generation/vllm/vllm_generation.py) class right now.
111111

112112
The function [grpo_train](../../nemo_rl/algorithms/grpo.py) contains the core GRPO training loop.
113113

examples/configs/grpo_math_1B.yaml

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ policy:
4141
logprob_batch_size: 4
4242
max_total_sequence_length: 512
4343
precision: "bfloat16"
44+
logprob_chunk_size: null
4445

4546
dtensor_cfg:
4647
enabled: true
@@ -53,6 +54,65 @@ policy:
5354

5455
megatron_cfg:
5556
enabled: false
57+
empty_unused_memory_level: 0
58+
activation_checkpointing: false
59+
converter_type: "Qwen2ForCausalLM"
60+
tensor_model_parallel_size: 1
61+
expert_tensor_parallel_size: 1
62+
expert_model_parallel_size: 1
63+
pipeline_model_parallel_size: 1
64+
num_layers_in_first_pipeline_stage: null
65+
num_layers_in_last_pipeline_stage: null
66+
context_parallel_size: 1
67+
pipeline_dtype: ${policy.precision}
68+
sequence_parallel: false
69+
freeze_moe_router: true
70+
moe_router_dtype: "fp64"
71+
moe_router_load_balancing_type: "none" # "seq_aux_loss" causes logprob error divergence for grpo
72+
moe_router_bias_update_rate: 0.0 # by default, disable bias updates for grpo
73+
#gives ~20% training perf speedup with sequence packing
74+
apply_rope_fusion: True
75+
defer_fp32_logits: null
76+
77+
optimizer:
78+
optimizer: "adam"
79+
lr: 5.0e-6
80+
min_lr: 5.0e-7
81+
weight_decay: 0.01
82+
bf16: true
83+
fp16: false
84+
params_dtype: "float32"
85+
86+
#adam
87+
adam_beta1: 0.9
88+
adam_beta2: 0.999
89+
adam_eps: 1e-8
90+
91+
#sgd
92+
sgd_momentum: 0.9
93+
94+
#distributed optimizer
95+
use_distributed_optimizer: true
96+
use_precision_aware_optimizer: true
97+
98+
clip_grad: ${policy.max_grad_norm}
99+
100+
scheduler:
101+
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
102+
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
103+
weight_decay_incr_style: "constant"
104+
lr_decay_style: "constant"
105+
lr_decay_iters: null
106+
lr_warmup_iters: 13
107+
lr_warmup_init: 5.0e-7
108+
109+
distributed_data_parallel_config:
110+
grad_reduce_in_fp32: false
111+
overlap_grad_reduce: true
112+
overlap_param_gather: true
113+
average_in_collective: true
114+
use_custom_fsdp: false
115+
data_parallel_sharding_strategy: "optim_grads_params"
56116

57117
# See docs/design-docs/sequence-packing-and-dynamic-batching.md
58118
# for more details on dynamic batching and sequence packing.

examples/configs/grpo_math_qwen30ba3b_megatron.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,6 @@ policy:
5656
lr_warmup_iters: 13
5757
lr_warmup_init: 3.0e-8
5858

59-
env_vars:
60-
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"
61-
6259
generation:
6360
backend: "vllm"
6461
max_new_tokens: ${policy.max_total_sequence_length}

0 commit comments

Comments
 (0)