
Commit 0f7e8e8

Merge branch 'main' of github.com:NVIDIA-NeMo/RL into ashors/gpt-oss-tot
2 parents: aaa1c12 + 8762f57

46 files changed: 1011 additions & 340 deletions


.gitmodules

Lines changed: 3 additions & 3 deletions
```diff
@@ -1,12 +1,12 @@
 [submodule "3rdparty/Megatron-LM"]
     path = 3rdparty/Megatron-LM-workspace/Megatron-LM
-    url = https://github.com/ashors1/Megatron-LM.git
-    branch = gpt-oss-tot2
+    url = https://github.com/terrykong/Megatron-LM.git
+    branch = ashors/dev-with-gpt-oss
     shallow = true
 [submodule "3rdparty/Megatron-Bridge"]
     path = 3rdparty/Megatron-Bridge-workspace/Megatron-Bridge
     url = https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
-    branch = main
+    branch = ashors/gpt-oss-tot
     shallow = true
 [submodule "3rdparty/Automodel-workspace/Automodel"]
     path = 3rdparty/Automodel-workspace/Automodel
Submodule Megatron-Bridge updated 70 files
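Since this commit re-points both submodule URLs and tracked branches, an existing checkout needs a resync before the new pointers take effect. A minimal sketch using standard git commands, run from the repository root:

```bash
# Propagate the updated .gitmodules URLs into .git/config, then check out
# the submodule commits recorded by this merge.
git submodule sync --recursive
git submodule update --init --recursive
```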

3rdparty/Megatron-Bridge-workspace/setup.py

Lines changed: 3 additions & 4 deletions
```diff
@@ -37,13 +37,12 @@
     "pyyaml>=6.0.2",
     "tqdm>=4.67.1",
     "hydra-core>1.3,<=1.3.2",
+    "megatron-core[dev,mlm]>=0.15.0a0,<0.16.0",
     "qwen-vl-utils",
-    "causal-conv1d",
+    "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
     "mamba-ssm",
-    "megatron-core[dev,mlm]>=0.15.0a0,<0.16.0",
     "nvidia-resiliency-ext",
-    "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
-    "transformers>=4.57.1",
+    "causal-conv1d",
 ]

 # If the bridge source exists, compare cached dependencies with the submodule's pyproject
```
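The surviving context line above hints at a consistency check between these cached pins and the submodule's own metadata. A minimal sketch of what such a comparison could look like (paths, names, and logic are illustrative assumptions, not the repository's actual code):

```python
import tomllib  # stdlib TOML parser, Python 3.11+
from pathlib import Path

# Hypothetical subset of the pins cached in setup.py above.
CACHED_DEPS = {
    "megatron-core[dev,mlm]>=0.15.0a0,<0.16.0",
    "transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
}

bridge_pyproject = Path(
    "3rdparty/Megatron-Bridge-workspace/Megatron-Bridge/pyproject.toml"
)
if bridge_pyproject.exists():
    project = tomllib.loads(bridge_pyproject.read_text())["project"]
    drift = CACHED_DEPS - set(project.get("dependencies", []))
    if drift:
        print(f"Cached pins not found in the submodule's pyproject: {drift}")
```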

README.md

Lines changed: 3 additions & 3 deletions
```diff
@@ -8,7 +8,7 @@
 DAPO extends GRPO with **Clip-Higher**, **Dynamic Sampling**, **Token-Level Policy Gradient Loss**, and **Overlong Reward Shaping** for more stable and efficient RL training. See the [DAPO guide](docs/guides/dapo.md) for more details.
 * [9/30/2025] [Accelerated RL on GCP with NeMo RL!](https://discuss.google.dev/t/accelerating-reinforcement-learning-on-google-cloud-using-nvidia-nemo-rl/269579/4)
 * [9/27/2025] [FP8 Quantization in NeMo RL](https://github.com/NVIDIA-NeMo/RL/discussions/1216)
-* [9/25/2025] On-policy Distillation (Qwen3-style)
+* [9/25/2025] On-policy Distillation
   * Student generates on-policy sequences and aligns logits to a larger teacher via KL, achieving near-larger-model quality at lower cost than RL. See [On-policy Distillation](#on-policy-distillation).

 <details>
@@ -71,12 +71,12 @@ For detailed information on backend selection, configuration, and examples, see
 - 🔜 **Megatron Bridge Integration** - Integrate Megatron Bridge to enable training features from Megatron Core.
 - 🔜 **NeMo Automodel Integration** - Integrate NeMo Automodel to power our DTensor path.
 - 🔜 **New Models** - gpt-oss.
-- 🔜 **Expand Algorithms** - DAPO, GSPO, On-policy Distillation.
+- 🔜 **Expand Algorithms** - DAPO, GSPO.
 - 🔜 **GB200** - Add container support for GB200.
 - ✅ **Distributed Training** - Ray-based infrastructure.
 - ✅ **Environment Support and Isolation** - Support for multi-environment training and dependency isolation between components.
 - ✅ **Worker Isolation** - Process isolation between RL Actors (no worries about global state).
-- ✅ **Learning Algorithms** - GRPO/GSPO, SFT, and DPO.
+- ✅ **Learning Algorithms** - GRPO/GSPO, SFT, DPO, and On-policy distillation.
 - ✅ **Multi-Turn RL** - Multi-turn generation and training for RL with tool use, games, etc.
 - ✅ **Advanced Parallelism with DTensor** - PyTorch FSDP2, TP, CP, and SP for efficient training.
 - ✅ **Larger Model Support with Longer Sequences** - Performant parallelisms with Megatron Core (TP/PP/CP/SP/EP/FSDP).
```

docs/guides/async-grpo.md

Lines changed: 9 additions & 0 deletions
````diff
@@ -41,6 +41,8 @@ grpo:
   async_grpo:
     enabled: true
     max_trajectory_age_steps: 1 # Maximum age, in training steps, for trajectories
+    in_flight_weight_updates: false # Enable for faster weight synchronization
+    recompute_kv_cache_after_weight_updates: false # Invalidates the KV cache after in-flight weight updates
 ```

 ### Complete Example Config
@@ -65,6 +67,8 @@ grpo:
   async_grpo:
     enabled: true
     max_trajectory_age_steps: 1
+    in_flight_weight_updates: false # Enable for faster weight synchronization
+    recompute_kv_cache_after_weight_updates: false # Invalidates the KV cache after in-flight weight updates

 cluster:
   num_nodes: 2
@@ -158,6 +162,11 @@ sequenceDiagram

 3. **Resource Allocation**: Ensure sufficient GPU memory for both the training and generation clusters

+4. **In-Flight Weight Updates**: Enable `in_flight_weight_updates: true` when using `async_engine: true` to update the vLLM engine's weights during generation. This avoids stalling the training pipeline until the longest generation finishes and provides significant performance benefits.
+
+5. **Recompute KV Cache After Weight Updates**: When using in-flight weight updates, you can choose whether to recompute KV caches after a weight update via the `recompute_kv_cache_after_weight_updates` setting.
+
 ## Why Importance Sampling Correction Is Required for Async

 ### The GRPO Objective
````
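Taken together, tips 4 and 5 suggest a configuration like the sketch below when the async vLLM engine is in use (only the `grpo.async_grpo` keys come from the docs above; `async_engine` is enabled in the generation config, whose exact key path is not shown here):

```yaml
grpo:
  async_grpo:
    enabled: true
    max_trajectory_age_steps: 1
    in_flight_weight_updates: true                 # push fresh weights into vLLM mid-generation
    recompute_kv_cache_after_weight_updates: true  # drop KV entries computed with stale weights
```

Leaving `recompute_kv_cache_after_weight_updates: false` keeps cached prefixes that were computed with the pre-update weights, avoiding the recompute cost at the price of some staleness.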

docs/guides/grpo.md

Lines changed: 4 additions & 5 deletions
```diff
@@ -28,7 +28,7 @@ In this guide, we'll walk through how we handle:

 We support training with multiple RL "Environments" at the same time.

-An [Environment](../../nemo_rl/environments/interfaces.py) is an object that accepts a state/action history and returns an update state and rewards for the step. They run as Ray Remote Actors. Example [MathEnvironment](../../nemo_rl/environments/math_environment.py).
+An [Environment](../../nemo_rl/environments/interfaces.py) is an object that accepts a state/action history and returns an updated state and rewards for the step. They run as Ray Remote Actors. Example [MathEnvironment](../../nemo_rl/environments/math_environment.py).

 To support this, we need to know:
```

```diff
@@ -163,9 +163,8 @@ L(\theta) = E_t \Big[ \max \Big( \min \big(r_t(\theta) A_t, \text{clip}(r_t(\the
 $$

 where:
-- c is the dual-clip parameter (ratio_clip_c), which must be greater than 1 and is
-usually set as 3 empirically
-- $r_t(\theta)$ is the ratio $\frac{\pi_\theta(x)}{\pi_{\theta_{\text{old}}}(x)}$ that measures how much the policy has change
+- c is the dual-clip parameter (ratio_clip_c), which must be greater than 1 and is usually set as 3 empirically
+- $r_t(\theta)$ is the ratio $\frac{\pi_\theta(x)}{\pi_{\theta_{\text{old}}}(x)}$ that measures how much the policy has changed

 ### Improvements to the GRPO Loss Formulation for Stability and Accuracy
```
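For readers who parse objectives more easily as code, here is a minimal per-token sketch of the dual-clip objective above, following the standard convention that the extra `max` with $c A_t$ applies only when the advantage is negative (tensor names and default values are illustrative, not NeMo RL's actual implementation):

```python
import torch

def dual_clip_objective(logprobs, old_logprobs, advantages,
                        eps=0.2, ratio_clip_c=3.0):
    """Per-token dual-clip objective; higher is better (negate for a loss)."""
    ratio = torch.exp(logprobs - old_logprobs)  # r_t(theta)
    clipped_ratio = torch.clamp(ratio, 1.0 - eps, 1.0 + eps)
    # Standard PPO clipped surrogate.
    surrogate = torch.min(ratio * advantages, clipped_ratio * advantages)
    # Dual clip: for negative advantages, floor the surrogate at c * A_t so an
    # exploding ratio cannot produce an unbounded update (requires c > 1).
    return torch.where(
        advantages < 0,
        torch.max(surrogate, ratio_clip_c * advantages),
        surrogate,
    )
```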

````diff
@@ -279,7 +278,7 @@ We observed a case where vLLM assigned a disproportionately high probability to
 logp_gen (from vLLM): -5.xxx
 logp_policy (from Mcore): -15.xxx
 ```
-Assuming other tokens have near-zero divergence, this single token's metrics are:
+Assuming other tokens have near-zero divergence, this single token's metrics with `kl_type=k3` are:

 * `gen_kl_error`: exp(-15 + 5) - (-15 + 5) - 1 ≈ 9 (moderate mismatch)
 * `policy_kl_error`: exp(-5 + 15) - (-5 + 15) - 1 ≈ 22,015 (severe mismatch dominating the metric)
````
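Both bullets are the k3 estimator term `exp(Δ) - Δ - 1` with the log-probability gap taken in opposite directions; a quick sketch reproduces the numbers (values from the example above):

```python
import math

def k3_term(delta: float) -> float:
    # Per-token k3 KL estimator term: exp(delta) - delta - 1 >= 0.
    return math.exp(delta) - delta - 1.0

logp_gen, logp_policy = -5.0, -15.0     # the single mismatched token above
print(k3_term(logp_policy - logp_gen))  # gen_kl_error    ~ 9.0
print(k3_term(logp_gen - logp_policy))  # policy_kl_error ~ 22015.5
```

The exponential term is what lets a single badly mismatched token dominate `policy_kl_error`.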

docs/nsys-profiling.md

Lines changed: 2 additions & 1 deletion
```diff
@@ -100,7 +100,8 @@ To analyze the generated profile files, load the `.nsys-rep` files into the NVID

 Nsight Systems supports [multi-report view](https://docs.nvidia.com/nsight-systems/UserGuide/index.html#viewing-multiple-reports-in-the-same-timeline) functionality. If you open the profiles from different workers (e.g., `*policy_worker*.nsys-rep` and `*generation_worker*.nsys-rep`) in a single multi-report view, you can analyze the behavior of the end-to-end RL loop on the same timeline.

-<img src="assets/nsys-multi-report-view.png" alt="Pretraining loss curves" width="1000"/>
+
+![Nsys multi report view](./assets/nsys-multi-report-view.png)

 ## How We Patched Nsight Support in Ray
```
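As a small aid for assembling such a multi-report session, the per-worker reports can be located with a glob before opening them together in the GUI (output paths are illustrative and depend on your profiling setup):

```bash
# Find the per-role reports produced by a profiled run, then open them
# together in the Nsight Systems GUI to place them on one timeline.
ls ./profile_output/*policy_worker*.nsys-rep \
   ./profile_output/*generation_worker*.nsys-rep
```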

examples/configs/distillation_math.yaml

Lines changed: 0 additions & 1 deletion
```diff
@@ -145,7 +145,6 @@ policy: &POLICY_BASE
       grad_reduce_in_fp32: false
       overlap_grad_reduce: true
       overlap_param_gather: true
-      average_in_collective: true
       use_custom_fsdp: false
       data_parallel_sharding_strategy: "optim_grads_params"

```

examples/configs/distillation_math_megatron.yaml

Lines changed: 0 additions & 1 deletion
```diff
@@ -99,7 +99,6 @@ policy: &POLICY_BASE
       grad_reduce_in_fp32: false
       overlap_grad_reduce: true
       overlap_param_gather: true
-      average_in_collective: true
       use_custom_fsdp: false
       data_parallel_sharding_strategy: "optim_grads_params"

```
