NVIDIA-NeMo · terrykong · May 13, 2025 · May 13, 2025
@@ -13,8 +13,10 @@ loss_fn:
   reference_policy_kl_penalty: 0.01
   ratio_clip_min: 0.2
   ratio_clip_max: 0.2
+  ratio_clip_c: null
   use_on_policy_kl_approximation: false
   use_importance_sampling_correction: false
+  token_level_loss: true
 checkpointing:
   enabled: true
   checkpoint_dir: results/grpo-gemma3-1b-it-1n8g-fsdp2tp1
@@ -75,6 +77,7 @@ policy:
     stop_token_ids: null
     stop_strings: null
     vllm_cfg:
+      precision: ${policy.precision}
       tensor_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: 512

@@ -13,8 +13,10 @@ loss_fn:
   reference_policy_kl_penalty: 0.01
   ratio_clip_min: 0.2
   ratio_clip_max: 0.2
+  ratio_clip_c: null
   use_on_policy_kl_approximation: false
   use_importance_sampling_correction: false
+  token_level_loss: true
 checkpointing:
   enabled: true
   checkpoint_dir: results/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long
@@ -75,6 +77,7 @@ policy:
     stop_token_ids: null
     stop_strings: null
     vllm_cfg:
+      precision: ${policy.precision}
       tensor_parallel_size: 4
       gpu_memory_utilization: 0.6
       max_model_len: 16384

@@ -16,6 +16,7 @@ loss_fn:
   ratio_clip_c: null
   use_on_policy_kl_approximation: false
   use_importance_sampling_correction: false
+  token_level_loss: true
 checkpointing:
   enabled: true
   checkpoint_dir: results/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long
@@ -77,6 +78,7 @@ policy:
       - 128009
     stop_strings: null
     vllm_cfg:
+      precision: ${policy.precision}
       tensor_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: 4096

@@ -16,6 +16,7 @@ loss_fn:
   ratio_clip_c: null
   use_on_policy_kl_approximation: false
   use_importance_sampling_correction: false
+  token_level_loss: true
 checkpointing:
   enabled: true
   checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1
@@ -77,6 +78,7 @@ policy:
       - 128009
     stop_strings: null
     vllm_cfg:
+      precision: ${policy.precision}
       tensor_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: 512

@@ -16,6 +16,7 @@ loss_fn:
   ratio_clip_c: null
   use_on_policy_kl_approximation: false
   use_importance_sampling_correction: false
+  token_level_loss: true
 checkpointing:
   enabled: true
   checkpoint_dir: results/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long
@@ -77,6 +78,7 @@ policy:
       - 151643
     stop_strings: null
     vllm_cfg:
+      precision: ${policy.precision}
       tensor_parallel_size: 4
       gpu_memory_utilization: 0.6
       max_model_len: 16384

@@ -16,6 +16,7 @@ loss_fn:
   ratio_clip_c: null
   use_on_policy_kl_approximation: false
   use_importance_sampling_correction: false
+  token_level_loss: true
 checkpointing:
   enabled: true
   checkpoint_dir: results/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt
@@ -77,6 +78,7 @@ policy:
       - 151643
     stop_strings: null
     vllm_cfg:
+      precision: ${policy.precision}
       tensor_parallel_size: 4
       gpu_memory_utilization: 0.6
       max_model_len: 16384

@@ -16,6 +16,7 @@ loss_fn:
   ratio_clip_c: null
   use_on_policy_kl_approximation: false
   use_importance_sampling_correction: false
+  token_level_loss: true
 checkpointing:
   enabled: true
   checkpoint_dir: results/grpo-qwen2.5-7b-instruct-4n8g-fsdp1
@@ -77,6 +78,7 @@ policy:
       - 151645
     stop_strings: null
     vllm_cfg:
+      precision: ${policy.precision}
       tensor_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: 4096

@@ -16,6 +16,7 @@ loss_fn:
   ratio_clip_c: null
   use_on_policy_kl_approximation: false
   use_importance_sampling_correction: false
+  token_level_loss: true
 checkpointing:
   enabled: true
   checkpoint_dir: results/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp
@@ -77,6 +78,7 @@ policy:
       - 151645
     stop_strings: null
     vllm_cfg:
+      precision: ${policy.precision}
       tensor_parallel_size: 4
       gpu_memory_utilization: 0.6
       max_model_len: 4096

@@ -16,6 +16,7 @@ loss_fn:
   ratio_clip_c: null
   use_on_policy_kl_approximation: false
   use_importance_sampling_correction: false
+  token_level_loss: true
 checkpointing:
   enabled: true
   checkpoint_dir: results/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1
@@ -77,6 +78,7 @@ policy:
       - 151645
     stop_strings: null
     vllm_cfg:
+      precision: ${policy.precision}
       tensor_parallel_size: 1
       gpu_memory_utilization: 0.6
       max_model_len: 512