
Commit bd9d6a9

[Trainer] GSPO support (#120)
This PR adds support for [Group Sequence Policy Optimization (GSPO)](https://arxiv.org/abs/2507.18071), the hotness du jour from Alibaba Qwen. The implementation in this PR is loosely based on [this one](huggingface/trl#3775) from TRL. It adds a `gspo` option for the `policy_loss_type` config, which switches importance sampling from the token level (PPO/GRPO) to the sequence level (GSPO).

I ran a short/small GSM8k run with Qwen2.5-0.5B and the loss curves look okay:

<img width="314" height="240" alt="image" src="https://github.com/user-attachments/assets/f52d7c64-416c-4419-aa96-4a03c9048007" />

However, I had to hack a few things to get this to run on Datadog's cloud infra (including changing some dependency versions), so I'd encourage one of the maintainers to reproduce these results locally before merging.
1 parent 582ffc4 commit bd9d6a9

File tree

4 files changed: +222 −4 lines


skyrl-train/docs/configuration/config.rst

Lines changed: 9 additions & 3 deletions

@@ -290,7 +290,7 @@ Algorithm Configuration
 # this adds training batch level normalization to advantages
 advantage_batch_normalize: false
 value_head_prefix: "value_head"
-policy_loss_type: "regular" # "regular", "dual_clip", or customizable with PolicyLossRegistry
+policy_loss_type: "regular" # "regular", "dual_clip", "gspo", or customizable with PolicyLossRegistry
 loss_reduction: "token_mean" # "token_mean", "sequence_mean"

 # GAE parameters
@@ -315,8 +315,14 @@ Algorithm Configuration
 - ``algorithm.kl_loss_coef``: Coefficient for the KL divergence loss.
 - ``algorithm.advantage_batch_normalize``: Whether to normalize advantages by the (global) batch mean and standard deviation.
 - ``algorithm.value_head_prefix``: The name used to identify the value head in the critic model.
-- ``algorithm.policy_loss_type``: Type of PPO loss to use. Currently, we implement ``regular`` and ``dual_clip``, where ``regular`` is the vanilla PPO loss, while ``dual_clip`` is the dual clip PPO loss proposed in `this paper <https://arxiv.org/pdf/1912.09729>`_. Custom policy losses can be registered with the ``PolicyLossRegistry``.
-- ``algorithm.loss_reduction``: Type of PPO loss reduction to use. Currently, we support ``token_mean`` and ``sequence_mean``. ``token_mean`` matches token-level loss introduced by `DAPO <https://dapo-sia.github.io/>`_. ``sequence_mean`` computes per-sequence avg token loss, then averages over the batch.
+- ``algorithm.policy_loss_type``: Type of policy loss to use. Options include:
+
+  - ``regular``: Vanilla PPO loss with token-level importance sampling
+  - ``dual_clip``: Dual clip PPO loss proposed in `this paper <https://arxiv.org/pdf/1912.09729>`_
+  - ``gspo``: `Group Sequence Policy Optimization <https://arxiv.org/abs/2507.18071>`_ with sequence-level importance sampling for improved training stability. Implements the "GSPO-token" variant from the paper.
+  - Custom policy losses can be registered with the ``PolicyLossRegistry``
+
+- ``algorithm.loss_reduction``: Type of loss reduction to use. Options are ``token_mean`` and ``sequence_mean``. ``token_mean`` matches the token-level loss introduced by `DAPO <https://dapo-sia.github.io/>`_. ``sequence_mean`` computes the per-sequence average token loss, then averages over the batch.
 - ``algorithm.lambd``: Lambda parameter for GAE.
 - ``algorithm.gamma``: Gamma parameter for GAE.
 - ``algorithm.eps_clip_low``: Lower bound for PPO clipping.
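
To make the two reduction modes concrete, here is a plain-Python sketch (toy numbers; the real implementation is ``reduce_loss`` in ``ppo_utils.py`` — the helper names here are made up for illustration):

```python
# losses[i][t] is the per-token loss; mask[i][t] marks valid (non-padding) tokens.
losses = [[2.0, 4.0, 0.0], [6.0, 0.0, 0.0]]
mask = [[1.0, 1.0, 0.0], [1.0, 0.0, 0.0]]

def token_mean(losses, mask):
    # One global average over all valid tokens (DAPO-style token-level loss).
    total = sum(l * m for row_l, row_m in zip(losses, mask) for l, m in zip(row_l, row_m))
    count = sum(m for row in mask for m in row)
    return total / count

def sequence_mean(losses, mask):
    # Average tokens within each sequence first, then average over sequences.
    per_seq = [
        sum(l * m for l, m in zip(row_l, row_m)) / sum(row_m)
        for row_l, row_m in zip(losses, mask)
    ]
    return sum(per_seq) / len(per_seq)

print(token_mean(losses, mask))     # (2 + 4 + 6) / 3 = 4.0
print(sequence_mean(losses, mask))  # ((2 + 4)/2 + 6/1) / 2 = 4.5
```

Note how ``token_mean`` weights long sequences more heavily (they contribute more tokens), while ``sequence_mean`` gives each sequence equal weight regardless of length.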

skyrl-train/skyrl_train/config/ppo_base_config.yaml

Lines changed: 1 addition & 1 deletion

@@ -88,7 +88,7 @@ trainer:
 # this adds training batch level normalization to advantages
 advantage_batch_normalize: false
 value_head_prefix: "value_head"
-policy_loss_type: "regular" # "regular", "dual_clip", or customizable with PolicyLossRegistry
+policy_loss_type: "regular" # "regular", "dual_clip", "gspo", or customizable with PolicyLossRegistry
 loss_reduction: "token_mean" # "token_mean", "sequence_mean"
 # GAE parameters
 lambd: 1.0

skyrl-train/skyrl_train/utils/ppo_utils.py

Lines changed: 60 additions & 0 deletions

@@ -397,6 +397,7 @@ class AdvantageEstimatorRegistry(BaseFunctionRegistry):
 class PolicyLossType(StrEnum):
     REGULAR = "regular"
     DUAL_CLIP = "dual_clip"
+    GSPO = "gspo"


 class PolicyLossRegistry(BaseFunctionRegistry):
@@ -483,6 +484,65 @@ def ppo_policy_loss(
     return loss, clip_ratio


+@register_policy_loss(PolicyLossType.GSPO)
+def gspo_policy_loss(
+    log_probs: torch.Tensor,
+    old_log_probs: torch.Tensor,
+    advantages: torch.Tensor,
+    config: DictConfig,
+    loss_mask: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, float]:
+    """
+    GSPO (Group Sequence Policy Optimization) policy loss function,
+    as proposed in https://arxiv.org/abs/2507.18071.
+
+    This implements sequence-level importance sampling instead of token-level importance sampling.
+    The key difference is that importance weights are computed at the sequence level and then
+    applied uniformly across all tokens in the sequence. This can lead to more stable training
+    dynamics by reducing the variance in clipping behavior within sequences.
+
+    The variant of GSPO used here is GSPO-token, a generalization which allows for token-level
+    advantages [equations 14 and 15 in the paper].
+    """
+    # GSPO is expected to use sequence_mean reduction; warn otherwise
+    loss_reduction = config.loss_reduction
+    if loss_reduction != "sequence_mean":
+        # The GSPO paper uses sequence_mean reduction; there's no reason
+        # why a user couldn't use token_mean reduction, but
+        # it's not clear whether it would be stable or not.
+        from loguru import logger as logger_  # lazy import to avoid a pickling error
+
+        logger_.warning(f"With GSPO it's recommended to use 'sequence_mean' loss reduction; got {loss_reduction}")
+
+    # Compute log ratios
+    log_ratio = log_probs - old_log_probs
+
+    # Key GSPO innovation: sequence-level importance sampling.
+    # Instead of using per-token ratios, compute sequence-averaged ratios.
+    log_importance_weights = masked_mean(log_ratio, loss_mask, dim=-1).unsqueeze(-1)
+
+    # s_i,t(θ) = sg[s_i(θ)] · π_θ(y_i,t|x, y_i,<t) / sg[π_θ(y_i,t|x, y_i,<t)]
+    # In log space: log(s_i,t(θ)) = sg[log(s_i(θ))] + log_probs - sg[log_probs]
+    # note: we put the addition at the end to avoid precision issues,
+    # per https://github.com/volcengine/verl/pull/2775#discussion_r2241500280
+    log_token_importance_weights = log_probs - log_probs.detach() + log_importance_weights.detach()
+    # clip to avoid overflow
+    log_token_importance_weights = torch.clamp(log_token_importance_weights, max=10)
+    ratio = torch.exp(log_token_importance_weights)
+
+    # Standard PPO surrogate objective with sequence-level importance weights
+    surr1 = ratio * advantages
+    surr2 = ratio.clamp(1 - config.eps_clip_low, 1 + config.eps_clip_high) * advantages
+    loss = -torch.min(surr1, surr2)
+
+    # Compute clipping ratio for monitoring
+    clip_ratio = masked_mean((-surr2 > -surr1).float(), loss_mask).mean().detach().item()
+
+    loss = reduce_loss(loss, loss_mask, loss_reduction)
+
+    return loss, clip_ratio
+
+
 def reduce_loss(
     loss: torch.Tensor, loss_mask: Optional[torch.Tensor], loss_reduction: Literal["token_mean", "sequence_mean"]
 ) -> torch.Tensor:
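
The sequence-level weighting that `gspo_policy_loss` computes can be illustrated without torch. This toy sketch (illustrative numbers; the stop-gradient trick is omitted because it only changes gradients, not forward values) shows the arithmetic:

```python
import math

# Toy log-probabilities for one sequence of 3 valid tokens (illustrative only).
log_probs = [-0.5, -1.5, -1.0]
old_log_probs = [-1.0, -1.0, -1.0]

# Token-level ratios, as in vanilla PPO/GRPO: one independent ratio per token.
token_ratios = [math.exp(lp - olp) for lp, olp in zip(log_probs, old_log_probs)]

# GSPO: average the log-ratios over the sequence, exponentiate once, and share
# that single weight across every token of the sequence.
mean_log_ratio = sum(lp - olp for lp, olp in zip(log_probs, old_log_probs)) / len(log_probs)
seq_ratio = math.exp(mean_log_ratio)
gspo_ratios = [seq_ratio] * len(log_probs)

print([round(r, 3) for r in token_ratios])  # [1.649, 0.607, 1.0] — varies token to token
print(gspo_ratios)                          # [1.0, 1.0, 1.0] — uniform within the sequence
```

Because every token shares one weight, a single outlier token can no longer trigger clipping on its own; the whole sequence is either inside or outside the trust region together.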

skyrl-train/tests/cpu/algorithms/test_losses.py

Lines changed: 152 additions & 0 deletions

@@ -8,6 +8,7 @@
 import torch
 from omegaconf import DictConfig
 from skyrl_train.utils.ppo_utils import PolicyLossRegistry
+from skyrl_train.utils import masked_mean


 # Adapted a good test from NeMO-RL
@@ -214,3 +215,154 @@ def test_policy_loss_reduction_edge_cases():
     # Should handle zero mask gracefully (due to +1e-8 in denominator)
     assert torch.isfinite(loss_token_masked)
     assert torch.isfinite(loss_seq_masked)
+
+
+def test_gspo_importance_sampling_levels():
+    """Tests GSPO policy loss function with sequence-level importance sampling.
+
+    This test focuses on GSPO's key benefit: stabilizing clipping behavior through sequence-level
+    importance sampling, which should lead to more consistent training dynamics compared to
+    token-level importance sampling in standard PPO.
+    """
+
+    device = "cpu"
+
+    clip_eps_low = 0.2
+    clip_eps_high = 0.2
+
+    # Create test data with varied sequence lengths and extreme ratios to test clipping stability
+    # GSPO's benefit is most apparent with sequences of different lengths and high variance
+    advantages = torch.tensor(
+        [
+            [1.5, 2.0, 1.0, 0.8, 0.5, 0.0, 0.0, 0.0],  # long sequence: 5 valid tokens
+            [3.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],  # short sequence: 2 valid tokens
+            [0.5, 0.8, 1.2, 2.5, 0.0, 0.0, 0.0, 0.0],  # medium sequence: 4 valid tokens
+        ],
+        device=device,
+    )
+
+    old_log_probs = torch.tensor(
+        [
+            [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0],
+            [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0],
+            [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0],
+        ],
+        device=device,
+    )
+
+    # Create extreme log probability ratios to trigger significant clipping
+    # This tests GSPO's stability benefits under conditions that would cause unstable clipping
+    log_probs = torch.tensor(
+        [
+            [0.2, -2.5, -0.3, 0.1, -1.8, -1.0, -1.0, -1.0],  # high variance within sequence
+            [0.8, -0.2, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0],  # extreme ratios (exp(1.8)≈6.0, exp(0.8)≈2.2)
+            [-0.5, 0.3, -1.7, 0.4, -1.0, -1.0, -1.0, -1.0],  # mixed extreme values
+        ],
+        device=device,
+    )
+
+    # Create masks for different sequence lengths (key for testing length normalization)
+    loss_mask = torch.tensor(
+        [
+            [1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0],  # 5 tokens
+            [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],  # 2 tokens
+            [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],  # 4 tokens
+        ],
+        device=device,
+    )
+
+    # Test standard PPO (token-level importance sampling)
+    ppo_config = DictConfig(
+        {
+            "eps_clip_low": clip_eps_low,
+            "eps_clip_high": clip_eps_high,
+            "clip_ratio_c": 3.0,
+            "policy_loss_type": "regular",
+            "loss_reduction": "token_mean",
+        }
+    )
+    ppo_loss_fn = PolicyLossRegistry.get("regular")
+    loss_token, _ = ppo_loss_fn(log_probs, old_log_probs, advantages, ppo_config, loss_mask)
+
+    # Test GSPO (sequence-level importance sampling)
+    gspo_config = DictConfig(
+        {
+            "eps_clip_low": clip_eps_low,
+            "eps_clip_high": clip_eps_high,
+            "clip_ratio_c": 3.0,
+            "policy_loss_type": "gspo",
+            "loss_reduction": "sequence_mean",  # GSPO recommended reduction
+        }
+    )
+    gspo_loss_fn = PolicyLossRegistry.get("gspo")
+    loss_sequence, _ = gspo_loss_fn(log_probs, old_log_probs, advantages, gspo_config, loss_mask)
+
+    # Manual calculation for token-level (standard PPO)
+    log_ratio = log_probs - old_log_probs
+    ratio_token = log_ratio.exp()
+    surr1_token = ratio_token * advantages
+    surr2_token = ratio_token.clamp(1 - clip_eps_low, 1 + clip_eps_high) * advantages
+    loss_per_token_token = -torch.min(surr1_token, surr2_token)
+    expected_token = (loss_per_token_token * loss_mask).sum() / (loss_mask.sum() + 1e-8)
+
+    # Calculate token-level clipping ratio
+    is_clipped_token = (-surr2_token > -surr1_token) & (loss_mask.bool())
+    clip_ratio_token = is_clipped_token.float().sum() / loss_mask.sum()
+
+    # Manual calculation for sequence-level (GSPO)
+    # First compute sequence-level importance weights (key GSPO innovation)
+    log_importance_weights_seq = masked_mean(log_ratio, loss_mask, dim=-1).unsqueeze(-1)
+
+    # GSPO uses stop gradients: s_i,t(θ) = sg[s_i(θ)] · π_θ(y_i,t|x, y_i,<t) / sg[π_θ(y_i,t|x, y_i,<t)]
+    # In log space: log(s_i,t(θ)) = sg[log(s_i(θ))] + log_probs - sg[log_probs]
+    ratio_sequence = torch.exp(log_importance_weights_seq.detach() + log_probs - log_probs.detach())
+    surr1_sequence = ratio_sequence * advantages
+    surr2_sequence = ratio_sequence.clamp(1 - clip_eps_low, 1 + clip_eps_high) * advantages
+    loss_per_token_sequence = -torch.min(surr1_sequence, surr2_sequence)
+    # GSPO uses sequence_mean reduction
+    expected_sequence = masked_mean(loss_per_token_sequence, loss_mask, dim=-1).mean()
+
+    # Calculate sequence-level clipping ratio
+    is_clipped_sequence = (-surr2_sequence > -surr1_sequence) & (loss_mask.bool())
+    clip_ratio_sequence = is_clipped_sequence.float().sum() / loss_mask.sum()
+
+    # Verify loss calculations
+    torch.testing.assert_close(loss_token, expected_token, rtol=1e-5, atol=1e-8)
+    torch.testing.assert_close(loss_sequence, expected_sequence, rtol=1e-5, atol=1e-8)
+
+    # Core GSPO benefit test: Different clipping behavior
+    # GSPO should produce different clipping patterns due to sequence-level importance sampling
+    assert not torch.allclose(
+        clip_ratio_token, clip_ratio_sequence, rtol=1e-2
+    ), f"Clipping ratios should differ: token={clip_ratio_token:.4f} vs sequence={clip_ratio_sequence:.4f}"
+
+    # Test stability: sequence-level should smooth out extreme per-token variations
+    # Check that sequence-level ratios have lower variance within each sequence
+    token_ratio_variance = torch.var(ratio_token * loss_mask, dim=-1).mean()
+    sequence_ratio_variance = torch.var(ratio_sequence * loss_mask, dim=-1).mean()
+
+    # The key insight: GSPO should reduce within-sequence variance by using sequence-averaged ratios
+    assert sequence_ratio_variance < token_ratio_variance, (
+        f"GSPO should reduce ratio variance: sequence={sequence_ratio_variance:.4f} < "
+        f"token={token_ratio_variance:.4f}"
+    )
+
+    # Token-level and sequence-level should give different results due to different importance weighting
+    assert not torch.allclose(
+        loss_token, loss_sequence, rtol=1e-3
+    ), f"Loss values should differ: token={loss_token:.6f} vs sequence={loss_sequence:.6f}"
+
+    # Test length normalization effect: sequences with different lengths should be handled more uniformly
+    # This is a key stability benefit of GSPO mentioned in the paper
+    seq_lengths = loss_mask.sum(dim=-1)  # [5, 2, 4]
+
+    # In GSPO, the sequence-level importance weights should be the same across all tokens in a sequence
+    # This should make the treatment more uniform across different sequence lengths
+    for seq_idx in range(log_importance_weights_seq.shape[0]):
+        seq_len = int(seq_lengths[seq_idx])
+        if seq_len > 1:
+            # All importance weights within a sequence should be identical (GSPO property)
+            seq_weights = log_importance_weights_seq[seq_idx, :seq_len]
+            assert torch.allclose(
+                seq_weights, seq_weights[0], rtol=1e-6
+            ), f"GSPO should have uniform importance weights within sequence {seq_idx}"
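
For reference, the clipping logic shared by both surrogate objectives (and mirrored in the test's monitoring ratio) reduces to a few lines. This is a hypothetical scalar sketch, not the tensorized SkyRL code:

```python
def clipped_objective(ratio, advantage, eps_low=0.2, eps_high=0.2):
    """PPO-style clipped surrogate for a single token (scalar sketch)."""
    surr1 = ratio * advantage
    # Clamp the importance ratio to [1 - eps_low, 1 + eps_high] before weighting.
    clamped = min(max(ratio, 1 - eps_low), 1 + eps_high)
    surr2 = clamped * advantage
    # Pessimistic objective: the loss is the negative of the smaller surrogate.
    loss = -min(surr1, surr2)
    # A token counts as "clipped" when the clamped branch is the binding one.
    clipped = (-surr2) > (-surr1)
    return loss, clipped

# An extreme ratio (e.g. exp(1.8) ≈ 6.0, as in the test data) gets clipped hard:
print(clipped_objective(6.0, 1.0))  # (-1.2, True): capped at (1 + eps_high) * advantage
print(clipped_objective(1.0, 1.0))  # (-1.0, False): inside the trust region
```

Under GSPO, `ratio` is the shared sequence-level weight, so `clipped` flips for all tokens of a sequence at once rather than token by token.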
