mini-grpo/cli.py at master · RobotSail/mini-grpo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import requests
from transformers import GenerationConfig
import json
import random
from typer import Typer
import typer
import re
import pydantic
import datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Qwen2Tokenizer,
    PreTrainedModel,
)
import torch
from torch.optim import AdamW
import os
from IPython import embed
from tqdm import tqdm

try:
    import wandb

    WANDB_AVAILABLE = True
except ImportError:
    WANDB_AVAILABLE = False
    wandb = None

from instructlab.training.data_process import (
    configure_tokenizer,
)

from data_utils import (
    generate_dataset,
    dataset_from_groups,
    create_grpo_data_loader,
    load_gsm8k,
    split_batch_into_microbatches,
)
from utils import preview_tokenization, display_scorecard
from optimizers import create_optimizer, create_fsdp2_muon_optimizer
from type_defs import (
    Problem,
    SamplingParams,
    TokenSample,
    RolloutResult,
    Sample,
    TrainingComponents,
    Hyperparameters,
)


# Regex pattern to match <answer>...</answer> tags
answer_pattern = re.compile(r"<answer>(.*?)</answer>", re.DOTALL | re.IGNORECASE)


app = Typer()


def send_chat_completion(
    prompt: str,
    system_prompt: str,
    model: str = "qwen/Qwen2-1.5B-Instruct",
    base_url: str = "http://localhost:8000/v1",
    temperature: float = 0.7,
    max_tokens: int = 512,
):
    """Send a chat completion request to vLLM server."""
    url = f"{base_url}/chat/completions"

    headers = {"Content-Type": "application/json"}

    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    response = requests.post(url, headers=headers, json=data)
    response.raise_for_status()

    return response.json()


def parse_number(text: str) -> float:
    """
    Parse a string into a float, handling common formats from GSM8K answers.

    Handles:
    - Whitespace (leading/trailing/internal)
    - Percentage signs (42% -> 42.0)
    - Currency symbols ($100, EUR50, etc.)
    - Comma separators (1,000,000 -> 1000000)
    - Negative numbers (-42, negative prefix)
    - Decimal numbers (3.14)

    Returns: float
    Raises: ValueError if no valid number can be parsed
    """
    if not text or not isinstance(text, str):
        raise ValueError(f"Empty or invalid input: {text}")

    # Strip whitespace
    text = text.strip()

    # Remove currency symbols ($, EUR, GBP, JPY, etc.)
    text = re.sub(r"[$\u20AC\u00A3\u00A5\u20B9]", "", text)

    # Remove percentage sign (keep the number)
    text = text.replace("%", "")

    # Remove commas (thousand separators)
    text = text.replace(",", "")

    # Strip remaining whitespace after removals
    text = text.strip()

    # Check for digits
    if not any(c.isdigit() for c in text):
        raise ValueError(f"No digits found in answer: {text}")

    # Extract the numeric portion (handles cases like "42 dollars" -> "42")
    match = re.search(r"-?\d+\.?\d*", text)
    if not match:
        raise ValueError(f"Could not extract number from: {text}")

    return float(match.group())


@app.command()
def generate_data(
    # system_msg: str,
    system_msg="You are a helpful math assistant. Always provide your final numerical answer inside of the <answer>...</answer> tags, e.g.: <answer>42</answer>",
    num_problems: int = 20,
    min_num: int = -100,
    max_num: int = 100,
    seed: int = 42,
    model_name: str = "qwen/Qwen2-1.5B-Instruct",
    output_dir: str = "generated_data",
    test_split: float = 0.0,
    max_seq_len: int = 8192,
):
    # this is the dataset
    dataset: datasets.Dataset = generate_dataset(
        system_msg=system_msg,
        seed=seed,
        num_problems=num_problems,
        min_num=min_num,
        max_num=max_num,
    )
    if test_split > 0:
        dataset_dict = dataset.train_test_split(test_split)
        train, test = dataset_dict["train"], dataset_dict["test"]
    else:
        train = dataset
        test = None
    os.makedirs(output_dir, exist_ok=True)

    # write out training data
    train_path = os.path.join(output_dir, "train.jsonl")
    train.to_json(train_path)
    typer.secho(
        f"✓ Generated {len(train)} training examples",
        fg=typer.colors.GREEN,
    )
    typer.secho(
        f"✓ Saved training data to '{train_path}'",
        fg=typer.colors.BLUE,
    )

    # write out test data if it exists
    if test:
        test_path = os.path.join(output_dir, "test.jsonl")
        test.to_json(test_path)
        typer.secho(
            f"✓ Generated {len(test)} test examples",
            fg=typer.colors.GREEN,
        )
        typer.secho(
            f"✓ Saved test data to '{test_path}'",
            fg=typer.colors.BLUE,
        )


@app.command()
def generate_gsm8k(
    system_msg: str = typer.Option(
        "You are a helpful math assistant. Always provide your final numerical answer inside of the <answer>...</answer> tags, e.g.: <answer>42</answer>",
        "--system-msg",
        help="System message to use for the chat format",
    ),
    seed: int = typer.Option(67, help="Random seed for train/test split"),
    output_dir: str = typer.Option("generated_data", help="Directory to save the dataset"),
    test_split: float = typer.Option(0.0, help="Fraction of data to use for test set"),
):
    """Load GSM8K dataset and save it in the format expected by this repo."""
    train_dataset, test_dataset = load_gsm8k(
        system_msg=system_msg,
        eval_split=test_split,
        seed=seed,
    )

    os.makedirs(output_dir, exist_ok=True)

    # Write out training data
    train_path = os.path.join(output_dir, "gsm8k_train.jsonl")
    train_dataset.to_json(train_path)
    typer.secho(
        f"✓ Generated {len(train_dataset)} training examples from GSM8K",
        fg=typer.colors.GREEN,
    )
    typer.secho(
        f"✓ Saved training data to '{train_path}'",
        fg=typer.colors.BLUE,
    )

    # Write out test data if it exists
    if test_dataset:
        test_path = os.path.join(output_dir, "gsm8k_test.jsonl")
        test_dataset.to_json(test_path)
        typer.secho(
            f"✓ Generated {len(test_dataset)} test examples from GSM8K",
            fg=typer.colors.GREEN,
        )
        typer.secho(
            f"✓ Saved test data to '{test_path}'",
            fg=typer.colors.BLUE,
        )


def _clean_calculator_annotations(text: str) -> str:
    """Remove <<a op b=c>> calculator annotation patterns from GSM8K answers."""
    return re.sub(r"<<[^>]+>>", "", text)


def _reformat_to_answer_tags(answer: str) -> str:
    """Replace GSM8K's '#### <ans>' format with '<answer>{ans}</answer>' tags."""
    pattern = r"####\s*(.+)$"
    match = re.search(pattern, answer, re.MULTILINE)
    if match:
        final_ans = match.group(1).strip()
        return re.sub(pattern, f"<answer>{final_ans}</answer>", answer, flags=re.MULTILINE)
    return answer


def _create_sft_message(question: str, answer: str, system_msg: str) -> dict:
    """Create a single SFT sample in messages format."""
    cleaned = _clean_calculator_annotations(answer)
    reformatted = _reformat_to_answer_tags(cleaned)
    return {
        "messages": [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": question},
            {"role": "assistant", "content": reformatted},
        ]
    }


@app.command()
def generate_sft_gsm8k(
    system_msg: str = typer.Option(
        "You are a helpful math assistant. Always provide your final numerical answer inside of the <answer>...</answer> tags, e.g.: <answer>42</answer>",
        "--system-msg",
        help="System message to use for the chat format",
    ),
    seed: int = typer.Option(67, help="Random seed for train/test split"),
    output_dir: str = typer.Option("generated_data", help="Directory to save the dataset"),
    test_split: float = typer.Option(0.0, help="Fraction of data to use for test set"),
):
    """
    Generate SFT training data from GSM8K in messages format.

    This command processes GSM8K to create SFT-ready data by:
    - Removing calculator annotations (<<a+b=c>>)
    - Converting #### answers to <answer>X</answer> format
    - Formatting as chat messages with system/user/assistant roles
    """
    # load GSM8K
    gsm8k = datasets.load_dataset("openai/gsm8k", "main", split="train")
    typer.secho(f"✓ Loaded {len(gsm8k)} samples from GSM8K", fg=typer.colors.GREEN)

    # process into SFT format
    sft_samples = []
    for i in range(len(gsm8k)):
        sample = _create_sft_message(
            question=gsm8k["question"][i],
            answer=gsm8k["answer"][i],
            system_msg=system_msg,
        )
        sft_samples.append(sample)

    # convert to HF dataset for easy splitting and saving
    sft_dataset = datasets.Dataset.from_list(sft_samples)

    # handle train/test split
    if test_split > 0:
        split_data = sft_dataset.train_test_split(test_size=test_split, seed=seed)
        train_dataset = split_data["train"]
        test_dataset = split_data["test"]
    else:
        train_dataset = sft_dataset
        test_dataset = None

    os.makedirs(output_dir, exist_ok=True)

    # write training data
    train_path = os.path.join(output_dir, "gsm8k_sft_train.jsonl")
    train_dataset.to_json(train_path)
    typer.secho(
        f"✓ Generated {len(train_dataset)} SFT training examples",
        fg=typer.colors.GREEN,
    )
    typer.secho(f"✓ Saved to '{train_path}'", fg=typer.colors.BLUE)

    # write test data if split was requested
    if test_dataset:
        test_path = os.path.join(output_dir, "gsm8k_sft_test.jsonl")
        test_dataset.to_json(test_path)
        typer.secho(
            f"✓ Generated {len(test_dataset)} SFT test examples",
            fg=typer.colors.GREEN,
        )
        typer.secho(f"✓ Saved to '{test_path}'", fg=typer.colors.BLUE)

    # show a sample for verification
    typer.secho("\n--- Sample Output ---", fg=typer.colors.BRIGHT_CYAN)
    sample = sft_samples[0]
    for msg in sample["messages"]:
        role = msg["role"].upper()
        content = msg["content"][:150] + "..." if len(msg["content"]) > 150 else msg["content"]
        typer.secho(f"[{role}]: {content}", fg=typer.colors.WHITE)


@torch.no_grad
def generate_rollouts(
    model: PreTrainedModel,
    tokenizer: AutoTokenizer,
    batch: dict[str, list[any]],
    batch_size: int,
    group_size: int,
    sampling_params: SamplingParams,
    show_tqdm=False,
) -> list[Sample]:
    model.eval()
    device = next(p.device for p in model.parameters())
    # here we need to create a set of rollouts for each prompt
    groups: list[Sample] = []

    iterator = range(batch_size)
    if show_tqdm:
        iterator = tqdm(
            iterator,
            desc="Generating rollouts",
            leave=False,  # Don't leave the bar after completion
            position=1,  # Nested position to avoid conflicts with outer bar
        )

    for i in iterator:
        # TODO: optimize this
        # Preview the messages for this batch item
        # if i == 0:  # Only preview the first item to avoid clutter
        #     typer.secho(f"\n[Batch {i}] Messages:", fg=typer.colors.BRIGHT_CYAN)
        #     for msg in batch["messages"][i]:
        #         typer.secho(
        #             f"  [{msg['role']}]: {msg['content']}", fg=typer.colors.CYAN
        #         )
        input_ids = tokenizer.apply_chat_template(
            conversation=batch["messages"][i],
            return_tensors="pt",
            add_generation_prompt=True,
        ).to(device=device)

        # now we sample
        outputs = model.generate(
            input_ids,
            attention_mask=torch.ones_like(input_ids),
            max_new_tokens=sampling_params.max_new_tokens,
            num_return_sequences=group_size,
            do_sample=True,
            temperature=sampling_params.temperature,
            top_k=sampling_params.top_k,
            top_p=sampling_params.top_p,
            repetition_penalty=sampling_params.repetition_penalty,
            # output_logits=True,
            output_scores=True,
            return_dict_in_generate=True,
        )
        input_len = input_ids.numel()
        new_tokens = outputs.sequences[:, input_len:]

        # for each sample in the batch we append the generated responses as they're parsed back from the model
        # this should align with the rollout ordering that we get from the batch
        # TODO: vectorize logprob gathering

        # embed()

        # we recollect the sample by combining across the column dimension
        seed_sample = {k: v[i] for k, v in batch.items()}
        rollout_data: list[RolloutResult] = []
        problem = Problem(
            answer=seed_sample["answer"],
            operation=seed_sample["operation"],
            problem=seed_sample["problem"],
        )

        # go through each sequence and grab the respective logprob
        # TODO: optimize this part
        for seq_idx, seq in enumerate(new_tokens.tolist()):
            logprobs: list[TokenSample] = []

            # stop processing after the model generated EOS token
            try:
                seq_end = seq.index(tokenizer.eos_token_id) + 1
            except ValueError:
                # fallback to full sequence
                seq_end = len(seq)

            # next, we just need to select the probs for our specific tokens
            # Cast to FP32 for precise log_softmax over large vocab (model forward stays in BF16)
            processed_logits = torch.stack([t[seq_idx] for t in outputs.scores[:seq_end]])
            processed_logits_f32 = processed_logits.float()
            ref_logprobs = processed_logits_f32.log_softmax(dim=-1)
            index = torch.tensor(seq[:seq_end], dtype=torch.long, device=processed_logits.device)
            index = index.unsqueeze(-1)  # extend from (T,) into (T, 1)
            probs = ref_logprobs.gather(dim=-1, index=index)
            probs = probs.squeeze(-1)  # (T, 1) --> (T,)
            index = index.squeeze(-1)  # (T, 1) --> (T,)

            for tok, prob in zip(index.tolist(), probs.tolist()):
                logprobs.append(
                    TokenSample(
                        token=tok,
                        logprob=prob,
                    )
                )

            # here we append the rollout data
            policy_response = tokenizer.decode(new_tokens[seq_idx], skip_special_tokens=True)
            rollout_data.append(
                RolloutResult(
                    logprobs=logprobs,
                    response=policy_response,
                    seed_messages=seed_sample["messages"],
                )
            )

        assert input_ids.ndim > 1
        groups.append(
            Sample(
                problem=problem,
                rollouts=rollout_data,
                input_ids=input_ids.tolist()[0],  # record the input ids so we can reuse them later
            )
        )

    for group in groups:
        grade_groups(group)
        calculate_advantage(group)

    # empty cache
    torch.cuda.empty_cache()
    return groups


@torch.no_grad
def grade_groups(group: Sample):
    """
    Given a batch of samples, calculates the advantage for each one.
    Modifies objects in place.

    Grading rules:
    - Use the LAST <answer>...</answer> tag if multiple present (final answer after reasoning)
    - +0.1 reward for parsable format
    - +1.0 reward for correct answer
    """
    for rollout in group.rollouts:
        # Defaults
        rollout.is_parsable = False
        rollout.is_correct = False
        rollout.reward = 0

        # Find all answer tags
        matches = answer_pattern.findall(rollout.response)

        if not matches:
            # No answer tags found - no reward
            continue

        # Take the LAST answer (final answer after reasoning)
        last_match = matches[-1]

        try:
            parsed_answer = parse_number(last_match)
            rollout.is_parsable = True

            # Format reward for proper answer structure
            rollout.reward += 0.1

            # Check correctness with tolerance for floating point comparison
            expected = float(group.problem.answer)
            if abs(parsed_answer - expected) < 1e-6:
                rollout.is_correct = True
                rollout.reward += 1.0

        except ValueError:
            # Could not parse the last answer - no parsable reward
            pass


def print_example_rollout(samples: list[Sample], step: int = 0):
    """Print batch statistics and example rollouts after generation."""
    if not samples:
        return

    # Calculate batch statistics
    total_rollouts = sum(len(s.rollouts) for s in samples)
    total_rewards = sum(r.reward for s in samples for r in s.rollouts)
    parsable_count = sum(1 for s in samples for r in s.rollouts if r.is_parsable)
    correct_count = sum(1 for s in samples for r in s.rollouts if r.is_correct)

    avg_reward = total_rewards / total_rollouts if total_rollouts > 0 else 0.0
    parsable_rate = parsable_count / total_rollouts if total_rollouts > 0 else 0.0
    correct_rate = correct_count / total_rollouts if total_rollouts > 0 else 0.0

    # Print batch statistics
    typer.secho(f"\n{'=' * 70}", fg=typer.colors.BRIGHT_MAGENTA)
    typer.secho(f"  ROLLOUT SUMMARY (Step {step})", fg=typer.colors.BRIGHT_MAGENTA, bold=True)
    typer.secho(f"{'=' * 70}", fg=typer.colors.BRIGHT_MAGENTA)

    typer.secho(f"\n[BATCH STATISTICS]:", fg=typer.colors.BRIGHT_CYAN)
    typer.secho(
        f"  Prompts: {len(samples)} | Rollouts: {total_rollouts} | Rollouts/Prompt: {total_rollouts // len(samples)}",
        fg=typer.colors.WHITE,
    )
    typer.secho(
        f"  Parsable: {parsable_count}/{total_rollouts} ({parsable_rate:.1%})",
        fg=typer.colors.GREEN if parsable_rate > 0.5 else typer.colors.YELLOW,
    )
    typer.secho(
        f"  Correct:  {correct_count}/{total_rollouts} ({correct_rate:.1%})",
        fg=typer.colors.GREEN if correct_rate > 0.3 else typer.colors.YELLOW,
    )
    typer.secho(f"  Avg Reward: {avg_reward:.4f}", fg=typer.colors.CYAN)

    # Find one correct and one incorrect example for comparison
    correct_example = None
    incorrect_example = None

    for sample in samples:
        for rollout in sample.rollouts:
            if rollout.is_correct and correct_example is None:
                correct_example = (sample, rollout)
            elif not rollout.is_correct and incorrect_example is None:
                incorrect_example = (sample, rollout)
            if correct_example and incorrect_example:
                break
        if correct_example and incorrect_example:
            break

    # Print examples
    examples_to_print = []
    if correct_example:
        examples_to_print.append(("CORRECT", correct_example, typer.colors.GREEN))
    if incorrect_example:
        examples_to_print.append(("INCORRECT", incorrect_example, typer.colors.RED))

    # Fallback: if no correct/incorrect distinction, just show first rollout
    if not examples_to_print and samples and samples[0].rollouts:
        examples_to_print.append(("EXAMPLE", (samples[0], samples[0].rollouts[0]), typer.colors.WHITE))

    for label, (sample, rollout), color in examples_to_print:
        typer.secho(f"\n[{label} ROLLOUT]:", fg=color, bold=True)

        # Print the user prompt (skip system message for brevity)
        user_msg = next((m for m in rollout.seed_messages if m.role == "user"), None)
        if user_msg:
            prompt_preview = user_msg.content[:150] + ("..." if len(user_msg.content) > 150 else "")
            typer.secho(f"  Prompt: {prompt_preview}", fg=typer.colors.YELLOW)

        # Print the response (truncated)
        response_preview = rollout.response[:400] + ("..." if len(rollout.response) > 400 else "")
        typer.secho(f"  Response: {response_preview}", fg=typer.colors.WHITE)

        # Print grading
        typer.secho(
            f"  Expected: {sample.problem.answer} | Parsable: {rollout.is_parsable} | Correct: {rollout.is_correct} | Reward: {rollout.reward:.2f}",
            fg=color,
        )

    typer.secho(f"{'=' * 70}\n", fg=typer.colors.BRIGHT_MAGENTA)


# i dont think we even have tensors flowing through this function but you
# can never be too sure.
@torch.no_grad
def calculate_advantage(group: Sample):
    r"""
    This is the fun part, we have to implement the GRPO-style
    advantage calculation. Basically we take each set of rollouts as a single
    group and we calculate a group-level advantage as a workaround for
    not being able to calculate RTG or step-level advantage as in vanilla REINFORCE.

    Formula looks like this:

    $$
    A_i = \frac{r_i - \mean(r)}{\std(r) + \epsilon}
    $$
    """
    eps = 1e-8
    avg = sum(r.reward for r in group.rollouts) / len(group.rollouts)
    var = sum((r.reward - avg) ** 2 for r in group.rollouts) / len(group.rollouts)
    std = var**0.5

    # if std < eps (because all rewards are equal) we use the std trick
    # of setting group advantage to 0
    enable_std_trick = std < eps

    # GRPO simple advantage with clamping to prevent extreme values
    for rollout in group.rollouts:
        if enable_std_trick:
            rollout.advantage = 0.0
        else:
            adv = (rollout.reward - avg) / (std + eps)
            # Clamp advantages to prevent extreme policy updates
            rollout.advantage = max(-10.0, min(10.0, adv))


@torch.no_grad
def eval_model(
    eval_dataset: datasets.Dataset,
    comps: TrainingComponents,
    return_metrics: bool = False,
) -> dict | None:
    """
    Evaluate model on dataset.

    Args:
        eval_dataset: Dataset to evaluate on
        comps: Training components
        return_metrics: If True, return metrics dict instead of just printing

    Returns:
        If return_metrics=True, returns dict with metrics
    """
    comps.model.eval()

    # we generate all the rollouts
    eval_data = eval_dataset.batch(eval_dataset.num_rows)
    pass_at = [
        1,
    ]  #  3,#  5, 10]
    results = []

    for npass in pass_at:
        samples = generate_rollouts(
            comps.model,
            comps.tokenizer,
            batch=next(iter(eval_data)),
            batch_size=eval_dataset.num_rows,
            group_size=npass,
            sampling_params=comps.sampling_params,
            show_tqdm=True,
        )

        # now we go and determine the passing rate
        percent_scores = []
        for sample in samples:
            passing_rate = sum(1 if r.is_correct else 0 for r in sample.rollouts) / len(sample.rollouts)
            percent_scores.append(passing_rate)
        # Calculate statistics
        percent_above_50 = sum(1 if score > 0.5 else 0 for score in percent_scores) / len(percent_scores) * 100
        percent_at_100 = sum(1 if score == 1.0 else 0 for score in percent_scores) / len(percent_scores) * 100

        results.append((npass, percent_above_50, percent_at_100))

    # Print all results at the end
    typer.secho("\n=== Evaluation Scorecard ===", fg=typer.colors.BRIGHT_MAGENTA)
    typer.secho(f"Total samples evaluated: {len(samples)}", fg=typer.colors.BRIGHT_BLUE)
    for npass, percent_above_50, percent_at_100 in results:
        typer.secho(
            f"Pass@{npass}: {percent_above_50:.1f}% above 50% | {percent_at_100:.1f}% at 100% (across {len(samples)} samples with {npass} rollout(s) each)",
            fg=typer.colors.CYAN,
        )

    if return_metrics and results:
        _, above_50, at_100 = results[0]  # Return first pass@k metrics
        return {
            "above_50": above_50,
            "at_100": at_100,
            "samples": len(samples),
        }
    return None


@app.command()
def eval(
    eval_path: str = typer.Option(..., "--eval-path", help="Path to the evaluation dataset (jsonl)"),
    model_name: str = typer.Option(..., "--model", "-m", help="Model name or path"),
    gpu: int = typer.Option(0, "--gpu", "-g", help="CUDA GPU index to use"),
    max_new_tokens: int = typer.Option(128, help="Maximum number of new tokens to generate"),
    max_seq_len: int = typer.Option(8192, "--msl", "--max-seq-len", help="Maximum sequence length"),
    temperature: float = typer.Option(0.7, "-t", "--temp", help="Sampling temperature"),
    group_size: int = typer.Option(1, "-G", "--group-size", help="Number of rollouts per prompt (for pass@k)"),
):
    """Run evaluation on a dataset without training."""
    device = torch.device("cuda", gpu)

    eval_dataset = datasets.load_dataset("json", data_files=eval_path, split="train")
    typer.secho(f"✓ Loaded {len(eval_dataset)} evaluation samples", fg=typer.colors.GREEN)

    model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if tokenizer.pad_token_id and not model.config.pad_token_id:
        model.config.pad_token_id = tokenizer.pad_token_id

    sampling_params = SamplingParams(
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        max_tokens=max_seq_len,
        top_p=1.0,
        top_k=0.0,
        repetition_penalty=1.0,
    )

    eval_data = eval_dataset.batch(eval_dataset.num_rows)
    samples = generate_rollouts(
        model,
        tokenizer,
        batch=next(iter(eval_data)),
        batch_size=eval_dataset.num_rows,
        group_size=group_size,
        sampling_params=sampling_params,
        show_tqdm=True,
    )

    percent_scores = []
    for sample in samples:
        passing_rate = sum(1 if r.is_correct else 0 for r in sample.rollouts) / len(sample.rollouts)
        percent_scores.append(passing_rate)

    percent_above_50 = sum(1 if score > 0.5 else 0 for score in percent_scores) / len(percent_scores) * 100
    percent_at_100 = sum(1 if score == 1.0 else 0 for score in percent_scores) / len(percent_scores) * 100

    typer.secho("\n=== Evaluation Results ===", fg=typer.colors.BRIGHT_MAGENTA)
    typer.secho(f"Model: {model_name}", fg=typer.colors.BRIGHT_BLUE)
    typer.secho(f"Samples: {len(samples)}", fg=typer.colors.BRIGHT_BLUE)
    typer.secho(
        f"Pass@{group_size}: {percent_above_50:.1f}% above 50% | {percent_at_100:.1f}% at 100%",
        fg=typer.colors.CYAN,
    )


def check_model_health(model: PreTrainedModel) -> bool:
    """Check if model weights contain NaN or Inf values."""
    for name, param in model.named_parameters():
        if torch.isnan(param).any() or torch.isinf(param).any():
            typer.secho(f"WARNING: NaN/Inf detected in model parameter: {name}", fg=typer.colors.RED)
            return False
    return True


def train_policy_on_rollouts(
    samples: list[Sample],
    comps: TrainingComponents,
    use_wandb: bool = False,
    global_step: int = 0,
    use_packed: bool = False,
    max_tokens_per_microbatch: int = 0,
    current_optim_step: int = 0,
    max_steps: int = 0,
) -> tuple[int, bool]:
    """
    Train the policy model on generated rollouts using GRPO.

    Args:
        samples: List of Sample objects containing rollouts
        comps: Training components
        use_wandb: Whether to log to wandb
        global_step: Current global training step
        use_packed: Use padding-free packed sequences (requires Flash Attention 2)
        max_tokens_per_microbatch: Max tokens per microbatch (0 = no limit, process full batch)
        current_optim_step: Current optimizer step count
        max_steps: Maximum optimizer steps (0 = no limit)

    Returns:
        Tuple of (updated_optim_step, should_stop) where should_stop is True if max_steps reached
    """
    comps.model.train()

    # Create dataset from rollouts
    dataset = dataset_from_groups(samples, comps.train_tokenizer)

    # Track optimizer steps
    optim_step = current_optim_step

    # Training loop over inner epochs
    for epoch in range(comps.hyperparams.inner_epochs):
        data_loader = create_grpo_data_loader(dataset, comps, use_packed=use_packed)

        for batch in data_loader:
            # Clear cache at start of each batch
            torch.cuda.empty_cache()

            # Split batch into microbatches if max_tokens specified
            if max_tokens_per_microbatch > 0:
                microbatches = list(split_batch_into_microbatches(batch, max_tokens_per_microbatch))
            else:
                # No splitting - process full batch
                batch["total_tokens_in_batch"] = batch["num_tokens"]
                batch["num_microbatches"] = 1
                microbatches = [batch]

            num_microbatches = len(microbatches)
            accumulated_loss = 0.0
            accumulated_metrics = {"kl_div": 0.0, "importance_ratio": 0.0}
            valid_microbatches = 0

            # Accumulate gradients across microbatches
            for micro_idx, microbatch in enumerate(microbatches):
                if use_packed:
                    grpo_loss, metrics = _train_step_packed(microbatch, comps)
                else:
                    grpo_loss, metrics = _train_step_padded(microbatch, comps)

                # Check for NaN/Inf in loss before backward
                if torch.isnan(grpo_loss) or torch.isinf(grpo_loss):
                    typer.secho(
                        f"WARNING: NaN/Inf loss detected in microbatch {micro_idx + 1}/{num_microbatches}! Skipping. "
                        f"KL: {metrics.get('kl_div', 'N/A')}, IR: {metrics.get('importance_ratio', 'N/A')}",
                        fg=typer.colors.RED,
                    )
                    continue

                # Scale loss by number of microbatches for correct gradient accumulation
                scaled_loss = grpo_loss / num_microbatches
                scaled_loss.backward()

                accumulated_loss += grpo_loss.item()
                accumulated_metrics["kl_div"] += metrics["kl_div"]
                accumulated_metrics["importance_ratio"] += metrics["importance_ratio"]
                valid_microbatches += 1

                # Clear intermediate tensors
                del grpo_loss, scaled_loss
                torch.cuda.empty_cache()

            # Skip optimizer step if no valid microbatches
            if valid_microbatches == 0:
                comps.optimizer.zero_grad()
                continue

            # Average metrics
            avg_loss = accumulated_loss / valid_microbatches
            avg_metrics = {k: v / valid_microbatches for k, v in accumulated_metrics.items()}

            # Check for NaN in gradients before optimizer step
            has_nan_grad = False
            for name, param in comps.model.named_parameters():
                if param.grad is not None and (torch.isnan(param.grad).any() or torch.isinf(param.grad).any()):
                    typer.secho(f"WARNING: NaN/Inf gradient in {name}! Skipping optimizer step.", fg=typer.colors.RED)
                    has_nan_grad = True
                    break

            if has_nan_grad:
                comps.optimizer.zero_grad()
                continue

            # Gradient clipping and optimization
            gradnorm = torch.nn.utils.clip_grad_norm_(comps.model.parameters(), 1.0)
            comps.optimizer.step()
            comps.optimizer.zero_grad()
            optim_step += 1

            # Clear cache after optimizer step
            torch.cuda.empty_cache()

            # Log metrics (including KL divergence)
            kl_div = avg_metrics.get("kl_div", 0.0)
            ir_mean = avg_metrics.get("importance_ratio", 1.0)
            typer.secho(
                f"Inner Epoch {epoch + 1}/{comps.hyperparams.inner_epochs} | "
                f"Step {optim_step} | "
                f"Loss: {avg_loss:.4f} | "
                f"KL: {kl_div:.4f} | "
                f"IR: {ir_mean:.4f} | "
                f"Grad Norm: {gradnorm.item():.4f}",
                fg=typer.colors.YELLOW,
            )

            # Log to wandb if enabled
            if use_wandb and wandb is not None:
                wandb.log(
                    {
                        "train/loss": avg_loss,
                        "train/grad_norm": gradnorm.item(),
                        "train/kl_divergence": avg_metrics["kl_div"],
                        "train/importance_ratio_mean": avg_metrics["importance_ratio"],
                        "train/microbatches": num_microbatches,
                        "train/optim_step": optim_step,
                    },
                    step=optim_step,
                )

            # Check if we've reached max_steps
            if max_steps > 0 and optim_step >= max_steps:
                return optim_step, True

        # Clear cache after each inner epoch
        torch.cuda.empty_cache()

    return optim_step, False


def _train_step_padded(batch: dict, comps: TrainingComponents) -> tuple[torch.Tensor, dict]:
    """Training step for padded (standard) batch format with mixed precision."""
    # Send everything to GPU
    input_ids = batch["input_ids"].to(comps.device)
    advantages = batch["advantages"].to(comps.device)
    old_logprobs = batch["logprobs"].to(comps.device)
    old_logprob_ids = batch["logprob_ids"].to(comps.device)
    rollout_lens = batch["rollout_lens"].to(comps.device)
    attn_mask = batch["attention_mask"].to(comps.device)
    grpo_logit_mask = batch["grpo_mask"].to(comps.device)

    # Forward pass on policy model with autocast (FP32 weights, BF16 forward)
    with torch.cuda.amp.autocast(dtype=torch.bfloat16):
        new_outputs = comps.model(input_ids=input_ids, attention_mask=attn_mask)
        new_logits = new_outputs.logits

    # Temperature scaling
    if comps.sampling_params.temperature > 0:
        new_logits = new_logits / comps.sampling_params.temperature

    # Forward pass on frozen reference model (already in BF16)
    # Compute ref logprobs immediately and discard ref_logits to save memory
    gather_indices = old_logprob_ids.unsqueeze(-1)  # (B, T) -> (B, T, 1)
    with torch.no_grad():
        ref_outputs = comps.ref_model(input_ids, attention_mask=attn_mask)
        ref_logits = ref_outputs.logits
        if comps.sampling_params.temperature > 0:
            ref_logits = ref_logits / comps.sampling_params.temperature

        # Compute ref logprobs in original dtype to avoid massive FP32 allocation
        # logsumexp is numerically stable, cast only the final scalar result
        ref_gathered = ref_logits.gather(dim=-1, index=gather_indices)
        ref_logsumexp = ref_logits.logsumexp(dim=-1, keepdim=True)
        ref_logprobs = (ref_gathered - ref_logsumexp).squeeze(-1).float()
        del ref_logits, ref_outputs
        torch.cuda.empty_cache()

    # Compute policy logprobs in original dtype to avoid massive FP32 allocation
    new_gathered = new_logits.gather(dim=-1, index=gather_indices)
    new_logsumexp = new_logits.logsumexp(dim=-1, keepdim=True)
    new_logprobs = (new_gathered - new_logsumexp).squeeze(-1).float()
    del new_logits, new_gathered, new_logsumexp, new_outputs

    # Importance ratio (keep in FP32 for stability with exp)
    # Clamp log ratio to prevent exp() from exploding/underflowing
    log_ratio = (new_logprobs - old_logprobs.float()).clamp(-20, 20)
    importance_ratio = log_ratio.exp()

    # Clipped surrogate objective
    advantages = advantages.unsqueeze(-1)  # (B,) -> (B, 1)
    unclipped = advantages * importance_ratio
    clipped = advantages * importance_ratio.clamp(1 - comps.hyperparams.eps, 1 + comps.hyperparams.eps)
    clipped_surrogate = torch.minimum(unclipped, clipped)

    # KL penalty with numerical stability
    # Clamp log diff before exp() to prevent overflow
    log_diff = (ref_logprobs - new_logprobs).clamp(-20, 20)
    dkl_approx = log_diff.exp() - log_diff - 1
    # KL should be non-negative; clamp to prevent outliers from dominating
    dkl_approx = dkl_approx.clamp(min=0, max=100)

    # Per-token loss
    per_token_loss = clipped_surrogate - comps.hyperparams.kl_penalty_strength * dkl_approx
    grpo_token_loss = per_token_loss * grpo_logit_mask.float()

    # Sequence-level averaging (clamp rollout_lens to avoid division by zero)
    safe_rollout_lens = rollout_lens.float().clamp(min=1.0)
    grpo_sequence_loss = grpo_token_loss.sum(dim=-1) / safe_rollout_lens
    grpo_loss = -grpo_sequence_loss.mean()

    # Check for NaN in intermediate values for debugging
    metrics = {
        "kl_div": dkl_approx.mean().item() if not torch.isnan(dkl_approx).any() else float("nan"),
        "importance_ratio": importance_ratio.mean().item() if not torch.isnan(importance_ratio).any() else float("nan"),
    }
    return grpo_loss, metrics


def _train_step_packed(batch: dict, comps: TrainingComponents) -> tuple[torch.Tensor, dict]:
    """