Skip to content

训练途中突然崩了,无论是grpo还是reinforce++,出现nan #30

@chuangzhidan

Description

@chuangzhidan

(main_task pid=63313) step:83 - global_seqlen/min:22900.000 - global_seqlen/max:23135.000 - global_seqlen/minmax_diff:235.000 - global_seqlen/balanced_min:23017.000 - global_seqlen/balanced_max:23018.000 - global_seqlen/mean:23017.500 - state_tokens/total:20560.000 - state_tokens/coverage:1.000 - actor/kl_loss:nan - actor/kl_coef:0.001 - actor/entropy_loss:nan - actor/pg_loss:nan - actor/pg_clipfrac:0.000 - actor/ppo_kl:nan - actor/grad_norm:nan - mfu/actor:0.041 - actor/lr:0.000 - critic/score/mean:-1.000 - critic/score/max:-1.000 - critic/score/min:-1.000 - critic/rewards/mean:-1.000 - critic/rewards/max:-1.000 - critic/rewards/min:-1.000 - critic/advantages/mean:0.000 - critic/advantages/max:0.000 - critic/advantages/min:0.000 - critic/returns/mean:-1.000 - critic/returns/max:-1.000 - critic/returns/min:-1.000 - critic/process_rewards/mean:0.000 - critic/process_rewards/max:0.000 - critic/process_rewards/min:0.000 - critic/process_rewards/count:0.000 - critic/format_score/mean:0.000 - critic/format_score/max:0.000 - critic/format_score/min:0.000 - critic/answer_score/mean:0.000 - critic/answer_score/max:0.000 - critic/answer_score/min:0.000 - turns/mean:0.000 - turns/max:0.000 - turns/min:0.000 - response_length/mean:257.000 - response_length/max:257.000 - response_length/min:257.000 - response_length/clip_ratio:1.000 - prompt_length/mean:318.438 - prompt_length/max:338.000 - prompt_length/min:306.000 - prompt_length/clip_ratio:0.000 - timing_s/gen:136.423 - timing_s/ref:10.226 - timing_s/adv:0.110 - timing_s/update_actor:38.082 - timing_s/step:193.018 - timing_per_token_ms/adv:0.002 - timing_per_token_ms/ref:0.222 - timing_per_token_ms/update_actor:0.827 - timing_per_token_ms/gen:6.635
(main_task pid=63313) ACTIVE_TRAJ_NUM: [80, 0]

Image
Image
Image
Image

# Environment for the training run launched further down in this script.
# Values are assigned first and exported together in a single statement.

# Attention backend selection read by vLLM.
VLLM_ATTENTION_BACKEND=XFORMERS

# Model checkpoint; passed to the trainer as actor_rollout_ref.model.path.
BASE_MODEL='/data/Qwen2.5-3B-Instruct'

# Passed to the trainer as trainer.project_name / trainer.experiment_name.
PROJECT_NAME='hotpotqa_qwen2.5-3b-instruct'
EXPERIMENT_NAME=reinforce_plus_plus

# Debugging aids: full Hydra error output and blocking CUDA launches.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 presumably slows training; confirm it is
# only wanted while chasing the NaN.
HYDRA_FULL_ERROR=1
CUDA_LAUNCH_BLOCKING=1

# Two device ids, in line with trainer.n_gpus_per_node=2 in the launch command.
CUDA_VISIBLE_DEVICES=2,3

# Weights & Biases credentials and project.
# NOTE(review): the key below looks redacted (run of backticks); real keys
# should come from the caller's environment rather than live in the script.
WANDB_API_KEY='029a79963667cd```````````````````2de9a9'
WANDB_PROJECT="agent-r1_0331"

export VLLM_ATTENTION_BACKEND BASE_MODEL PROJECT_NAME EXPERIMENT_NAME \
  HYDRA_FULL_ERROR CUDA_LAUNCH_BLOCKING CUDA_VISIBLE_DEVICES \
  WANDB_API_KEY WANDB_PROJECT

# Launch verl's PPO trainer with the reinforce_plus_plus advantage estimator on
# the HotpotQA parquet files under ./data/hotpotqa.
#
# Reads from the environment: BASE_MODEL, PROJECT_NAME, EXPERIMENT_NAME.
# Arguments: anything passed to this script is appended verbatim ("$@") as
#            extra key=value overrides after the ones listed here.
#
# Every override has to be an argument of the one python3 command, so each line
# but the last ends in a backslash. Without the continuations the trainer
# starts with no overrides at all and the shell then tries to run each
# "key=value" line as a separate command.
#
# The trainer.logger value is quoted as a whole so the shell cannot treat the
# bracket expression as a filename pattern; the argument the program receives
# is the same as with the unquoted ['console','wandb'] spelling.
python3 -m verl.trainer.main_ppo \
  algorithm.adv_estimator=reinforce_plus_plus \
  data.train_files=./data/hotpotqa/train.parquet \
  data.val_files=./data/hotpotqa/validation.parquet \
  data.train_batch_size=16 \
  data.max_prompt_length=4096 \
  data.max_response_length=2048 \
  data.max_start_length=4096 \
  data.max_tool_response_length=2048 \
  actor_rollout_ref.model.path="$BASE_MODEL" \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.model.use_remove_padding=True \
  actor_rollout_ref.actor.ppo_mini_batch_size=8 \
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
  actor_rollout_ref.actor.use_kl_loss=True \
  actor_rollout_ref.actor.kl_loss_coef=0.001 \
  actor_rollout_ref.actor.kl_loss_type=low_var_kl \
  actor_rollout_ref.model.enable_gradient_checkpointing=True \
  actor_rollout_ref.actor.fsdp_config.param_offload=False \
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
  actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
  actor_rollout_ref.rollout.n_repeat=5 \
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
  actor_rollout_ref.ref.fsdp_config.param_offload=True \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name="$PROJECT_NAME" \
  trainer.experiment_name="$EXPERIMENT_NAME" \
  trainer.n_gpus_per_node=2 \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.test_freq=10 \
  trainer.total_epochs=1 \
  tool.env='search' "$@"

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions