# Trace every command to stderr so the fully expanded launch line is visible in logs.
set -x

# Exported so that child processes (the trainer started later in this script) inherit it.
# NOTE(review): presumably consumed by vllm-ascend -- confirm against its docs.
export VLLM_ASCEND_ENABLE_NZ=0

# ---- Sequence-length and batch settings ----
MAX_PROMPT_LENGTH=2048
RES_LENGTH=8192
ROLLOUT_BATCH_SIZE=128
PPO_MINI_BATCH_SIZE=32
TRAIN_TEMPERATURE=0.9
GROUP_SIZE=8
NNODES=1

# SP is the divisor for the per-rollout token budget below. It was never
# assigned in this script, so the arithmetic expansion failed with
# "division by 0" and MAX_TOKEN_LEN was left empty. Honour a value coming from
# the environment, otherwise fall back to 1 (no division).
# NOTE(review): SP presumably stands for the sequence-parallel size -- confirm.
SP=${SP:-1}
MAX_TOKEN_LEN=$(((RES_LENGTH + MAX_PROMPT_LENGTH) / SP))

# ---- Model and experiment naming ----
MODEL_PATH=/data/Qwen3-4B-Instruct-2507
PROJECT_NAME="verl_sandbox_code_rl"
# Response length expressed in units of 1024, e.g. 8192 -> "8k".
EXP_NAME_BASE=4B_L$((RES_LENGTH / 1024))k
# Last path component of the model directory; quoted so a path with spaces survives.
MODEL_NAME=$(basename "$MODEL_PATH")
EXP_NAME=${EXP_NAME_BASE}-${MODEL_NAME}-bs${ROLLOUT_BATCH_SIZE}-minibs${PPO_MINI_BATCH_SIZE}-gs${GROUP_SIZE}-temp${TRAIN_TEMPERATURE}-${NNODES}nodes
# Launch the verl PPO trainer entry point with key=value overrides.
# Values come from the variables assigned earlier in this script
# (ROLLOUT_BATCH_SIZE, MAX_PROMPT_LENGTH, RES_LENGTH, MODEL_PATH,
# PPO_MINI_BATCH_SIZE, MAX_TOKEN_LEN, GROUP_SIZE, TRAIN_TEMPERATURE,
# PROJECT_NAME, EXP_NAME, NNODES).
# Any arguments given to this script are appended last via "$@"; it is quoted
# so each caller-supplied override is forwarded as exactly one word (no
# re-splitting or glob expansion).
# The logger list is single-quoted as a whole: the unquoted [...] was a shell
# glob pattern. The argument passed to the trainer is unchanged.
python3 -m verl.trainer.main_ppo \
  algorithm.adv_estimator=grpo \
  reward_model.reward_manager=prime \
  data.train_files=/data/verifiable-coding-problems-python-only/train.parquet \
  data.val_files=/data/verifiable-coding-problems-python-only/validation.parquet \
  data.train_batch_size="$ROLLOUT_BATCH_SIZE" \
  data.max_prompt_length="$MAX_PROMPT_LENGTH" \
  data.max_response_length="$RES_LENGTH" \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  custom_reward_function.path=scalebox.py \
  custom_reward_function.name=compute_score \
  +custom_reward_function.reward_kwargs.sandbox_fusion_url='http://0.0.0.0:8080/common_evaluate_batch' \
  actor_rollout_ref.model.path="$MODEL_PATH" \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.model.use_remove_padding=True \
  actor_rollout_ref.actor.use_dynamic_bsz=True \
  actor_rollout_ref.actor.ppo_mini_batch_size="$PPO_MINI_BATCH_SIZE" \
  actor_rollout_ref.actor.use_kl_loss=True \
  actor_rollout_ref.actor.kl_loss_coef=0.001 \
  actor_rollout_ref.actor.kl_loss_type=low_var_kl \
  actor_rollout_ref.actor.clip_ratio=0.2 \
  actor_rollout_ref.actor.clip_ratio_high=0.28 \
  actor_rollout_ref.model.enable_gradient_checkpointing=True \
  actor_rollout_ref.actor.fsdp_config.param_offload=True \
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
  actor_rollout_ref.actor.use_torch_compile=False \
  actor_rollout_ref.ref.use_torch_compile=False \
  actor_rollout_ref.rollout.mode=sync \
  actor_rollout_ref.rollout.max_num_batched_tokens="$MAX_TOKEN_LEN" \
  actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
  actor_rollout_ref.rollout.n="$GROUP_SIZE" \
  actor_rollout_ref.rollout.max_model_len="$MAX_TOKEN_LEN" \
  actor_rollout_ref.rollout.temperature="$TRAIN_TEMPERATURE" \
  actor_rollout_ref.ref.fsdp_config.param_offload=True \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name="$PROJECT_NAME" \
  trainer.experiment_name="$EXP_NAME" \
  trainer.n_gpus_per_node=16 \
  trainer.nnodes="$NNODES" \
  trainer.save_freq=50 \
  trainer.test_freq=50 \
  trainer.total_epochs=15 \
  trainer.device=npu "$@"