data_path=""
model_path=""
for para in $*
do
if [[ $para == --data_path* ]];then
data_path=`echo ${para#*=}`
elif [[ $para == --model_path* ]];then
model_path=`echo ${para#*=}`
fi
done
if [[ $data_path == "" ]];then
echo "[Error] para \"data_path\" must be config"
exit 1
fi
if [[ $model_path == "" ]];then
echo "[Error] para \"model_path\" must be config"
exit 1
fi
ENGINE=vllm
export VLLM_USE_V1=1
export HCCL_CONNECT_TIMEOUT=5400
export USE_OPTIMIZED_MODEL=0
max_prompt_length=1024
max_response_length=2048
max_num_batched_tokens=8192
free_cache_engine=True
gpu_memory_utilization=0.5
tensor_model_parallel_size=4
enable_chunked_prefill=True
enforce_eager=False
train_batch_size=512
ppo_mini_batch_size=32
ppo_micro_batch_size_per_gpu=4
log_prob_micro_batch_size_per_gpu=4
use_remove_padding=True
ignore_eos=False
enable_gradient_checkpointing=True
nnodes=1
n_gpus_per_node=8
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=$data_path/train.parquet \
data.val_files=$data_path/test.parquet \
data.train_batch_size=$train_batch_size \
data.max_prompt_length=$max_prompt_length \
data.max_response_length=$max_response_length \
data.filter_overlong_prompts=True \
data.truncation='error' \
data.image_key=images \
data.shuffle=False \
actor_rollout_ref.model.path=$model_path \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=$use_remove_padding \
actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=$ppo_micro_batch_size_per_gpu \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.01 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.use_torch_compile=False \
actor_rollout_ref.model.enable_gradient_checkpointing=$enable_gradient_checkpointing \
critic.strategy=fsdp2 \
actor_rollout_ref.actor.strategy=fsdp2 \
actor_rollout_ref.actor.fsdp_config.fsdp_size=8 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.actor.fsdp_config.forward_prefetch=False \
actor_rollout_ref.ref.fsdp_config.forward_prefetch=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=$log_prob_micro_batch_size_per_gpu \
actor_rollout_ref.rollout.tensor_model_parallel_size=$tensor_model_parallel_size \
actor_rollout_ref.rollout.name=$ENGINE \
actor_rollout_ref.rollout.ignore_eos=$ignore_eos \
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
actor_rollout_ref.rollout.gpu_memory_utilization=$gpu_memory_utilization \
actor_rollout_ref.rollout.enforce_eager=$enforce_eager \
actor_rollout_ref.rollout.free_cache_engine=$free_cache_engine \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=$log_prob_micro_batch_size_per_gpu \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger=console \
trainer.project_name='verl_grpo_example_geo3k' \
trainer.experiment_name='qwen3_vl_8b_function_rm' \
trainer.n_gpus_per_node=$n_gpus_per_node \
trainer.nnodes=$nnodes \
trainer.save_freq=20 \
trainer.test_freq=-1 \
trainer.total_epochs=25 \
trainer.total_training_steps=100 \
trainer.device=npu \
trainer.val_before_train=False \
actor_rollout_ref.rollout.enable_chunked_prefill=$enable_chunked_prefill \
actor_rollout_ref.actor.entropy_from_logits_with_chunking=True \
actor_rollout_ref.ref.entropy_from_logits_with_chunking=True \
actor_rollout_ref.rollout.max_num_batched_tokens=$max_num_batched_tokens | tee train_qwen3_vl_8b_grpo_full.log 2>&1 &
wait
TPS=`grep 'perf/throughput:' train_qwen3_vl_8b_grpo_full.log | awk -F 'perf/throughput:' '{print$2}' | awk -F ' ' '{print$1}' | tail -n 5 | awk '{sum+=$1} END {print sum/NR}'`
echo "Final Performance TPS : $TPS"