#!/usr/bin/env bash
#
# Multi-turn SFT launch script for Ascend NPUs (verl FSDP trainer via torchrun).
#
# Must be run with bash: it relies on the `source` builtin, which is not
# available in every /bin/sh. All data/model/checkpoint/log paths used later
# in this script are relative, so run it from the intended working directory.

# --- Ascend toolkit environment ---------------------------------------------
# ASCEND_HOME_PATH is exported before the vendor scripts run; the toolkit's
# set_env.sh is then located through it so the path is spelled only once.
export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit
source "${ASCEND_HOME_PATH}/set_env.sh"
source /usr/local/Ascend/nnal/atb/set_env.sh

# --- Error reporting / log de-duplication -----------------------------------
export HYDRA_FULL_ERROR=1
export ASCEND_LAUNCH_BLOCKING=0
export RAY_DEDUP_LOGS=1

# --- Device selection -------------------------------------------------------
# Sixteen device ids (0-15) are made visible. This script later launches 16
# processes per node, so keep the two in sync when changing either.
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15

# --- Ascend runtime logging -------------------------------------------------
# NOTE(review): presumably event logs off, no log mirroring to stdout, and
# log level 3 — confirm the exact meanings against the CANN documentation for
# the installed version.
export ASCEND_GLOBAL_EVENT_ENABLE=0
export ASCEND_SLOG_PRINT_TO_STDOUT=0
export ASCEND_GLOBAL_LOG_LEVEL=3

# --- HCCL (collective communication) ----------------------------------------
# NOTE(review): timeouts are presumably in seconds — confirm against the HCCL
# documentation.
export HCCL_CONNECT_TIMEOUT=360
export HCCL_EXEC_TIMEOUT=360
export HCCL_IF_BASE_PORT=64033

# NOTE(review): CUDA-named variable on an NPU run; kept as-is because some
# component may still read it — confirm whether it is needed.
export CUDA_DEVICE_MAX_CONNECTIONS=1

# Trace every subsequent command so the resolved launch line is visible.
set -x
# --- Run configuration -------------------------------------------------------

# Launch topology handed to torchrun below.
nnodes="1"
nproc_per_node="16"

# Names forwarded to the trainer; experiment_name also names the checkpoint
# directory.
project_name="retool_sft"
experiment_name="multiturn-sft-qwen-3-4b-instruct"

# Dataset and base-model locations, relative to the current directory.
# NOTE(review): EVAL_DATA points at the same parquet file as TRAIN_DATA, so
# validation runs on the training set — confirm this is intentional.
TRAIN_DATA="ReTool-SFT/data/train-00000-of-00001.parquet"
EVAL_DATA="ReTool-SFT/data/train-00000-of-00001.parquet"
MODEL_PATH="ReTool-SFT/model/Qwen3-4B-Instruct-2507"

# Checkpoints are written under checkpoint/<experiment_name>.
SAVE_PATH="checkpoint/${experiment_name}"
# ---------------------------------------------------------------------------
# Launch multi-turn SFT with verl's FSDP trainer through torchrun.
#
# Reads:   nnodes, nproc_per_node, project_name, experiment_name, TRAIN_DATA,
#          EVAL_DATA, MODEL_PATH, SAVE_PATH (all assigned earlier in this
#          script).
# Outputs: combined stdout+stderr of the run, shown on the terminal and saved
#          to log/sft_run_log/<experiment_name>.log.
# Status:  non-zero if torchrun (or tee) fails, thanks to pipefail.
# ---------------------------------------------------------------------------

# Derive the log file from the experiment name so the two cannot drift apart.
log_dir="log/sft_run_log"
log_file="${log_dir}/${experiment_name}.log"

# tee does not create parent directories; without this step the run would go
# ahead on a fresh checkout but its log would not be saved.
mkdir -p -- "${log_dir}" || {
  printf 'error: cannot create log directory: %s\n' "${log_dir}" >&2
  exit 1
}

# By default a pipeline's status is that of its last command (tee), which
# would hide a failed training run from callers and schedulers.
set -o pipefail

# Hydra-style key=value overrides passed to the trainer module. An array keeps
# every override a single word regardless of the characters in the values.
train_args=(
  # Data
  "data.train_files=${TRAIN_DATA}"
  "data.val_files=${EVAL_DATA}"
  data.max_length=16384
  data.train_batch_size=64
  data.multiturn.enable=true
  data.multiturn.messages_key=messages
  data.multiturn.tools_key=tools
  data.micro_batch_size_per_gpu=8

  # Model
  "model.partial_pretrain=${MODEL_PATH}"
  model.strategy=fsdp

  # Trainer
  "trainer.default_local_dir=${SAVE_PATH}"
  "trainer.project_name=${project_name}"
  "trainer.experiment_name=${experiment_name}"
  'trainer.logger=["console", "tensorboard"]'
  trainer.total_epochs=6
  trainer.save_freq=31
  trainer.device=npu

  # Top-level options
  ulysses_sequence_parallel_size=4
  use_remove_padding=true
)

torchrun \
  --nnodes="${nnodes}" \
  --nproc_per_node="${nproc_per_node}" \
  -m verl.trainer.fsdp_sft_trainer \
  "${train_args[@]}" 2>&1 | tee "${log_file}"