#!/bin/bash
# Directory the script was launched from.
DIR=$(pwd)

# Model identity: short name used in result file names, and the checkpoint id.
Network='Qwen-VL'
model_name='Qwen/Qwen-VL-Chat'

# Training defaults.
data_path='path_to_data'
output_path='./output-qwen-vl'
epochs=5
batch_size=1
gradient_accumulation_steps=16
model_max_length=2048

# Launcher topology handed to torchrun.
GPUS_PER_NODE=8
NNODES=1
NODE_RANK=0
MASTER_ADDR='localhost'
MASTER_PORT=6001
#######################################
# Apply "--key=value" command-line overrides to the training settings.
# Each argument is matched by prefix (same patterns as before); the value is
# everything after the first '='. Arguments that match no known flag are
# ignored.
# Globals:   batch_size, model_name, data_path, gradient_accumulation_steps,
#            epochs, output_path (each written only when its flag is given)
# Arguments: the script's positional parameters
#######################################
parse_args() {
  local para
  # "$@" keeps every argument intact, so values containing spaces or glob
  # characters (e.g. a data path) are not split or expanded.
  for para in "$@"; do
    case "${para}" in
      --batch_size*) batch_size="${para#*=}" ;;
      --model_name*) model_name="${para#*=}" ;;
      --data_path*) data_path="${para#*=}" ;;
      --gradient_accumulation_steps*) gradient_accumulation_steps="${para#*=}" ;;
      --epochs*) epochs="${para#*=}" ;;
      --output_path*) output_path="${para#*=}" ;;
    esac
  done
}
parse_args "$@"
# torchrun launcher options, assembled piece by piece into one
# whitespace-separated string (it is expanded unquoted at the call site).
DISTRIBUTED_ARGS=$'\n'
DISTRIBUTED_ARGS+="--nproc_per_node ${GPUS_PER_NODE} "
DISTRIBUTED_ARGS+="--nnodes ${NNODES} "
DISTRIBUTED_ARGS+="--node_rank ${NODE_RANK} "
DISTRIBUTED_ARGS+="--master_addr ${MASTER_ADDR} "
DISTRIBUTED_ARGS+="--master_port ${MASTER_PORT}"$'\n'
# Work out where the test/ directory is relative to the launch directory.
# If the script was started from inside a directory named "test", that
# directory is test_path_dir and the working directory moves up one level;
# otherwise test_path_dir is the test/ subdirectory of the launch directory.
# Afterwards DIR holds the current working directory.
DIR=$(pwd)
# (identifier spelling kept as-is in case other scripts reference it)
cur_path_last_diename="${DIR##*/}"
if [[ "${cur_path_last_diename}" == "test" ]]; then
  test_path_dir="${DIR}"
  # Abort rather than carry on with a DIR that does not match reality.
  cd .. || { echo "Error: cannot change to parent of ${DIR}" >&2; exit 1; }
  DIR=$(pwd)
else
  test_path_dir="${DIR}/test"
fi
# Start every run with an empty output directory for this device configuration.
ASCEND_DEVICE_ID="8p"
# ':?' aborts if test_path_dir is unset or empty, so the recursive delete below
# can never resolve to a path directly under '/'.
case_output_dir="${test_path_dir:?test_path_dir must be set}/output/${ASCEND_DEVICE_ID}"
if [[ -d "${case_output_dir}" ]]; then
  rm -rf -- "${case_output_dir}"
fi
mkdir -p -- "${case_output_dir}"
# Load the NPU environment unless the caller signals, through the environment
# variable etp_running_flag=true, that it is already set up.
# The variable is read directly instead of grepping the output of `env`, which
# would also match any other variable whose name or value contains the text
# "etp_running_flag".
etp_flag="${etp_running_flag:-}"
if [[ "${etp_flag}" != "true" ]]; then
  # shellcheck source=/dev/null
  source "${test_path_dir}/env_npu.sh"
fi
# Run the fine-tuning job and measure its wall-clock duration.
# All output of the job (stdout and stderr) goes to train_log.
start_time=$(date +%s)
train_log="${test_path_dir}/output/8p/train_full_8p.log"

# Arguments for finetune.py. Held in an array so that user-supplied values
# (model name, data path, output path) survive intact even with spaces.
train_args=(
  --model_name_or_path "${model_name}"
  --data_path "${data_path}"
  --bf16 True
  --fix_vit True
  --output_dir "${output_path}"
  --num_train_epochs "${epochs}"
  --per_device_train_batch_size "${batch_size}"
  --per_device_eval_batch_size 1
  --gradient_accumulation_steps "${gradient_accumulation_steps}"
  --evaluation_strategy "no"
  --save_strategy "epoch"
  --save_total_limit 2
  --learning_rate 1e-5
  --weight_decay 0.1
  --adam_beta2 0.95
  --warmup_ratio 0.01
  --lr_scheduler_type "cosine"
  --logging_steps 1
  --report_to "none"
  --model_max_length "${model_max_length}"
  --gradient_checkpointing True
  --lazy_preprocess True
  --dataloader_pin_memory True
  --dataloader_num_workers 8
  --deepspeed finetune/ds_config_zero2.json
)

# DISTRIBUTED_ARGS is a whitespace-separated option string and must be split
# into words here, hence the deliberate lack of quotes.
# shellcheck disable=SC2086
torchrun ${DISTRIBUTED_ARGS} finetune.py "${train_args[@]}" > "${train_log}" 2>&1 &
train_pid=$!

# Waiting on the specific PID yields the job's exit status; a bare `wait`
# always returns 0 and would hide a failed run.
wait "${train_pid}"
train_status=$?
if (( train_status != 0 )); then
  echo "Warning: training exited with status ${train_status}; see ${train_log}" >&2
fi

end_time=$(date +%s)
e2e_time=$(( end_time - start_time ))
#######################################
# Print the mean of the last 10 "<number>s/it" readings found in a log.
# Arguments: $1 - path to the training log
# Outputs:   the mean on stdout; nothing when the log has no such reading
#            (avoids awk's division by zero on an empty input)
#######################################
mean_sec_per_iter() {
  grep -a 's/it' "$1" \
    | awk -F ', ' '{print $2}' \
    | awk -F 's/it' '{print $1}' \
    | tail -n 10 \
    | awk '{sum += $1} END {if (NR > 0) print sum / NR}'
}

#######################################
# Print the value that follows "'loss':" on every matching log line.
# Anything from the first ',' or '}' onward is dropped so only the value
# itself remains (assumes lines shaped like {'loss': 1.23, ...} - TODO confirm
# against the actual trainer output).
# Arguments: $1 - path to the training log
# Outputs:   one value per line on stdout
#######################################
extract_loss_values() {
  grep -a "'loss':" "$1" \
    | awk -F ': ' '{v = $2; sub(/[,}].*$/, "", v); print v}'
}

# Summarise the finished run on stdout and in a per-case result file.
train_log="${test_path_dir}/output/8p/train_full_8p.log"

echo "------------------ Final result ------------------"
# NOTE: despite its name, FPS holds seconds per iteration taken from the log.
FPS=$(mean_sec_per_iter "${train_log}")
# Throughput = model_max_length * batch_size * gradient_accumulation_steps
# divided by seconds per iteration. Only computed for a positive reading so
# bc is never handed an empty or zero divisor.
if [[ -n "${FPS}" ]] && awk -v spi="${FPS}" 'BEGIN {exit !(spi > 0)}'; then
  ActualFPS=$(echo "scale=2; $model_max_length*$batch_size*$gradient_accumulation_steps/$FPS" | bc)
else
  ActualFPS=""
  echo "Warning: no usable 's/it' reading in ${train_log}; performance not computed" >&2
fi
# Output labels are left untouched because other tooling may parse them.
echo "Final Performance images/sec : $ActualFPS"
echo "E2E Training Duration sec : $e2e_time"

BatchSize=${batch_size}
DeviceType=$(uname -m)
CaseName=${Network}_bs${BatchSize}_'full'

loss_file="${test_path_dir}/output/8p/train_${CaseName}_loss.txt"
extract_loss_values "${train_log}" > "${loss_file}"
# The last recorded loss is reported as the final loss.
ActualLoss=$(awk 'END {print}' "${loss_file}")

result_log="${test_path_dir}/output/${ASCEND_DEVICE_ID}/${CaseName}.log"
{
  echo "Network = ${Network}"
  # RANK_SIZE is not set anywhere in this script; when the environment does
  # not provide it, fall back to the number of launched processes.
  echo "RankSize = ${RANK_SIZE:-$(( GPUS_PER_NODE * NNODES ))}"
  echo "BatchSize = ${BatchSize}"
  echo "DeviceType = ${DeviceType}"
  echo "CaseName = ${CaseName}"
  echo "ActualFPS = ${ActualFPS}"
  echo "ActualLoss = ${ActualLoss}"
  echo "E2ETrainingTime = ${e2e_time}"
} > "${result_log}"