#!/bin/bash
# Work out two directories from the current working directory:
#   cur_path      - the project root
#   test_path_dir - the "test" directory directly under the project root
# The script may be started either from the project root or from inside
# test/; in the latter case we step up one level so cur_path (and the
# working directory) point at the project root.
cur_path=$(pwd)
cur_path_last_dirname=${cur_path##*/}
case "${cur_path_last_dirname}" in
  test)
    # Started inside test/: remember it, then move up to the project root.
    test_path_dir=${cur_path}
    cd ..
    cur_path=$(pwd)
    ;;
  *)
    # Started elsewhere: treat the current directory as the project root.
    test_path_dir=${cur_path}/test
    ;;
esac
# ---------------------------------------------------------------------------
# Run configuration.
# ---------------------------------------------------------------------------
# RANK_SIZE is exported to child processes and is also passed as --gpus to
# the training/eval commands further down in this script.
export RANK_SIZE=8
RANK_ID_START=0
Network="MatrixVT_for_PyTorch"
train_epochs=24
batch_size=8            # forwarded as --batch_size_per_device
learning_rate=0.000003125
# NOTE(review): data_path is not referenced in the visible part of this
# script; kept in case something sourced later reads it.
data_path=""

# Drop the outputs/ directory left by a previous run. The path is quoted so
# that a project path containing whitespace cannot be word-split into
# several rm targets, and the non-empty check keeps an unset cur_path from
# turning this into "rm -rf /outputs/".
if [[ -n "${cur_path}" && -d "${cur_path}/outputs/" ]]; then
  rm -rf -- "${cur_path}/outputs/"
fi

# Checkpoint file handed to the eval run via --ckpt_path: the file name is
# the zero-based index of the last epoch (train_epochs - 1).
ckpt_path="./outputs/matrixvt_bev_depth_lss_r50_256x704_128x128_24e_ema_cbgs/lightning_logs/version_0/$((train_epochs - 1)).pth"
# Device id used to name the per-device log directory and log files.
ASCEND_DEVICE_ID=0

# Start every run with an empty per-device log directory:
#   <test_path_dir>/output/<ASCEND_DEVICE_ID>
# "rm -rf" is a no-op when the directory does not exist, so no existence
# check is needed. Paths are quoted so whitespace in the project path
# cannot split them into several arguments.
device_output_dir="${test_path_dir}/output/${ASCEND_DEVICE_ID}"
rm -rf -- "${device_output_dir}"
mkdir -p -- "${device_output_dir}"

# Source env_npu.sh unless the caller's environment sets
# etp_running_flag=true. The variable is read directly rather than by
# grepping the output of `env`, which would also match any other variable
# whose name or value merely contains "etp_running_flag".
# NOTE(review): env_npu.sh presumably prepares the NPU runtime environment;
# its contents are not visible here.
etp_flag=${etp_running_flag:-}
if [[ "${etp_flag}" != "true" ]]; then
  source "${test_path_dir}/env_npu.sh"
fi
start_time=$(date +%s)
cd $cur_path/
nohup python3 ./bevdepth/exps/nuscenes/MatrixVT/matrixvt_bev_depth_lss_r50_256x704_128x128_24e_ema.py \
--seed 0 \
--learning_rate ${learning_rate} \
--max_epoch ${train_epochs} \
--amp_backend 'native' \
--gpus ${RANK_SIZE} \
--precision 16 \
--batch_size_per_device ${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
wait
end_time=$(date +%s)
e2e_time=$(( $end_time - $start_time ))
echo "------------------ Start eval ------------------"
nohup python3 ./bevdepth/exps/nuscenes/MatrixVT/matrixvt_bev_depth_lss_r50_256x704_128x128_24e_ema.py \
-e \
--ckpt_path ${ckpt_path} \
--seed 0 \
--learning_rate ${learning_rate} \
--max_epoch ${train_epochs} \
--amp_backend 'native' \
--gpus ${RANK_SIZE} \
--precision 16 \
--batch_size_per_device ${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/eval_${ASCEND_DEVICE_ID}.log 2>&1 &
wait
echo "------------------ Final result ------------------"

# ---------------------------------------------------------------------------
# Result summary: parse the training and eval logs, print the headline
# numbers and write them to <test_path_dir>/output/<ASCEND_DEVICE_ID>/<CaseName>.log
# ---------------------------------------------------------------------------
result_dir="${test_path_dir}/output/${ASCEND_DEVICE_ID}"
result_train_log="${result_dir}/train_${ASCEND_DEVICE_ID}.log"
result_eval_log="${result_dir}/eval_${ASCEND_DEVICE_ID}.log"

#######################################
# Average of the last 5000 "step_time=<number>" values found in a log.
# Any amount of whitespace (including none) may follow the '='.
# Arguments: $1 - log file
# Outputs:   the average on stdout; nothing when no value is found
#######################################
average_step_time() {
  local log_file=$1
  grep -o 'step_time=\s*[0-9]\+\(\.[0-9]\+\)\?' "${log_file}" \
    | sed 's/step_time=\s*//' \
    | tail -n 5000 \
    | awk '{ sum += $1 } END { if (NR > 0) print sum / NR }'
}

#######################################
# Throughput as an integer: batch_size / step_time * rank_size.
# Arguments: $1 - step time, $2 - per-device batch size, $3 - rank size
# Outputs:   the value on stdout; nothing when step time is empty or <= 0
#######################################
compute_fps() {
  awk -v step="$1" -v bs="$2" -v ranks="$3" \
    'BEGIN { if (step + 0 > 0) printf "%d\n", bs / step * ranks }'
}

#######################################
# Epoch duration in hours (two decimals): step_time * iterations / 3600.
# Arguments: $1 - step time, $2 - iterations per epoch
# Outputs:   the value on stdout; nothing when step time is empty
#######################################
compute_epoch_hours() {
  awk -v step="$1" -v iters="$2" \
    'BEGIN { if (step != "") printf "%.2f\n", step * iters / 3600 }'
}

#######################################
# Convert epoch hours to whole seconds.
# Arguments: $1 - epoch duration in hours
# Outputs:   the value on stdout; nothing when the input is empty
#######################################
compute_training_seconds() {
  awk -v hours="$1" 'BEGIN { if (hours != "") printf "%d\n", 3600 * hours }'
}

#######################################
# Extract the numeric value(s) printed as "<metric>: <number>" in a log.
# Arguments: $1 - metric name, $2 - log file
# Outputs:   one value per matching occurrence on stdout
#######################################
extract_metric() {
  local metric=$1
  local log_file=$2
  grep -o "${metric}:\s*[0-9]\+\(\.[0-9]\+\)\?\s*" "${log_file}" \
    | awk -F': ' '{print $2}'
}

#######################################
# Write the summary file in a single pass: the run description first,
# followed by the evaluation metrics. Writing everything through one
# redirection means the metric lines can no longer be wiped by a later
# truncating write to the same file.
# Globals:   Network, RANK_SIZE, BatchSize, DeviceType, CaseName, ActualFPS,
#            TrainingTime, e2e_time, metric_names, metric_values (all read)
# Arguments: $1 - output file (created or truncated)
#######################################
write_result_log() {
  local out_file=$1
  local idx
  {
    echo "Network = ${Network}"
    echo "RankSize = ${RANK_SIZE}"
    echo "BatchSize = ${BatchSize}"
    echo "DeviceType = ${DeviceType}"
    echo "CaseName = ${CaseName}"
    echo "ActualFPS = ${ActualFPS}"
    echo "TrainingTime = ${TrainingTime}"
    echo "E2ETrainingTime = ${e2e_time}"
    for idx in "${!metric_names[@]}"; do
      echo "${metric_names[idx]} = ${metric_values[idx]}"
    done
  } > "${out_file}"
}

# --- performance numbers from the training log ------------------------------
step_time=$(average_step_time "${result_train_log}")
FPS=$(compute_fps "${step_time}" "${batch_size}" "${RANK_SIZE}")

# Index of the last logged step: the text between "id " and " step" on the
# last line that mentions step_time.
# NOTE(review): assumes log lines look like "... id <N> step_time= <sec> ..."
# (inferred from this parsing) - confirm against the real training log.
train_iteration=$(grep 'step_time' "${result_train_log}" \
  | tail -1 \
  | awk -F 'id ' '{print $2}' \
  | awk -F ' step' '{print $1}')
# Step ids are treated as zero-based, so the count is last id + 1.
train_iteration=$((train_iteration + 1))
EpochTime=$(compute_epoch_hours "${step_time}" "${train_iteration}")

# Some awk implementations print +/-2147483647 for an overflowing %d;
# treat that as "no valid value".
if [[ "${FPS}" == "2147483647" || "${FPS}" == "-2147483647" ]]; then
  FPS=""
fi

echo "Final Performance images/sec : $FPS"
echo "Final Performance each_epoch_time : $EpochTime h"
echo "E2E Training Duration sec : $e2e_time"

BatchSize=${batch_size}
DeviceType=$(uname -m)
CaseName="${Network}_bs${BatchSize}_${RANK_SIZE}p_acc"
ActualFPS=${FPS}
TrainingTime=$(compute_training_seconds "${EpochTime}")

# --- accuracy metrics from the eval log -------------------------------------
metric_names=("mAP" "mATE" "mASE" "mAOE" "mAVE" "mAAE" "NDS")
metric_values=()
for metric_name in "${metric_names[@]}"; do
  metric_value=$(extract_metric "${metric_name}" "${result_eval_log}")
  metric_values+=("${metric_value}")
  echo "${metric_name}: ${metric_value}"
done

write_result_log "${result_dir}/${CaseName}.log"