#!/usr/bin/env bash
#
# Launch RANK_SIZE (8) background train_ssd.py processes, one per rank, for
# distributed training with --device npu and --dist_backend hccl.
#
# Run from the repository root: scripts/set_npu_env.sh, train_ssd.py and the
# models/ directory are all referenced by relative path.
#
# Each process gets RANK exported in its environment and --rank on its command
# line. On aarch64 hosts every rank is additionally pinned with taskset to its
# own contiguous slice of CPU cores. The script does not wait for the
# processes; it returns as soon as all of them have been started.
#
# Combined stdout/stderr of all ranks goes to models/8p/log.txt.

source scripts/set_npu_env.sh
export RANK_SIZE=8

log_file=models/8p/log.txt

# The redirection below fails if the directory is missing, so create it first.
mkdir -p "${log_file%/*}"

# Start every run with an empty log. The ranks then open it in append mode so
# that they do not overwrite each other's output (opening it with '>' from
# each rank gives every process its own file offset into the same file).
: > "$log_file"

# CPU pinning is only done on aarch64, and only when there is at least one
# core per rank (otherwise the computed core range would be invalid).
pin_cpus=0
cores_per_rank=0
if [[ "$(uname -m)" == "aarch64" ]]; then
  cores_per_rank=$(( $(nproc) / RANK_SIZE ))
  if (( cores_per_rank >= 1 )); then
    pin_cpus=1
  fi
fi

for ((RANK_ID = 0; RANK_ID < RANK_SIZE; RANK_ID++)); do
  export RANK=$RANK_ID

  # Single definition of the training command; argument order matches the
  # original invocation.
  train_cmd=(
    python3 -u train_ssd.py
    --dataset_type voc
    --data_path /opt/npu/voc
    --net mb2-ssd-lite
    --base_net models/mb2-imagenet-71_8.pth
    --scheduler cosine
    --lr 0.08
    --batch_size 32
    --t_max 200
    --validation_epochs 5
    --checkpoint_folder models/8p
    --eval_dir models/8p/eval
    --num_epochs 200
    --debug_steps 1
    --amp
    --distributed
    --rank "$RANK_ID"
    --warm_up
    --warm_up_epochs 5
    --stay_lr 1
    --device_list '0,1,2,3,4,5,6,7'
    --dist_backend 'hccl'
    --device npu
  )

  if (( pin_cpus )); then
    # Rank i owns cores [i * cores_per_rank, (i + 1) * cores_per_rank - 1].
    # The slice must be derived from RANK_ID; using an unset variable here
    # would evaluate to 0 and pin every rank to the same cores.
    first_core=$(( cores_per_rank * RANK_ID ))
    last_core=$(( first_core + cores_per_rank - 1 ))
    train_cmd=(taskset -c "${first_core}-${last_core}" "${train_cmd[@]}")
  fi

  "${train_cmd[@]}" >> "$log_file" 2>&1 &
done