#!/bin/bash
# ---------------------------------------------------------------------------
# Command-line handling.
#   $1  python entry script; only "main.py" is accepted
#   $2  optional IPv4 address of the host
# Sets the globals `py` and `ip` used by the rest of the script.
# ---------------------------------------------------------------------------

# Answer -h/--help before anything else: the script-name whitelist below would
# otherwise reject the flag as an invalid py file, and no process should be
# killed just because somebody asked for help.
case "${1:-}" in
  -h | --help)
    printf '%s\n' \
      "Usage: ./run.sh [OPTION]... [IP]..." \
      " " \
      "parameter explain:" \
      "[OPTION] main.py" \
      "[IP] IP address of the host" \
      "-h/--help show help message" \
      ""
    # An explicit help request is a successful invocation.
    exit 0
    ;;
esac

if [ $# -ge 1 ]; then
  py=$1
  ip=$2
else
  echo "for example: bash run.sh main.py 10.10.10.10 or bash run.sh main.py"
  exit 1
fi

case "$py" in
  main.py) ;;
  *)
    echo "invalid py file '$py'"
    exit 1
    ;;
esac

# Only now that the invocation is known to be well-formed, force-kill every
# process whose `ps -ef` line mentions "python" (same selection as before).
# The PIDs are collected first so `kill` is not invoked with an empty list.
mapfile -t stale_pids < <(ps -ef | grep python | grep -v grep | awk '{print $2}')
if ((${#stale_pids[@]} > 0)); then
  kill -9 "${stale_pids[@]}" > /dev/null 2>&1
fi
#######################################
# Check whether a string is a dotted-quad IPv4 address.
# Arguments:
#   $1 - candidate address
# Returns:
#   0 if the address is well-formed and every octet is <= 255
#   1 if the string is not four dot-separated groups of 1-3 digits
#   2 if the format is fine but an octet is greater than 255
#######################################
is_valid_ipv4() {
  local candidate=$1
  local octet
  local -a octets

  [[ $candidate =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]] || return 1

  IFS=. read -r -a octets <<< "$candidate"
  for octet in "${octets[@]}"; do
    # 10# forces decimal: a bare "08" or "09" would be parsed as an invalid
    # octal literal, making bash print an arithmetic error and skip the check.
    if ((10#$octet > 255)); then
      return 2
    fi
  done
  return 0
}

# The IP argument is optional; when present it must be valid or the script
# stops here. The two failure messages intentionally differ by the trailing
# period (format failure) to keep the script's existing output.
if [ -n "$ip" ]; then
  is_valid_ipv4 "$ip"
  case $? in
    0)
      echo "ip: $ip is valid"
      ;;
    2)
      echo "ip: $ip is not valid"
      exit 1
      ;;
    *)
      echo "ip: $ip is not valid."
      exit 1
      ;;
  esac
fi
# ---------------------------------------------------------------------------
# Runtime environment for the job launched at the end of this script.
# Everything below is plain variable setup; the exported names are consumed by
# the launched processes and their libraries.
# ---------------------------------------------------------------------------
cur_path=$(pwd)

# Installation prefix of python3.7 (two directory levels above the binary),
# resolved once instead of once per path that needs it.
python_prefix=$(dirname "$(dirname "$(command -v python3.7)")")
site_packages="${python_prefix}/lib/python3.7/site-packages"

rec_package_path=${site_packages}/mx_rec
so_path=${rec_package_path}/libasc
common_package_path=${site_packages}/rec_sdk_common
common_so_path=${common_package_path}/lib

# Passed verbatim to horovodrun via --mpi-args.
mpi_args='-x BIND_INFO="0:12 12:48 60:48" -x GLOG_stderrthreshold=2 -x GLOG_logtostderr=true -bind-to none -x NCCL_SOCKET_IFNAME=docker0 -mca btl_tcp_if_exclude docker0'
interface="lo"
local_rank_size=1
num_server=1
num_process=$((num_server * local_rank_size))

# Two levels above the directory the script was started from.
project_root=$(cd "$cur_path/../.." && pwd)

export IGNORE_INFER_ERROR=1
export HCCL_CONNECT_TIMEOUT=1200
export HCCL_OP_RETRY_ENABLE="L0:0, L1:0, L2:0"

# ${VAR:+:$VAR} appends the previous value only when there is one, so an unset
# variable no longer leaves a dangling ':' (an empty search-path entry).
export PYTHONPATH=${so_path}:${project_root}:${common_so_path}${PYTHONPATH:+:${PYTHONPATH}}

# Prefer the gcc 11.2.0 runtime libraries when that toolchain is installed.
if [ -f /usr/local/gcc11.2.0/lib64/libgomp.so.1 ]; then
  export LD_PRELOAD=/usr/local/gcc11.2.0/lib64/libgomp.so.1:/usr/local/gcc11.2.0/lib64/libstdc++.so.6
else
  export LD_PRELOAD=/usr/lib64/libgomp.so.1:/usr/lib64/libstdc++.so.6
fi

# On aarch64 additionally preload the libgomp bundled with scikit-learn, if it
# is present. The path is quoted so a prefix containing whitespace cannot break
# the file test.
sklearn_gomp="${site_packages}/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0"
if [ "$(uname -m)" == "aarch64" ] && [ -f "$sklearn_gomp" ]; then
  export LD_PRELOAD=${LD_PRELOAD}:${sklearn_gomp}
fi

export LD_LIBRARY_PATH=${so_path}:${common_so_path}:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
export JOB_ID=10086
export MXREC_LOG_LEVEL="ERROR"
export TF_CPP_MIN_LOG_LEVEL=3
export ASCEND_GLOBAL_LOG_LEVEL=3
export MXREC_MODE="ASC"
export USE_MPI=1
# Print the usage text when the first argument is -h or --help.
# NOTE(review): `py` is taken from $1 and the earlier `case "$py"` check only
# lets "main.py" through, so control does not appear to reach this branch with
# a help flag — confirm whether the check belongs earlier in the script.
case "$1" in
  --help | -h)
    printf '%s\n' \
      "Usage: ./run.sh [OPTION]... [IP]..." \
      " " \
      "parameter explain:" \
      "[OPTION] main.py" \
      "[IP] IP address of the host" \
      "-h/--help show help message" \
      ""
    exit 1
    ;;
esac
#######################################
# Configure the job through a rank table file located next to the script's
# start directory.
# Globals:
#   cur_path, local_rank_size, num_process (read)
#   RANK_TABLE_FILE, RANK_SIZE, ASCEND_VISIBLE_DEVICES, RANK_ID,
#   ASCEND_DEVICE_ID (exported)
# Outputs:
#   Progress messages on stdout.
# Exits:
#   Terminates the script with status 1 when the rank table file is missing.
#######################################
rankTableSolution() {
  local table_name="hccl_json_${local_rank_size}p.json"

  printf '%s\n' "The ranktable solution"

  export RANK_ID=0
  export ASCEND_VISIBLE_DEVICES="0"
  export ASCEND_DEVICE_ID=$RANK_ID
  export RANK_SIZE=$num_process
  export RANK_TABLE_FILE="${cur_path}/${table_name}"

  printf 'RANK_TABLE_FILE=%s\n' "$RANK_TABLE_FILE"

  # Nothing more to do when the table is in place.
  if [ -f "$RANK_TABLE_FILE" ]; then
    return 0
  fi

  printf '%s\n' "the rank table file does not exist. Please reference {${table_name}} to correctly config rank table file"
  exit 1
}
# ---------------------------------------------------------------------------
# Pick the configuration path for the job:
#   * no IP given          -> rank table file (rankTableSolution)
#   * well-formed IPv4     -> export the CM_* variables built from that IP
#   * anything else        -> report it and fall back to rankTableSolution
# `$ip` is quoted everywhere so it is never word-split or glob-expanded.
# ---------------------------------------------------------------------------
if [ -z "$ip" ]; then
  rankTableSolution
else
  # "yes" when each of the four dot-separated fields compares <= 255.
  VALID_CHECK=$(printf '%s\n' "$ip" | awk -F. '$1<=255&&$2<=255&&$3<=255&&$4<=255{print "yes"}')

  if printf '%s\n' "$ip" | grep -Eq '^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$' \
    && [ "$VALID_CHECK" == "yes" ]; then
    echo "ip: $ip available."
    echo "The ranktable solution is removed."
    export CM_CHIEF_IP=$ip
    export CM_CHIEF_PORT=6000
    export CM_CHIEF_DEVICE=0
    export CM_WORKER_IP=$ip
    export CM_WORKER_SIZE=$num_process
    echo "CM_CHIEF_IP=$CM_CHIEF_IP"
    echo "CM_CHIEF_PORT=$CM_CHIEF_PORT"
    echo "CM_CHIEF_DEVICE=$CM_CHIEF_DEVICE"
    echo "CM_WORKER_IP=$CM_WORKER_IP"
    echo "CM_WORKER_SIZE=$CM_WORKER_SIZE"
    # NOTE(review): this branch never assigns ASCEND_VISIBLE_DEVICES (only
    # rankTableSolution does), so this prints whatever was inherited from the
    # environment, possibly nothing — confirm the intended value.
    echo "ASCEND_VISIBLE_DEVICES=$ASCEND_VISIBLE_DEVICES"
  else
    # Malformed address or octet out of range: same fallback in both cases.
    echo "ip: $ip not available!"
    rankTableSolution
  fi
fi
# ---------------------------------------------------------------------------
# Launch the job through horovodrun and mirror its combined stdout/stderr into
# a timestamped log file in the current directory.
# ---------------------------------------------------------------------------
echo "use horovod to start tasks"
DATE=$(date +%Y-%m-%d-%H-%M-%S)

# The pipeline runs in a subshell with pipefail enabled so that a failing
# horovodrun is reflected in the exit status instead of being masked by the
# (normally successful) `tee`. Scoping the option to the subshell leaves the
# rest of the script's pipeline semantics untouched.
(
  set -o pipefail
  horovodrun --network-interface "${interface}" -np "${num_process}" \
    --mpi-args "${mpi_args}" --mpi -H "localhost:${local_rank_size}" \
    python3.7 "${py}" 2>&1 | tee "temp_${local_rank_size}p_t_${DATE}.log"
)