pkill -9 python
ray stop --force
rm -rf /tmp/ray
source /usr/local/Ascend/cann/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export HCCL_CONNECT_TIMEOUT=5400
export USE_OPTIMIZED_MODEL=0
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=300
export TASK_QUEUE_ENABLE=1
export COMBINED_ENABLE=1
export PYTORCH_NPU_ALLOC_CONF="garbage_collection_threshold:0.85"
export MULTI_STREAM_MEMORY_REUSE=1
export TOKENIZERS_PARALLELISM=false
export HYDRA_FULL_ERROR=1
export RAY_DEDUP_LOGS=0
export VLLM_ASCEND_ENABLE_NZ=0
ulimit -n 32768
NNODES=1
NPUS_PER_NODE=16
MASTER_ADDR="IP FOR MASTER NODE"
SOCKET_IFNAME="Your SOCKET IFNAME"
export HCCL_SOCKET_IFNAME=$SOCKET_IFNAME
export GLOO_SOCKET_IFNAME=$SOCKET_IFNAME
CURRENT_IP=$(ifconfig $SOCKET_IFNAME | grep -Eo 'inet (addr:)?([0-9]{1,3}\.){3}[0-9]{1,3}' | awk '{print $NF}')
if [ "$MASTER_ADDR" = "$CURRENT_IP" ]; then
ray start --head --port 6766 --dashboard-host=$MASTER_ADDR --node-ip-address=$CURRENT_IP --dashboard-port=8260 --resources='{"NPU": '$NPUS_PER_NODE'}'
while true; do
ray_status_output=$(ray status)
npu_count=$(echo "$ray_status_output" | grep -oP '(?<=/)\d+\.\d+(?=\s*NPU)' | head -n 1)
npu_count_int=$(echo "$npu_count" | awk '{print int($1)}')
device_count=$((npu_count_int / $NPUS_PER_NODE))
if [ "$device_count" -eq "$NNODES" ]; then
echo "Ray cluster is ready with $device_count devices (from $npu_count NPU resources), starting Python script."
ray status
break
else
echo "Waiting for Ray to allocate $NNODES devices. Current device count: $device_count"
sleep 5
fi
done
else
while true; do
sleep 5
ray start --address="$MASTER_ADDR:6766" --resources='{"NPU": '$NPUS_PER_NODE'}' --node-ip-address=$CURRENT_IP
ray status
if [ $? -eq 0 ]; then
echo "Successfully connected to the Ray cluster!"
break
else
echo "Failed to connect to the Ray cluster. Retrying in 5 seconds..."
sleep 5
fi
done
fi