#!/bin/bash
# Resolve the directory that holds this script and the directory above it.
# Every expansion is quoted so that an install path containing spaces or glob
# characters is not word-split before dirname/realpath/source receive it.
scripts_dir=$(realpath "$(dirname "$0")")
root_dir=$(realpath "$(dirname "${scripts_dir}")")

echo "=========set cann env================"
# Source the set_env.sh scripts shipped under the Ascend toolkit and nnal/atb
# install locations so their settings apply to this shell.
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh

# parse_train_conf, config_vc_hosts, log_info, check_env and
# prepare_cluster_info are not defined in this file; presumably they are
# provided by utils.sh -- confirm there.
source "${scripts_dir}/base/utils.sh"
parse_train_conf

# In "hybrid" work mode the train master index is forced to 0.
if [[ "${WORK_MODE}" == "hybrid" ]]; then
    export MASTER_TRAIN_INDEX=0
fi
export RL_TRAIN_BACKEND="verl"
export ACLNN_ALLOW_RUNTIME_CACHE=1

config_vc_hosts
log_info "============================================================"
log_info "VC_TASK_HOSTS: ${VC_TASK_HOSTS}"
log_info "VC_WORKER_HOSTS: ${VC_WORKER_HOSTS}"
log_info "VC_TASK_INDEX: ${VC_TASK_INDEX}"
log_info "MASTER_TRAIN_INDEX: ${MASTER_TRAIN_INDEX}"
log_info "MASTER_INFER_INDEX: ${MASTER_INFER_INDEX}"
log_info "RL_TRAIN_BACKEND: ${RL_TRAIN_BACKEND}"
log_info "============================================================"

check_env
prepare_cluster_info
log_info "after parsing domain, VC_WORKER_HOSTS: ${VC_WORKER_HOSTS}"

# Clear out whatever is left under /tmp/ray before anything is launched.
rm -rf /tmp/ray/*
#######################################
# Run train/start_verl_train_cluster.sh and exit with its status.
# Globals (read):
#   scripts_dir, WORK_MODE, TRAIN_CONF_NAME, RESUME_TRAIN_CONF_NAME
# Arguments:
#   $1 - start mode. "daemon" runs the launcher in a background subshell and
#        waits on it; any other value runs it in the foreground.
# Outputs:
#   Launcher stdout, each line prefixed with "[train_cluster] ".
# Exits:
#   Always terminates the calling shell with the launcher's exit code.
# NOTE(review): "daemon" mode still blocks in `wait` and then exits the shell,
#   so nothing placed after a `start_train daemon` call can run -- confirm
#   that this is the intended meaning of "daemon".
#######################################
function start_train()
{
    # Function-scoped so these names do not leak into the rest of the script.
    local real_train_conf_name pid exit_code

    # A resume config name, when set, takes precedence over the regular one.
    if [[ -n "${RESUME_TRAIN_CONF_NAME}" ]]; then
        real_train_conf_name=${RESUME_TRAIN_CONF_NAME}
    else
        real_train_conf_name=${TRAIN_CONF_NAME}
    fi

    # Outside "hybrid" work mode the infer configuration is loaded first.
    if [[ "${WORK_MODE}" != "hybrid" ]]; then
        source "${scripts_dir}/infer/vllm/parse_infer_config.sh"
        get_infer_configs
    fi

    log_info "start verl train cluster, work_mode: ${WORK_MODE}, config name: ${real_train_conf_name}"
    log_info "start mode: $1"

    # Paths and the config name are quoted so a directory containing spaces
    # still resolves to a single argument.
    if [[ "$1" == "daemon" ]]; then
        (
            bash "${scripts_dir}/train/start_verl_train_cluster.sh" --config-name "${real_train_conf_name}" \
                | sed "s/^/[train_cluster] /"
            # Report the launcher's status, not sed's.
            exit "${PIPESTATUS[0]}"
        ) &
        pid=$!
        wait "${pid}"
        exit_code=$?
    else
        bash "${scripts_dir}/train/start_verl_train_cluster.sh" --config-name "${real_train_conf_name}" \
            | sed "s/^/[train_cluster] /"
        exit_code=${PIPESTATUS[0]}
    fi

    log_info "start_verl_train_cluster.sh end with exit code $exit_code"
    exit "${exit_code}"
}
#######################################
# Run infer/start_vllm_infer_cluster.sh and exit with its status.
# Globals (read):
#   scripts_dir, WORK_MODE, INFER_CONF_NAME, RESUME_INFER_CONF_NAME
# Arguments:
#   $1 - start mode. "daemon" runs the launcher in a background subshell and
#        waits on it; any other value runs it in the foreground.
# Outputs:
#   Launcher stdout, each line prefixed with "[infer_cluster] ".
# Returns:
#   Returns to the caller only when WORK_MODE is "hybrid" (nothing is started).
# Exits:
#   In every other case terminates the calling shell with the launcher's
#   exit code.
# NOTE(review): because both start modes end in `exit`, a caller that runs
#   `start_infer daemon` outside "hybrid" work mode never reaches its next
#   statement -- confirm whether "daemon" was meant to return instead.
#######################################
function start_infer()
{
    # Function-scoped so these names do not leak into the rest of the script.
    local real_infer_conf_name pid exit_code

    if [[ "${WORK_MODE}" == "hybrid" ]]; then
        log_warn "hybrid mode, skip start external vllm infer cluster"
        return
    fi
    log_info "start external vllm infer cluster"

    # A resume config name, when set, takes precedence over the regular one.
    if [[ -n "${RESUME_INFER_CONF_NAME}" ]]; then
        real_infer_conf_name=${RESUME_INFER_CONF_NAME}
    else
        real_infer_conf_name=${INFER_CONF_NAME}
    fi

    # Paths and the config name are quoted so a directory containing spaces
    # still resolves to a single argument.
    if [[ "$1" == "daemon" ]]; then
        (
            bash "${scripts_dir}/infer/start_vllm_infer_cluster.sh" --config-name "${real_infer_conf_name}" \
                | sed "s/^/[infer_cluster] /"
            # Report the launcher's status, not sed's.
            exit "${PIPESTATUS[0]}"
        ) &
        pid=$!
        wait "${pid}"
        exit_code=$?
    else
        bash "${scripts_dir}/infer/start_vllm_infer_cluster.sh" --config-name "${real_infer_conf_name}" \
            | sed "s/^/[infer_cluster] /"
        exit_code=${PIPESTATUS[0]}
    fi

    log_info "start_vllm_infer_cluster.sh end with exit code $exit_code"
    exit "${exit_code}"
}
# Role of this node; stays "NULL" until get_node_type finds a match.
NODE_TYPE="NULL"

#######################################
# Classify this node from its task index and the master indices.
# Globals (read):    VC_TASK_INDEX, MASTER_TRAIN_INDEX, MASTER_INFER_INDEX
# Globals (written): NODE_TYPE
#   MASTER_INFER_INDEX empty:
#     task index below the train master index  -> "infer"
#     task index at or above it                 -> "train"
#   MASTER_INFER_INDEX set:
#     task index equal to both master indices   -> "hybrid"
#   Any other combination leaves NODE_TYPE as it was.
#######################################
function get_node_type()
{
    if [[ -z "${MASTER_INFER_INDEX}" ]]; then
        if [[ "${VC_TASK_INDEX}" -lt "${MASTER_TRAIN_INDEX}" ]]; then
            NODE_TYPE="infer"
        elif [[ "${VC_TASK_INDEX}" -ge "${MASTER_TRAIN_INDEX}" ]]; then
            NODE_TYPE="train"
        fi
    elif [[ "${VC_TASK_INDEX}" -eq "${MASTER_TRAIN_INDEX}" ]] \
        && [[ "${VC_TASK_INDEX}" -eq "${MASTER_INFER_INDEX}" ]]; then
        NODE_TYPE="hybrid"
    fi
}
#######################################
# Export ASCEND_RT_VISIBLE_DEVICES as the last N device ids on this host,
# where N is tensor_parallel_size from the infer YAML config and the ids run
# from 0 to (Total Count * Chip Count) - 1.
# Globals (read):    root_dir, INFER_CONF_NAME
# Globals (written): ASCEND_RT_VISIBLE_DEVICES (exported)
# Returns:
#   0 on success; 1 (leaving ASCEND_RT_VISIBLE_DEVICES untouched) when
#   tensor_parallel_size or the NPU counts cannot be read as positive integers.
# NOTE(review): assumes `npu-smi info -l` prints "Total Count : N" and
#   "Chip Count : M" lines -- confirm against the tool's real output.
#######################################
function set_infer_visible_devices()
{
    local yaml_file="${root_dir}/configs/infer/${INFER_CONF_NAME}.yaml"
    # Declared apart from the assignments so `local` does not mask a failing
    # command substitution.
    local num_npus npu_list total_count chip_count total_cards

    # The path is passed as argv instead of being pasted into the Python
    # source, so quotes or backslashes in it cannot break the snippet.
    num_npus=$(python3 -c \
        'import sys, yaml; print(yaml.safe_load(open(sys.argv[1]))["tensor_parallel_size"])' \
        "${yaml_file}")

    # Query the device list once and reuse it for both counters.
    npu_list=$(npu-smi info -l)
    total_count=$(grep "Total Count" <<< "${npu_list}" | head -n 1 | awk -F: '{print $2}' | tr -d '[:space:]')
    chip_count=$(grep "Chip Count" <<< "${npu_list}" | head -n 1 | awk -F: '{print $2}' | tr -d '[:space:]')

    # Refuse to export a device list built from missing or non-numeric input.
    if ! [[ "${num_npus}" =~ ^[1-9][0-9]*$ \
        && "${total_count}" =~ ^[1-9][0-9]*$ \
        && "${chip_count}" =~ ^[1-9][0-9]*$ ]]; then
        log_error "cannot set ASCEND_RT_VISIBLE_DEVICES: tensor_parallel_size='${num_npus}', total count='${total_count}', chip count='${chip_count}'"
        return 1
    fi

    total_cards=$((total_count * chip_count))
    # Keep the highest-numbered ${num_npus} ids, joined with commas.
    ASCEND_RT_VISIBLE_DEVICES=$(seq 0 $((total_cards - 1)) | tail -n "${num_npus}" | paste -sd "," -)
    export ASCEND_RT_VISIBLE_DEVICES
    log_info "for infer, ASCEND_RT_VISIBLE_DEVICES: ${ASCEND_RT_VISIBLE_DEVICES}"
}
#######################################
# Signal handler: stop ray and send SIGTERM to the other members of this
# script's process group.
# Arguments:
#   $1 - name of the signal being handled; "SIGUSR1" additionally makes the
#        script exit with status 0.
# Outputs:
#   One status line on stdout.
#######################################
function cleanup()
{
    local pgid

    echo "receive $1, stopping child processes..."
    ray stop

    # ps may pad the pgid column with blanks. Strip them so the value is a
    # bare number: otherwise the blanks end up inside the regex group below,
    # the group leader's pid no longer matches the exclusion, and the leader
    # is sent SIGTERM as well.
    pgid=$(ps -o pgid= -p $$)
    pgid=${pgid//[[:space:]]/}

    # Without a pgid there is no group to enumerate; skip the kill step.
    if [[ -n "${pgid}" ]]; then
        # Every pid in the group except the group leader and this shell.
        pgrep -g "${pgid}" | grep -vE "^ *(${pgid}|$$) *$" | xargs -r kill -TERM 2>/dev/null
    fi

    if [[ "$1" == "SIGUSR1" ]]; then
        exit 0
    fi
}
# Route each handled signal to cleanup, passing the signal name along.
trap 'cleanup SIGINT' SIGINT
trap 'cleanup SIGTERM' SIGTERM
trap 'cleanup SIGUSR1' SIGUSR1

# Work out this node's role, then launch the matching cluster component(s).
get_node_type
log_info "NODE_TYPE: ${NODE_TYPE}"

case "${NODE_TYPE}" in
    infer)
        start_infer non_daemon
        ;;
    train)
        start_train non_daemon
        ;;
    hybrid)
        # The device restriction applies to the infer launch only; it is
        # removed again before the train launch.
        set_infer_visible_devices
        start_infer daemon
        unset ASCEND_RT_VISIBLE_DEVICES
        start_train daemon
        wait
        ;;
    *)
        log_error "unknown node type: ${NODE_TYPE}"
        ;;
esac