#!/bin/bash
MOTOR_PATCH_ROOT="/tmp/motor/examples/deployer/patch"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "Current node role: ROLE=$ROLE"
set_common_env
apply_openssl_gen_cert() {
local ca_path=$1
local base_cert_path=$2
local cert_names=$3
local gen_cert_script=$4
local ca_password=${5:-1234qwer}
local cert_password=${6:-5678asdf}
if [ ! -f "$gen_cert_script" ]; then
echo "Error: Certificate generation script not found: $gen_cert_script"
return 1
fi
if [ -z "$cert_names" ]; then
echo "Error: cert_names parameter is required"
echo "Usage: apply_openssl_gen_cert <ca_path> <base_cert_path> <cert_names> <gen_cert_script> [ca_password] [cert_password]"
echo "Example: apply_openssl_gen_cert /path/to/ca /path/to/security \"infer mgmt etcd clusterd\" /path/to/openssl_gen_cert.sh"
return 1
fi
for cert_name in $cert_names; do
local cert_path="${base_cert_path}/${cert_name}"
echo "Generating certificate for: $cert_name"
echo "Certificate path: $cert_path"
mkdir -p "$cert_path"
cat > "${cert_path}/key_pwd.txt" <<- EOF
${cert_password}
EOF
cp "$ca_path/ca.pem" "$ca_path/ca.key.pem" "$cert_path"
bash "$gen_cert_script" "$ca_path" "$cert_path" "$ca_password" "$cert_password"
if [ $? -ne 0 ]; then
echo "Error: Failed to generate certificate for $cert_name"
return 1
fi
echo "Certificate generated successfully for: $cert_name"
echo "---"
done
echo "All certificates generated successfully!"
}
setup_tls_certificates() {
CA_PATH="${CA_PATH:-/mnt/cert_scripts/ca}"
BASE_CERT_PATH="${BASE_CERT_PATH:-/usr/local/Ascend/pyMotor/conf/security}"
CERT_NAMES="${CERT_NAMES:-infer mgmt}"
GEN_CERT_SCRIPT="${GEN_CERT_SCRIPT:-/mnt/cert_scripts/openssl_gen_cert.sh}"
if [ ! -f "$GEN_CERT_SCRIPT" ]; then
echo "Error: Certificate generation script not found: $GEN_CERT_SCRIPT"
echo "Please copy openssl_gen_cert.sh to the specified path or set GEN_CERT_SCRIPT environment variable"
return 1
fi
if [ ! -f "$CA_PATH/ca.pem" ] || [ ! -f "$CA_PATH/ca.key.pem" ]; then
echo "Error: CA certificate not found at $CA_PATH"
echo "Please generate CA certificate first:"
echo " bash /mnt/cert_scripts/openssl_gen_ca.sh /mnt/cert_scripts/ca/"
echo "Or set CA_PATH environment variable to the correct CA certificate path"
return 1
fi
echo "TLS is enabled, generating certificates..."
echo "CA_PATH: $CA_PATH"
echo "BASE_CERT_PATH: $BASE_CERT_PATH"
echo "CERT_NAMES: $CERT_NAMES"
echo "GEN_CERT_SCRIPT: $GEN_CERT_SCRIPT"
apply_openssl_gen_cert "$CA_PATH" "$BASE_CERT_PATH" "$CERT_NAMES" "$GEN_CERT_SCRIPT"
}
if [ -n "$ENABLE_GEN_CERT" ] && [ "$ENABLE_GEN_CERT" = "true" ]; then
setup_tls_certificates
fi
setup_motor_log_path() {
if [ -n "$MOTOR_LOG_ROOT_PATH" ] && [ -n "$MODEL_NAME" ] && [ -n "$SERVICE_ID" ]; then
chmod 750 "$MOTOR_LOG_ROOT_PATH"
if [ ! -d "$MOTOR_LOG_ROOT_PATH/$MODEL_NAME/$SERVICE_ID/motor" ]; then
mkdir -p -m 750 "$MOTOR_LOG_ROOT_PATH/$MODEL_NAME/$SERVICE_ID/motor"
fi
export MOTOR_LOG_PATH="$MOTOR_LOG_ROOT_PATH/$MODEL_NAME/$SERVICE_ID/motor"
fi
}
setup_ascend_work_path() {
if [ -n "$MOTOR_LOG_ROOT_PATH" ] && [ -n "$MODEL_NAME" ] && [ -n "$SERVICE_ID" ]; then
chmod 750 "$MOTOR_LOG_ROOT_PATH"
if [ ! -d "$MOTOR_LOG_ROOT_PATH/$MODEL_NAME/$SERVICE_ID/ascend_work_path" ];then
mkdir -p -m 750 "$MOTOR_LOG_ROOT_PATH/$MODEL_NAME/$SERVICE_ID/ascend_work_path"
fi
export ASCEND_WORK_PATH="$MOTOR_LOG_ROOT_PATH/$MODEL_NAME/$SERVICE_ID/ascend_work_path"
fi
}
setup_ascend_cache_path() {
if [ -n "$MOTOR_LOG_ROOT_PATH" ] && [ -n "$MODEL_NAME" ] && [ -n "$SERVICE_ID" ]; then
chmod 750 "$MOTOR_LOG_ROOT_PATH"
if [ ! -d "$MOTOR_LOG_ROOT_PATH/$MODEL_NAME/$SERVICE_ID/ascend_cache_path" ];then
mkdir -p -m 750 "$MOTOR_LOG_ROOT_PATH/$MODEL_NAME/$SERVICE_ID/ascend_cache_path"
fi
export ASCEND_CACHE_PATH="$MOTOR_LOG_ROOT_PATH/$MODEL_NAME/$SERVICE_ID/ascend_cache_path"
fi
}
apply_shuffle_safetensors_patch() {
local patch_script="${MOTOR_PATCH_ROOT}/patch_apply_shuffle_safetensors.py"
if [ ! -f "$patch_script" ]; then
echo "Warning: shuffle safetensors patch script not found: $patch_script"
return 0
fi
python3 "$patch_script"
}
setup_jemalloc() {
jemalloc_path=$(find /usr -type f -name "libjemalloc.so.2" 2>/dev/null | head -n 1)
if [[ -n "$jemalloc_path" ]]; then
export LD_PRELOAD="${jemalloc_path}:${LD_PRELOAD}"
echo "jemalloc found at: $jemalloc_path"
echo "LD_PRELOAD is set successfully."
else
echo "Warning: libjemalloc.so.2 not found under /usr"
echo "Please make sure jemalloc is installed."
fi
}
USER_CONFIG_FILE="$CONFIGMAP_PATH/user_config.json"
export USER_CONFIG_PATH="$USER_CONFIG_FILE"
mkdir "$CONFIG_PATH" -p
chmod 750 "$CONFIG_PATH"
USER_CONFIG_DST="$CONFIG_PATH/user_config.json"
CONFIG_SYNC_INTERVAL="${CONFIG_SYNC_INTERVAL:-10}"
CONFIG_SYNC_PID_FILE="$CONFIG_PATH/user_config_sync.pid"
sync_user_config() {
if [ -f "$USER_CONFIG_FILE" ]; then
if [ ! -f "$USER_CONFIG_DST" ] || ! cmp -s "$USER_CONFIG_FILE" "$USER_CONFIG_DST"; then
cp -f "$USER_CONFIG_FILE" "$USER_CONFIG_DST"
chmod 640 "$USER_CONFIG_DST"
fi
export USER_CONFIG_PATH="$USER_CONFIG_DST"
else
export USER_CONFIG_PATH="$USER_CONFIG_FILE"
fi
}
sync_user_config
if [ -f "$USER_CONFIG_FILE" ]; then
if [ -f "$CONFIG_SYNC_PID_FILE" ] && kill -0 "$(cat "$CONFIG_SYNC_PID_FILE")" 2>/dev/null; then
echo "Config sync loop already running (pid=$(cat "$CONFIG_SYNC_PID_FILE"))"
else
(
while true; do
sleep "$CONFIG_SYNC_INTERVAL"
sync_user_config
done
) &
echo "$!" > "$CONFIG_SYNC_PID_FILE"
fi
fi
if [ "$SAVE_CORE_DUMP_FILE_ENABLE" = "1" ]; then
ulimit -c 31457280
mkdir -p /var/coredump
chmod 700 /var/coredump
sysctl -w kernel.core_pattern=/var/coredump/core.%e.%p.%t
else
ulimit -c 0
fi
set_cann_env() {
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/driver/lib64/common"
source "$CANN_INSTALL_PATH/ascend-toolkit/set_env.sh"
source "$CANN_INSTALL_PATH/nnal/atb/set_env.sh"
}
_motor_deploy_hardware_type() {
local cfg=""
if [ -n "$USER_CONFIG_PATH" ] && [ -f "$USER_CONFIG_PATH" ]; then
cfg="$USER_CONFIG_PATH"
elif [ -n "$USER_CONFIG_FILE" ] && [ -f "$USER_CONFIG_FILE" ]; then
cfg="$USER_CONFIG_FILE"
fi
if [ -z "$cfg" ]; then
echo "Error: Config file missing" >&2
return
fi
python3 -c "import json,sys; d=json.load(open(sys.argv[1],encoding='utf-8')); print(d.get('motor_deploy_config',{}).get('hardware_type',''))" "$cfg" 2>/dev/null || echo ""
}
is_a5_hardware() {
local hw_type
hw_type="$(_motor_deploy_hardware_type)"
case "$hw_type" in
350-Atlas-8|350-Atlas-16|350-Atlas-4p-8|350-Atlas-4p-16|850-Atlas-8p-8|850-SuperPod-Atlas-8|950-SuperPod-Atlas-8)
return 0
;;
*)
return 1
;;
esac
}
set_a5_engine_env() {
local if_name ip
if_name=$(awk '$2 == "00000000" {print $1; exit}' /proc/net/route)
ip="${HOST_IP:-$POD_IP}"
if [ -z "$if_name" ]; then
echo "Warning: failed to detect default route interface from /proc/net/route, skip GLOO/TP/HCCL socket ifname env" >&2
unset GLOO_SOCKET_IFNAME TP_SOCKET_IFNAME HCCL_SOCKET_IFNAME
else
export GLOO_SOCKET_IFNAME="$if_name"
export TP_SOCKET_IFNAME="$if_name"
export HCCL_SOCKET_IFNAME="$if_name"
fi
if [ -z "$ip" ]; then
echo "Warning: HOST_IP and POD_IP are both empty, skip HCCL_IF_IP env" >&2
unset HCCL_IF_IP
else
export HCCL_IF_IP="$ip"
fi
export PATH="$PATH:/usr/local/go/bin"
export LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH:-}"
export ASCEND_LOCAL_COMM_RES_PATH="${ASCEND_LOCAL_COMM_RES_PATH:-/etc/hixlep}"
}
gen_ranktable_config() {
if is_a5_hardware; then
echo "hardware_type is in A5 list, skip gen_ranktable_config (no hccl/ranktable on this platform)"
return
fi
if [ -f "$CONFIGMAP_PATH/hccl_tools.py" ]; then
echo "Using hccl_tools.py to generate ranktable.json..."
export HCCL_PATH="$CONFIG_PATH/hccl.json"
export PATH="/usr/local/Ascend/driver/tools:$PATH"
PYTHONUNBUFFERED=1 python3 "$CONFIGMAP_PATH/hccl_tools.py" --hccl_path "$HCCL_PATH"
export RANKTABLE_PATH="$CONFIG_PATH/ranktable.json"
else
echo "hccl_tools.py does not exist, skip ranktable generation"
fi
}
gen_kv_pool_config() {
if [ -n "$KVP_MASTER_SERVICE" ]; then
echo "Updating kv cache pool configuration file..."
export MOONCAKE_CONFIG_PATH=$CONFIG_PATH/kv_cache_pool_config.json
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH
if [ "$ROLE" = "SINGLE_CONTAINER" ]; then
KVP_MASTER_SERVICE=$POD_IP
fi
python3 "$CONFIGMAP_PATH/mooncake_config.py" pool "$MOONCAKE_CONFIG_PATH" "$USER_CONFIG_PATH"
fi
}
set_mf_store_env() {
if [ -n "$ASCEND_MF_STORE_URL" ]; then
if [[ "$ASCEND_MF_STORE_URL" =~ ^(tcp://)?\[([0-9a-fA-F:]+)\](:([0-9]+))?$ ]]; then
PROTO="${BASH_REMATCH[1]}"
HOST="${BASH_REMATCH[2]}"
PORT="${BASH_REMATCH[4]}"
echo "HOST is already IPv6 literal: $HOST"
elif [[ "$ASCEND_MF_STORE_URL" =~ ^(tcp://)?([^:/]+)(:([0-9]+))?$ ]]; then
PROTO="${BASH_REMATCH[1]}"
HOST="${BASH_REMATCH[2]}"
PORT="${BASH_REMATCH[4]}"
if [[ ! "$HOST" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
MAX_RETRY=5
RETRY_INTERVAL=10
RETRY_COUNT=0
MF_STORE_POD_IP=""
while [ $RETRY_COUNT -lt $MAX_RETRY ]; do
MF_STORE_POD_IP=$(getent hosts "$HOST" | awk '{print $1}' | head -n1)
if [ -n "$MF_STORE_POD_IP" ]; then
break
fi
RETRY_COUNT=$((RETRY_COUNT+1))
echo "resolve $HOST failed, retry $RETRY_COUNT/$MAX_RETRY ..."
sleep $RETRY_INTERVAL
done
if [ -z "$MF_STORE_POD_IP" ]; then
echo "get pod ip error: $HOST"
exit 1
else
echo "$HOST pod ip: $MF_STORE_POD_IP"
if [[ "$MF_STORE_POD_IP" == *:* ]]; then
export ASCEND_MF_STORE_URL="${PROTO}[${MF_STORE_POD_IP}]:${PORT}"
else
export ASCEND_MF_STORE_URL="${PROTO}${MF_STORE_POD_IP}:${PORT}"
fi
fi
else
echo "HOST is already IP: $HOST"
fi
else
echo "ASCEND_MF_STORE_URL format invalid: $ASCEND_MF_STORE_URL"
exit 1
fi
echo "ASCEND_MF_STORE_URL: $ASCEND_MF_STORE_URL"
fi
}