# Pull the Ascend toolkit and nnal/atb environment settings into this shell.
. /usr/local/Ascend/ascend-toolkit/set_env.sh
. /usr/local/Ascend/nnal/atb/set_env.sh

# Launch settings. Each one keeps a non-empty value supplied by the caller's
# environment and otherwise falls back to the default given here.
: "${NGPU:=8}"
: "${LOG_RANK:=0}"
export LOG_RANK
: "${CONFIG_FILE:=./torchtitan_npu/models/deepseek_v32/train_configs/deepseek_v32_671b_debug.toml}"
: "${TRAIN_FILE:=torchtitan_npu.entry}"
# Empty COMM_MODE selects the regular torchrun launch further down.
: "${COMM_MODE:=}"
: "${TORCHFT_LIGHTHOUSE:=http://localhost:29510}"
# Launch training in one of two modes, selected by COMM_MODE:
#   - non-empty: must be 'fake_backend'; starts a single python3 process with
#     --comm.mode set and --training.steps=1.
#   - empty:     starts torchrun with NGPU processes per node.
# Any extra command-line arguments ("$@") are forwarded to the training module.
# Every expansion is quoted so values containing spaces or glob characters
# (e.g. a CONFIG_FILE path with a space) reach the command as a single argument.
if [[ -n "${COMM_MODE}" ]]; then
  # Only the modes listed in this regex alternation are accepted.
  if [[ ! "${COMM_MODE}" =~ ^(fake_backend)$ ]]; then
    # Diagnostics go to stderr so they are not mixed into captured stdout.
    echo "Error: Invalid COMM_MODE. Use 'fake_backend'" >&2
    exit 1
  fi
  echo "Running with comm_mode=${COMM_MODE}"
  # The two overrides are placed after "$@"; presumably so they win over any
  # user-supplied value for the same option -- confirm against the arg parser.
  NGPU="${NGPU}" LOCAL_RANK=0 python3 -m "${TRAIN_FILE}" \
    --job.config_file "${CONFIG_FILE}" "$@" \
    --comm.mode="${COMM_MODE}" --training.steps=1
else
  # The VAR=value prefixes below are set only in the environment of the
  # torchrun process (and whatever it spawns), not in this shell.
  PYTORCH_NPU_ALLOC_CONF="expandable_segments:True" \
  CUDA_DEVICE_MAX_CONNECTIONS=1 \
  CPU_AFFINITY_CONF=1 \
  TASK_QUEUE_ENABLE=2 \
  HCCL_CONNECT_TIMEOUT=3600 \
  STREAMS_PER_DEVICE=32 \
  MULTI_STREAM_MEMORY_RESERVE=1 \
  TORCHFT_LIGHTHOUSE="${TORCHFT_LIGHTHOUSE}" \
  torchrun --nproc_per_node="${NGPU}" --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
    --local-ranks-filter "${LOG_RANK}" --role rank --tee 3 \
    -m "${TRAIN_FILE}" --job.config_file "${CONFIG_FILE}" "$@"
fi