#!/bin/bash
# Build the MindSpeed Core Docker image.
# Abort on the first unhandled command failure.
set -e

# Directory containing this script; the Dockerfile is expected beside it.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
DOCKERFILE="${SCRIPT_DIR}/Dockerfile"

# Target platform defaults.
NPU_TYPE=910b
OS=openeuler24.03

# Base image selection: BASE_IMAGE stays empty unless a full image name is given.
BASE_IMAGE_VERSION=9.0.0-beta.2
BASE_IMAGE=
PYTHON_VERSION=3.11

# Framework versions and source refs.
TORCH_VERSION=2.7.1
TORCH_NPU_VERSION=2.7.1
MINDSPEED_BRANCH=master
MEGATRON_BRANCH=core_v0.12.1

# Output image name (empty means "derive one later") and build behaviour flags.
IMAGE_NAME=
NO_CACHE=
CLEANUP_ON_FAIL=false

# Track whether the NPU type / OS were set explicitly rather than defaulted.
NPU_TYPE_EXPLICIT=false
OS_EXPLICIT=false

# Remove all dangling Docker images, force-removing any container whose
# ancestor is one of them first.
# Best effort: stderr of every docker call is discarded and failures are
# ignored, so this function always returns 0.
# Outputs: one progress line on stdout, plus whatever docker rm/rmi print there.
cleanup_dangling() {
    echo ">>> Cleaning up dangling images and corresponding containers..."
    # Declare every working variable local so nothing leaks to the caller.
    local dangling_images img_id containers
    dangling_images=$(docker images -f "dangling=true" -q 2>/dev/null || true)
    if [ -z "$dangling_images" ]; then
        return 0
    fi

    # Word splitting below is intentional: docker prints one ID per line and
    # IDs never contain whitespace or glob characters.
    for img_id in $dangling_images; do
        containers=$(docker ps -a -q --filter "ancestor=$img_id" 2>/dev/null || true)
        if [ -n "$containers" ]; then
            # shellcheck disable=SC2086
            docker rm -f $containers 2>/dev/null || true
        fi
    done
    # shellcheck disable=SC2086
    docker rmi $dangling_images 2>/dev/null || true
}

# Print the usage text (options, examples and notes) to stdout.
# The script name shown is taken from $0.
show_help() {
    local prog="$0"
    cat <<USAGE
Usage: ${prog} [OPTIONS]

Build MindSpeed Core Docker Image

Options:
    -t, --npu-type TYPE       NPU type: a3 or 910b (default: 910b)
    -o, --os OS               OS: openeuler24.03 or ubuntu22.04 (default: openeuler24.03, auto-detected from --base-image if not specified)
    -i, --image-name NAME     Image name (default: mindspeed-core:{branch}-{chip}-{os}-py{py_ver}-{arch})
    -n, --no-cache            Build without cache
    --base-image-version VER  Base image CANN version (default: 9.0.0-beta.2)
    --base-image IMAGE        Full base image name (higher priority than --base-image-version; passed through unchanged)
    --python-version VER      Python tag in the CANN base image (default: 3.11)
    --torch-version VER       PyTorch version (default: 2.7.1)
    --torch-npu-version VER   torch_npu version (default: 2.7.1)
    --mindspeed-branch REF    MindSpeed branch/tag/ref to clone (default: master)
    --megatron-branch REF     Megatron-LM branch/tag/ref to checkout (default: core_v0.12.1)
    --cleanup-on-fail         Clean dangling images/containers if build fails
    -h, --help                Show help

Examples:
    bash ${prog}
    bash ${prog} -t a3 -o openeuler24.03 --base-image-version 9.0.0-beta.2
    bash ${prog} --base-image swr.cn-south-1.myhuaweicloud.com/ascendhub/cann:9.0.0-beta.2-910b-openeuler24.03-py3.11

Note:
    CANN base image tags use lowercase chip names, such as a3 and 910b. A full --base-image value is used exactly as provided.
USAGE
}

# Infer NPU type, OS and Python version from the tag of a base image reference.
# Arguments:
#   $1 - image reference, e.g. registry[:port]/repo/name:tag[@digest]
# Globals written (each reset to "" first, then set only when recognised):
#   DETECTED_NPU_TYPE        "910b" or "a3"
#   DETECTED_OS              "openeuler24.03" or "ubuntu22.04"
#   DETECTED_PYTHON_VERSION  MAJOR.MINOR taken from a "py<MAJOR>.<MINOR>" token
# Matching is case-insensitive. If the reference carries no tag, the last path
# component of the repository name is scanned instead.
parse_base_image_tag() {
    local image="$1"
    local name tag tag_lower
    # "a3" must be a whole token delimited by '-' or '_' (or the tag's ends),
    # so it is also found at the very start or end of the tag.
    local a3_re='(^|[-_])a3([-_]|$)'
    local py_re='py([0-9]+\.[0-9]+)'

    DETECTED_NPU_TYPE=""
    DETECTED_OS=""
    DETECTED_PYTHON_VERSION=""

    # Drop any "@sha256:..." digest (its hex could contain "910b"), then the
    # registry/path part so a "host:port" colon is not mistaken for the tag
    # separator.
    name="${image%%@*}"
    name="${name##*/}"
    tag="${name##*:}"
    tag_lower=$(printf '%s' "$tag" | tr '[:upper:]' '[:lower:]')

    if [[ "$tag_lower" == *910b* ]]; then
        DETECTED_NPU_TYPE="910b"
    elif [[ "$tag_lower" =~ $a3_re ]]; then
        DETECTED_NPU_TYPE="a3"
    fi

    case "$tag_lower" in
        *openeuler24.03*) DETECTED_OS="openeuler24.03" ;;
        *ubuntu22.04*)    DETECTED_OS="ubuntu22.04" ;;
    esac

    if [[ "$tag_lower" =~ $py_re ]]; then
        DETECTED_PYTHON_VERSION="${BASH_REMATCH[1]}"
    fi
}

# Exit with an error unless the option being parsed is followed by a value.
#   $1 - option name as typed by the user
#   $2 - number of arguments still unparsed, counting the option itself
_require_option_value() {
    if [[ "$2" -lt 2 ]]; then
        echo "Error: option $1 requires a value" >&2
        exit 1
    fi
}

# Parse command-line options into the global configuration variables.
# Arguments: the script's command-line arguments ("$@").
# Exits 0 after printing help for -h/--help, and exits 1 on an unknown
# argument or on an option that is missing its value.
parse_args() {
    while [[ $# -gt 0 ]]; do
        # Without this check a trailing value option makes "shift 2" fail and
        # the script dies under "set -e" without any message.
        case "$1" in
            -t|--npu-type|-o|--os|-i|--image-name|--base-image-version|--base-image|--python-version|--torch-version|--torch-npu-version|--mindspeed-branch|--megatron-branch)
                _require_option_value "$1" "$#"
                ;;
        esac

        case "$1" in
            -t|--npu-type)        NPU_TYPE="$2"; NPU_TYPE_EXPLICIT=true; shift 2 ;;
            -o|--os)              OS="$2"; OS_EXPLICIT=true; shift 2 ;;
            -i|--image-name)      IMAGE_NAME="$2"; shift 2 ;;
            -n|--no-cache)        NO_CACHE="--no-cache"; shift ;;
            --base-image-version) BASE_IMAGE_VERSION="$2"; shift 2 ;;
            --base-image)         BASE_IMAGE="$2"; shift 2 ;;
            --python-version)     PYTHON_VERSION="$2"; shift 2 ;;
            --torch-version)      TORCH_VERSION="$2"; shift 2 ;;
            --torch-npu-version)  TORCH_NPU_VERSION="$2"; shift 2 ;;
            --mindspeed-branch)   MINDSPEED_BRANCH="$2"; shift 2 ;;
            --megatron-branch)    MEGATRON_BRANCH="$2"; shift 2 ;;
            --cleanup-on-fail)    CLEANUP_ON_FAIL=true; shift ;;
            -h|--help)            show_help; exit 0 ;;
            *)
                # Usage errors go to stderr so stdout stays clean for callers.
                echo "Unknown argument: $1" >&2
                show_help >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# --- Resolve and validate the build configuration ---------------------------

if [[ ! -f "$DOCKERFILE" ]]; then
    echo "Error: Dockerfile not found: $DOCKERFILE" >&2
    exit 1
fi

# When a full base image is given, infer chip, OS and Python version from its
# tag. An explicitly requested NPU type or OS wins over the detected one; a
# Python version found in the tag always replaces PYTHON_VERSION.
DETECTED_NPU_TYPE=""
DETECTED_OS=""
DETECTED_PYTHON_VERSION=""
if [[ -n "$BASE_IMAGE" ]]; then
    parse_base_image_tag "$BASE_IMAGE"
    if [[ "$NPU_TYPE_EXPLICIT" == false && -n "$DETECTED_NPU_TYPE" ]]; then
        NPU_TYPE="$DETECTED_NPU_TYPE"
    fi
    if [[ "$OS_EXPLICIT" == false && -n "$DETECTED_OS" ]]; then
        OS="$DETECTED_OS"
    fi
    if [[ -n "$DETECTED_PYTHON_VERSION" ]]; then
        PYTHON_VERSION="$DETECTED_PYTHON_VERSION"
    fi
fi

# Normalise to lowercase so user input is accepted case-insensitively.
NPU_TYPE_LOWER=$(printf '%s' "$NPU_TYPE" | tr '[:upper:]' '[:lower:]')
OS=$(printf '%s' "$OS" | tr '[:upper:]' '[:lower:]')

case "$NPU_TYPE_LOWER" in
    a3|910b) ;;
    *)
        echo "Error: NPU type must be a3 or 910b (got: ${NPU_TYPE})" >&2
        exit 1
        ;;
esac

# Validate the OS and pick the matching package-repo configuration script.
case "$OS" in
    ubuntu22.04)
        OS_FAMILY="ubuntu"
        REPO_SCRIPT="configure_apt_repo.sh"
        ;;
    openeuler24.03)
        OS_FAMILY="openeuler"
        REPO_SCRIPT="configure_yum_repo.sh"
        ;;
    *)
        echo "Error: OS must be ubuntu22.04 or openeuler24.03 (got: ${OS})" >&2
        exit 1
        ;;
esac

# Report arm64 hosts under the aarch64 name; other values pass through.
HOST_ARCH=$(uname -m)
case "$HOST_ARCH" in
    arm64) ARCH_NAME="aarch64" ;;
    *)     ARCH_NAME="$HOST_ARCH" ;;
esac

if [[ -z "$IMAGE_NAME" ]]; then
    # Docker tags may only contain letters, digits, '_', '.' and '-', so every
    # other character of the ref (e.g. '/' or ':') becomes '-'.
    TAG_REF=$(printf '%s' "$MINDSPEED_BRANCH" | LC_ALL=C tr -c 'A-Za-z0-9_.-' '-')
    IMAGE_NAME="mindspeed-core:${TAG_REF}-${NPU_TYPE_LOWER}-${OS}-py${PYTHON_VERSION}-${ARCH_NAME}"
fi

# --- Stage the repo script, assemble arguments and run the build ------------

# The build context is the script directory. The OS-specific repo script is
# copied in under a fixed name and removed again when the script exits.
cd "$SCRIPT_DIR"
cp "${SCRIPT_DIR}/${REPO_SCRIPT}" configure_repo.sh
trap 'rm -f configure_repo.sh' EXIT

# Keep the arguments in an array so each value reaches docker as exactly one
# word, even if it contains whitespace or glob characters.
BUILD_ARGS=(
    --build-arg "OS=${OS}"
    --build-arg "OS_FAMILY=${OS_FAMILY}"
    --build-arg "NPU_TYPE=${NPU_TYPE_LOWER}"
    --build-arg "PYTHON_VERSION=${PYTHON_VERSION}"
    --build-arg "TORCH_VERSION=${TORCH_VERSION}"
    --build-arg "TORCH_NPU_VERSION=${TORCH_NPU_VERSION}"
    --build-arg "MINDSPEED_BRANCH=${MINDSPEED_BRANCH}"
    --build-arg "MEGATRON_BRANCH=${MEGATRON_BRANCH}"
)

# A full base image name takes priority over the base image version.
if [[ -n "$BASE_IMAGE" ]]; then
    BUILD_ARGS+=(--build-arg "BASE_IMAGE=${BASE_IMAGE}")
else
    BUILD_ARGS+=(--build-arg "BASE_IMAGE_VERSION=${BASE_IMAGE_VERSION}")
fi

if [[ -n "$NO_CACHE" ]]; then
    BUILD_ARGS+=("$NO_CACHE")
fi

echo "=========================================="
echo "Build Configuration"
echo "=========================================="
echo "NPU Type:           ${NPU_TYPE_LOWER}"
echo "OS:                 ${OS}"
echo "OS Family:          ${OS_FAMILY}"
echo "CPU Architecture:   ${ARCH_NAME}"
echo "Dockerfile:         ${DOCKERFILE}"
echo "Image Name:         ${IMAGE_NAME}"
# Show only the base image setting that is actually passed to docker.
if [[ -n "$BASE_IMAGE" ]]; then
    echo "Base Image:         ${BASE_IMAGE}"
else
    echo "Base Image Version: ${BASE_IMAGE_VERSION}"
fi
echo "Python Version:     ${PYTHON_VERSION}"
echo "PyTorch Version:    ${TORCH_VERSION}"
echo "torch_npu Version:  ${TORCH_NPU_VERSION}"
echo "MindSpeed Ref:      ${MINDSPEED_BRANCH}"
echo "Megatron-LM Ref:    ${MEGATRON_BRANCH}"
echo "No Cache:           ${NO_CACHE:-No}"
echo "=========================================="

# Capture the build status instead of letting "set -e" abort, so the failure
# banner and optional cleanup below can still run.
BUILD_RESULT=0
docker build \
    -t "$IMAGE_NAME" \
    -f "$DOCKERFILE" \
    "${BUILD_ARGS[@]}" \
    --network=host \
    . || BUILD_RESULT=$?

if [[ "$BUILD_RESULT" -eq 0 ]]; then
    echo "=========================================="
    echo "Build Complete!"
    echo "Image: ${IMAGE_NAME}"
    echo "=========================================="
    exit 0
fi

{
    echo "=========================================="
    echo "Build Failed!"
    echo "=========================================="
} >&2
if [[ "$CLEANUP_ON_FAIL" == true ]]; then
    cleanup_dangling
fi
exit "$BUILD_RESULT"