#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#############################################################################
# Copyright (c) 2020 Huawei Technologies Co.,Ltd.
#
# openGauss is licensed under Mulan PSL v2.
# You can use this software according to the terms
# and conditions of the Mulan PSL v2.
# You may obtain a copy of Mulan PSL v2 at:
#
#          http://license.coscl.org.cn/MulanPSL2
#
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS,
# WITHOUT WARRANTIES OF ANY KIND,
# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
# See the Mulan PSL v2 for more details.
# ----------------------------------------------------------------------------
# Description  : DbClusterInfo.py is a utility to get cluster information
#############################################################################
# Standard library imports
import argparse
import binascii
import copy
import datetime
import getopt
import getpass
import ipaddress
import json
import logging.handlers as _handlers
import os
import pwd
import re
import shutil
import socket
import stat
import struct
import subprocess
import sys
import tempfile
import threading
import time
import types
import xml.etree.ElementTree as ET
import xml.etree.cElementTree as ETree

sys.path.append(os.path.split(os.path.realpath(__file__))[0] + "/../../")

# Network constants
NET_IPV6 = "ipv6"
NET_IPV4 = "ipv4"
ADDRESS_FAMILY_INDEX = 4
IP_ADDRESS_INDEX = 0

# Environment constants
ENV_CLUSTERCONFIG = "CLUSTERCONFIGFILE"

# Command path constants
CMD_PATH = ['/bin', '/usr/local/bin', '/usr/bin', '/sbin', '/usr/sbin']
CMD_CACHE = {}

# String constants
BLANK_SPACE = " "
COLON = ":"

# Instance role constants.
# The numeric values double as list indexes: dbNodeInfo looks up default
# ports as masterBasePorts[instRole] / standbyBasePorts[instRole].
INSTANCE_ROLE_UNDEFINED = -1
INSTANCE_ROLE_CMSERVER = 0
INSTANCE_ROLE_GTM = 1
INSTANCE_ROLE_ETCD = 2
# NOTE(review): "COODINATOR" is misspelled; presumably kept because other
# code refers to this name - confirm before renaming.
INSTANCE_ROLE_COODINATOR = 3
INSTANCE_ROLE_DATANODE = 4
INSTANCE_ROLE_CMAGENT = 5

# Base (first) ID number for each kind of instance
BASE_ID_CMSERVER = 1
BASE_ID_GTM = 1001
BASE_ID_CMAGENT = 10001
BASE_ID_DUMMYDATANODE = 3001
BASE_ID_COORDINATOR = 5001
BASE_ID_DATANODE = 6001
BASE_ID_ETCD = 7001

# Directory permissions (octal mode)
DIRECTORY_PERMISSION = 0o750

# Primary/standby instance ID constants
OLD_LAST_PRIMARYSTANDBY_BASEID_NUM = 7000
NEW_FIRST_PRIMARYSTANDBY_BASEID_NUM = 40000

# Master instance default ports
MASTER_BASEPORT_CMS = 5000
MASTER_BASEPORT_GTM = 6000
MASTER_BASEPORT_CMAGENT = 0  # cm agent has no port; placeholder for index 5 of the base-port lists
MASTER_BASEPORT_COO = 8000
MASTER_BASEPORT_DATA = 40000
MASTER_BASEPORT_ETCD = 2379
# Standby instance default ports
STANDBY_BASEPORT_CMS = 5500
STANDBY_BASEPORT_GTM = 6500
STANDBY_BASEPORT_CMAGENT = 0  # cm agent has no port; placeholder for index 5 of the base-port lists
STANDBY_BASEPORT_COO = 8500
STANDBY_BASEPORT_DATA = 45000
STANDBY_BASEPORT_ETCD = 2380

# Instance type constants (only for CN/DN)
INSTANCE_TYPE_UNDEFINED = -1
MASTER_INSTANCE = 0
STANDBY_INSTANCE = 1
DUMMY_STANDBY_INSTANCE = 2
CASCADE_STANDBY = 3

# Display names for instance types (DUMMY_STANDBY_INSTANCE has no entry)
DICT_INSTANCE = {
    MASTER_INSTANCE: "primary",
    STANDBY_INSTANCE: "standby",
    CASCADE_STANDBY: "cascade_standby"
}

# Instance number constants
MIRROR_COUNT_REPLICATION_MAX = 9  # max number of replication for CLUSTER_TYPE_SINGLE_PRIMARY_MULTI_STANDBY
AZPRIORITY_MAX = 10  # max number of azPriority for CLUSTER_TYPE_SINGLE_PRIMARY_MULTI_STANDBY
AZPRIORITY_MIN = 1   # min number of azPriority for CLUSTER_TYPE_SINGLE_PRIMARY_MULTI_STANDBY
PORT_STEP_SIZE = 20  # DB port set step size for CLUSTER_TYPE_SINGLE_PRIMARY_MULTI_STANDBY

MIRROR_ID_AGENT = -3

# Cluster type constants
CLUSTER_TYPE_SINGLE = "single"
CLUSTER_TYPE_SINGLE_PRIMARY_MULTI_STANDBY = "single-primary-multi-standby"
CLUSTER_TYPE_SINGLE_INST = "single-inst"

# Default config version constants (used by gs_upgrade)
BIN_CONFIG_VERSION = 2
BIN_CONFIG_VERSION_SINGLE = 101
BIN_CONFIG_VERSION_SINGLE_PRIMARY_MULTI_STANDBY = 201
BIN_CONFIG_VERSION_SINGLE_INST = 301

# Other constants
PAGE_SIZE = 8192
MAX_IP_NUM = 3
CONFIG_IP_NUM = 1

# Length constants
NODE_ID_LEN = 2
INSTANCE_ID_LEN = 8
SPACE_LEN = 1
STATE_LEN = 17
# NOTE(review): "SEPERATOR" is misspelled ("SEPARATOR"); the name is left
# unchanged because its users are not visible here.
SEPERATOR_LEN = 1
IP_LEN = 16
PORT_LEN = 10

# Key directory mode.
# NOTE(review): this is the decimal integer 700, not the octal mode 0o700
# (compare DIRECTORY_PERMISSION above). Presumably it is formatted into a
# "chmod 700"-style command string; passing it to os.chmod() would set the
# wrong permission bits - confirm against the callers.
KEY_DIRECTORY_MODE = 700

# Network type constants
g_networkType = 0  # The default network type is single plane

# Global variables
global_cls_query_rst = {}  # global param to cache gr_om query instance result
xmlRootNode = None

# Log constants
# MAXLOGFILESIZE is the size (16 MiB) at which CMLog tries to switch to a new file.
# The LOG_* levels are compared against CMLog.expectLevel: a message is
# emitted when its level is >= expectLevel. LOG_WARNING (2.1) sits between
# LOG_INFO and LOG_ERROR.
MAXLOGFILESIZE = 16 * 1024 * 1024
LOG_DEBUG = 1
LOG_INFO = 2
LOG_WARNING = 2.1
LOG_ERROR = 3
LOG_FATAL = 4

class PackageVerificationError(Exception):
    """
    Custom exception: installation package verification failed.

    Raised with detailed error information when the package type does not
    match, the package is corrupted, or the package does not exist.

    Attributes:
        message: human-readable description of the failure.
        error_code: one of the ERR_PKG_* codes below (0 when unspecified).
        pkg_path: path of the offending package, if known.
        expected_type: package type the caller asked for, if known.
        detected_type: package type that was actually detected, if known.
    """

    # Error codes
    ERR_PKG_NOT_FOUND = 101        # package file does not exist
    ERR_PKG_EXTRACT_FAILED = 102   # package extraction failed
    ERR_PKG_TYPE_MISMATCH = 103    # package type mismatch
    ERR_PKG_INVALID_GR = 104       # invalid GR package
    ERR_PKG_INVALID_CM = 105       # invalid CM package
    ERR_PKG_UNKNOWN_TYPE = 106     # unknown package type

    def __init__(self, message, error_code=0, pkg_path=None, expected_type=None, detected_type=None):
        self.message = message
        self.error_code = error_code
        self.pkg_path = pkg_path
        self.expected_type = expected_type
        self.detected_type = detected_type
        super().__init__(self.message)

    def __str__(self):
        return f"[ERROR {self.error_code}] {self.message}"

    def get_user_friendly_message(self):
        """
        Build a multi-line, user-friendly error report.

        The report contains the error code and message, whatever context
        (path / expected type / detected type) is available, and a list of
        suggestions chosen by error code.

        Returns:
            str: the report, lines joined with a newline.
        """
        lines = [
            "=" * 60,
            "PACKAGE VERIFICATION FAILED",
            "=" * 60,
            "",
            f"Error Code: {self.error_code}",
            f"Error Message: {self.message}",
        ]
        if self.pkg_path:
            lines.append(f"Package Path: {self.pkg_path}")
        if self.expected_type:
            lines.append(f"Expected Type: {self.expected_type.upper()}")
        if self.detected_type:
            lines.append(f"Detected Type: {self.detected_type.upper()}")

        lines.extend([
            "",
            "Suggestions:",
        ])

        if self.error_code == self.ERR_PKG_NOT_FOUND:
            lines.append("  - Check if the package file exists")
            lines.append("  - Verify the file path is correct")
        elif self.error_code == self.ERR_PKG_EXTRACT_FAILED:
            lines.append("  - The package may be corrupted, try re-downloading")
            lines.append("  - Check if you have read permission for the file")
        elif self.error_code == self.ERR_PKG_TYPE_MISMATCH:
            # expected_type / detected_type are optional; only emit the
            # suggestions whose placeholders can actually be filled in, so
            # that a missing type neither crashes (.upper() on None) nor
            # prints "None" to the user.
            if self.expected_type and self.detected_type:
                lines.append(f"  - You specified --{self.expected_type}pkg but provided a {self.detected_type.upper()} package")
            if self.detected_type:
                lines.append(f"  - Use --{self.detected_type}pkg for this package instead")
            if self.expected_type:
                lines.append(f"  - Or provide the correct {self.expected_type.upper()} package")
            if not self.expected_type and not self.detected_type:
                lines.append("  - Provide a package that matches the package option you specified")
        elif self.error_code in (self.ERR_PKG_INVALID_GR, self.ERR_PKG_INVALID_CM):
            lines.append("  - The package does not contain expected signature files")
            lines.append("  - Ensure you are using the correct oGRecorder installation package")

        lines.extend([
            "",
            "=" * 60,
        ])

        return "\n".join(lines)


class CMLog:
    """
    Class to handle log file operations.

    Messages are appended to a time-stamped file named
    ``<prefix>-YYYY-mm-DD_HHMMSS<suffix>`` inside ``logPath``. The newest
    matching file is reused when one exists; a new one is created otherwise,
    and when the current file reaches MAXLOGFILESIZE (non-root users only).
    """

    # Timestamp embedded in log file names and its rendered length.
    _TIME_FORMAT = "%Y-%m-%d_%H%M%S"
    _TIME_LEN = 17
    # Patterns used to mask the value that follows "-W" (all messages) and
    # "-A" (messages mentioning gs_redis) before a message is written.
    # Compiled once instead of on every write.
    _ARG_W_RE = re.compile(r'-W[ ]*[^ ]*[ ]*')
    _ARG_A_RE = re.compile(r'-A[ ]*[^ ]*[ ]*')

    def __init__(self, logPath, module, prefix, suffix=".log", expectLevel=LOG_DEBUG, traceId=None):
        """
        Initialize the CMLog instance and open (or create) the log file.

        Prints an error and exits the process with status 1 when the log
        path is not a directory or the log file cannot be opened.

        Args:
            logPath (str): Path to the log directory
            module (str): Module name for logging
            prefix (str): Prefix for log file names
            suffix (str): Suffix for log file names (default: ".log")
            expectLevel (int): Expected log level (default: LOG_DEBUG)
            traceId (str): Trace ID for logging (default: None)
        """
        self.logFile = ""
        self.expectLevel = expectLevel
        self.moduleName = module
        self.fp = None
        self.size = 0
        self.suffix = suffix
        self.prefix = prefix
        self.logPath = logPath
        self.pid = os.getpid()
        self.step = 0
        self.lock = threading.Lock()
        self.tmpFile = None
        self.ignoreErr = False
        self.traceId = traceId

        try:
            if not os.path.isdir(logPath):
                print(ErrorCode.GAUSS_502["GAUSS_50211"] % logPath)
                sys.exit(1)
            # check log path
            # NOTE(review): the isdir() check above already exits for a
            # missing path, so this branch can never run. It is kept
            # unchanged until the intended behaviour (create the directory
            # vs. reject it) is confirmed.
            if not os.path.exists(logPath):
                try:
                    os.makedirs(logPath, 0o700)
                except Exception as e:
                    raise Exception(ErrorCode.GAUSS_502["GAUSS_50208"] %
                                    logPath + " Error:\n%s" % str(e))
            # create new log file
            self.__openLogFile()
        except Exception as ex:
            print(str(ex))
            sys.exit(1)

    def __checkLink(self):
        """
        function: refuse to use a log file that is a symbolic link
        input : NA
        output: NA (raises Exception when self.logFile is a link)
        """
        if os.path.islink(self.logFile):
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50206"] % self.logFile)

    def __checkLogFileExist(self):
        """
        check whether log file exists, if exist, get log file name
        log file name format:
            prefix-YYYY-mm-DD_HHMMSSsuffix = cm_install-YYYY-mm-DD_HHMMSS.log

        On success self.logFile is set to the newest matching file (largest
        name, i.e. latest timestamp) and True is returned; otherwise False.
        The directory is listed with os.listdir instead of a shell
        "ls | grep" pipeline, so unusual characters in the path, prefix or
        suffix cannot be interpreted by a shell and no temporary list file
        is written into the log directory.
        """
        try:
            entries = os.listdir(self.logPath)
        except OSError:
            return False

        namePrefix = self.prefix + "-"
        # "-" plus the timestamp sits between prefix and suffix
        expectedLen = len(self.prefix) + len(self.suffix) + 1 + self._TIME_LEN
        filenameList = []
        for filename in entries:
            if not filename.startswith(namePrefix):
                continue
            # names containing more than one "." are not ours
            if len(filename.split(".")) > 2:
                continue
            (existedPrefix, existedSuffix) = os.path.splitext(filename)
            if existedSuffix != self.suffix:
                continue
            if len(filename) != expectedLen:
                continue
            # check log file name
            if self.__isValidDate(existedPrefix[-self._TIME_LEN:]):
                filenameList.append(filename)

        if len(filenameList) == 0:
            return False
        # get logFile
        self.logFile = os.path.join(self.logPath, max(filenameList))
        self.__checkLink()
        return True

    def __openLogFile(self, rotate=False):
        """
        function: open log file
        input : rotate - True when called because the current file is full;
                an existing file that has already reached MAXLOGFILESIZE is
                then skipped and a new file is created instead. Without this
                the newest existing file (the full one) would be reopened and
                the log would never switch files.
        output: NA
        """
        try:
            if self.__checkLogFileExist():
                full = rotate and os.path.getsize(self.logFile) >= MAXLOGFILESIZE
                if not full:
                    self.fp = open(self.logFile, "a")
                    return
            # get current time
            currentTime = time.strftime(self._TIME_FORMAT)
            # init log file
            self.logFile = os.path.join(self.logPath, self.prefix + "-" + currentTime + self.suffix)
            # Re-create the log file to add a retry 3 times mechanism,
            # in order to call concurrently between multiple processes
            retryTimes = 3
            for _ in range(retryTimes + 1):
                (status, output) = self.__createLogFile()
                if status == 0:
                    break
                time.sleep(1)
            else:
                raise Exception(output)
            # open log file
            self.__checkLink()
            self.fp = open(self.logFile, "a")
        except Exception as e:
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50206"]
                            % self.logFile + " Error:\n%s" % str(e)) from e

    def __createLogFile(self):
        """
        function: create log file
        input : NA
        output: (status, output) - (0, "") on success, (1, error text) on failure
        """
        try:
            if (not os.path.exists(self.logFile)):
                os.mknod(self.logFile)
            return (0, "")
        except Exception as e:
            return (1, str(e))

    def __isValidDate(self, datastr):
        """
        function: Judge if date valid
        input : datastr
        output: bool - True when datastr parses as YYYY-mm-DD_HHMMSS
        """
        try:
            time.strptime(datastr, self._TIME_FORMAT)
            return True
        except Exception:
            return False

    def closeLog(self):
        """
        function: Function to close log file
        input : NA
        output: NA
        """
        try:
            if self.fp:
                self.fp.flush()
                self.fp.close()
                self.fp = None
        except Exception as ex:
            if self.fp:
                self.fp.close()
            raise Exception(str(ex)) from ex

    # print the flow message to console window and log file
    # AddInfo: constant represent step constant, addStep represent step
    # plus, None represent no step
    def log(self, msg, stepFlag=""):
        """
        function:print the flow message to console window and log file
        input:   msg,stepFlag
        control: when stepFlag="", the OM background log does not display
        step information.
                 when stepFlag="addStep", the OM background log step will
                 add 1.
                 when stepFlag="constant", the OM background log step
                 defaults to the current step.
        output:  NA
        """
        if (LOG_INFO >= self.expectLevel):
            print(msg)
            self.__writeLog("LOG", msg, stepFlag)

    # print the flow message to log file only
    def debug(self, msg, stepFlag=""):
        """
        function:print the flow message to log file only
        input:   msg,stepFlag
        control: when stepFlag="", the OM background log does not display
        step information.
                 when stepFlag="addStep", the OM background log step will
                 add 1.
                 when stepFlag="constant", the OM background log step
                 defaults to the current step.
        output:  NA
        """
        if (LOG_DEBUG >= self.expectLevel):
            self.__writeLog("DEBUG", msg, stepFlag)

    # print the warning message to console window and log file
    def warn(self, msg, stepFlag=""):
        """
        function:print the warning message to console window and log file
        input:   msg,stepFlag
        control: when stepFlag="", the OM background log does not display
        step information.
                 when stepFlag="addStep", the OM background log step will
                 add 1.
                 when stepFlag="constant", the OM background log step
                 defaults to the current step.
        output:  NA
        """
        if (LOG_WARNING >= self.expectLevel):
            print(msg)
            self.__writeLog("WARNING", msg, stepFlag)

    # print the error message to console window and log file
    def error(self, msg):
        """
        function: print the error message to console window and log file
        input : msg
        output: NA
        """
        if (LOG_ERROR >= self.expectLevel):
            print(msg)
            self.__writeLog("ERROR", msg)

    # print the error message to console window and log file,then exit
    def logExit(self, msg):
        """
        function: print the error message to console window and log file,
        then exit
        input : msg
        output: NA (the process exits with status 1)
        """
        if (LOG_FATAL >= self.expectLevel):
            print(msg)
            try:
                self.__writeLog("ERROR", msg)
            except Exception as ex:
                print(str(ex))
        self.closeLog()
        sys.exit(1)

    def Step(self, stepFlag):
        """
        function: return Step number info
        input: stepFlag - "constant" returns the current step; any other
               value increments the step first
        output: step number
        """
        if (stepFlag == "constant"):
            return self.step
        else:
            self.step = self.step + 1
            return self.step

    def __getLogFileLine(self):
        """
        Return "file(function:line)" for the code that called the public
        logging method.

        The frame is found by walking three frames up (this method ->
        __writeLog -> log/debug/warn/error/logExit -> caller), so this must
        be called directly from __writeLog.
        """
        f = sys._getframe().f_back.f_back.f_back
        return "%s(%s:%s)" % (os.path.basename(f.f_code.co_filename), f.f_code.co_name,
                              str(f.f_lineno))

    def __writeLog(self, level, msg, stepFlag=""):
        """
        function: Write log to file
        input: level, msg, stepFlag
        output: NA
        Does nothing when no log file is open. Errors are swallowed when
        self.ignoreErr is set, otherwise re-raised as Exception.
        """
        if self.fp is None:
            return

        try:
            # the context manager releases the lock on every exit path
            with self.lock:
                # if the log file does not exits, create it
                if (not os.path.exists(self.logFile)):
                    # drop the handle of the vanished file before reopening
                    self.closeLog()
                    self.__openLogFile()
                else:
                    logPer = oct(os.stat(self.logFile).st_mode)[-3:]
                    self.__checkLink()
                    if not logPer == "600":
                        os.chmod(self.logFile, 0o600)
                # check if need switch to an new log file
                self.size = os.path.getsize(self.logFile)
                if self.size >= MAXLOGFILESIZE and os.getuid() != 0:
                    self.closeLog()
                    self.__openLogFile(rotate=True)

                msg = self._ARG_W_RE.sub('-W *** ', str(msg))

                if msg.find("gs_redis") >= 0:
                    msg = self._ARG_A_RE.sub('-A *** ', str(msg))

                strTime = datetime.datetime.now()
                fileLine = self.__getLogFileLine()
                if stepFlag == "":
                    if self.traceId:
                        print("[%s][%s][%d][%s][%s]:%s"
                              % (self.traceId, strTime, self.pid, self.moduleName,
                                 level, msg), file=self.fp)
                    else:
                        print("[%s][%d][%s][%s]:%s" % (
                            strTime, self.pid, self.moduleName, level, msg),
                            file=self.fp)
                else:
                    stepnum = self.Step(stepFlag)
                    print("[%s][%d][%s][%s][%s][Step%d]:%s" % (
                        strTime, self.pid, fileLine, self.moduleName, level, stepnum, msg),
                          file=self.fp)
                self.fp.flush()
        except Exception as ex:
            if self.ignoreErr:
                return
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50205"]
                            % (("log file %s") % self.logFile) +
                            " Error:\n%s" % str(ex))

    @staticmethod
    def exitWithError(msg, status=1):
        """
        function: Exit with error message
        input: msg, status=1
        output: NA (msg is written to stderr, then the process exits)
        """
        sys.stderr.write("%s\n" % msg)
        sys.exit(status)

    @staticmethod
    def printMessage(msg):
        """
        function: Print the String message
        input: msg
        output: NA (msg is written to stdout)
        """
        sys.stdout.write("%s\n" % msg)

class peerInstanceInfo():
    """
    Holds the data path, HA IPs, HA port and role of up to two peer instances
    """

    def __init__(self):
        # first peer
        self.peerDataPath = ""
        self.peerHAIPs = []
        self.peerHAPort = 0
        self.peerRole = 0
        # second peer
        self.peer2DataPath = ""
        self.peer2HAIPs = []
        self.peer2HAPort = 0
        self.peer2Role = 0

    def __str__(self):
        """
        Build a comma-separated "key=value" description.

        The first peer's path, port and role are always shown; each field
        of the second peer is appended only when it is set (truthy).
        """
        fields = [
            "peerDataPath=%s" % self.peerDataPath,
            "peerHAPort=%d" % self.peerHAPort,
            "peerRole=%d" % self.peerRole,
        ]
        if self.peer2DataPath:
            fields.append("peer2DataPath=%s" % self.peer2DataPath)
        if self.peer2HAPort:
            fields.append("peer2HAPort=%d" % self.peer2HAPort)
        if self.peer2Role:
            fields.append("peer2Role=%d" % self.peer2Role)
        return ",".join(fields)

class instanceInfo():
    """
    Instance information

    Describes one instance (cm_server, gtm, etcd, cn, dn or cm_agent) on a
    node: its identity, addresses, ports, directories and role.
    Instances order by instanceId.
    """

    def __init__(self, instId=0, mirrorId=0):
        """
        Constructor

        Args:
            instId: instance id
            mirrorId: mirror id
        """
        # instance id
        self.instanceId = instId
        self.mirrorId = mirrorId
        # host name
        self.hostname = ""
        # listen ip
        self.listenIps = []
        # ha ip
        self.haIps = []
        # float ip
        self.float_ips = []
        # port
        self.port = 0
        # It's pool port for coordinator, and ha port for other instance
        self.haPort = 0
        # data directory
        self.datadir = ""
        # xlog directory
        self.xlogdir = ""
        # ssd data directory
        self.ssdDir = ""
        # instance type
        self.instanceType = INSTANCE_TYPE_UNDEFINED
        # instance role
        self.instanceRole = INSTANCE_ROLE_UNDEFINED
        # instance rack info
        self.rack = ""
        # oltp zpaxos sub instance type
        self.subInstanceType = INSTANCE_ROLE_UNDEFINED

        self.level = 1
        # we use port and haPort to save peerPort/clientPort for etcd
        # datanode: use haPort to save replport
        # repl port
        self.replport = 0
        # sctp port
        self.sctpPort = 0
        # control port
        self.controlPort = 0
        # az name
        self.azName = ""
        self.azPriority = 0
        self.clusterName = ""
        # peer port etcd
        self.peerPort = 0
        # client port etcd
        self.clientPort = 0
        # instance name
        self.name = ""
        # DB state Normal or other, use to save dynamic info
        self.state = ""
        # get staticConnections from database,use to save dynamic info
        self.staticConnections = ""
        # DB role such as Primary, Standby
        self.localRole = ""
        self.peerInstanceInfos = []
        self.syncNum = -1
        self.syncNumFirst = ""
        self.cascadeRole = "off"
        # dcf_data_path
        self.dcf_data_path = ""

    def __cmp__(self, target):
        """
        Type compare

        Python 2 style three-way comparison: returns 1 when target is not a
        comparable instanceInfo, otherwise the difference of the instance
        ids. Python 3 never calls this implicitly (ordering goes through
        __lt__ below); it is kept for code that calls it explicitly.
        """
        if (type(self) != type(target)):
            return 1
        if (not isinstance(target, instanceInfo)):
            return 1
        if (not hasattr(target, "instanceId")):
            return 1
        else:
            return self.instanceId - target.instanceId

    def __lt__(self, target):
        """
        Order instances by instanceId so that sorted()/list.sort()/min()/max()
        work under Python 3, where __cmp__ is ignored.
        """
        if not isinstance(target, instanceInfo) or not hasattr(target, "instanceId"):
            return NotImplemented
        return self.instanceId < target.instanceId

    def __str__(self):
        """
        Construct a printable string representation of a instanceInfo

        The core fields are always present; optional fields are appended
        only when they hold a non-empty / non-zero value.
        """
        ret = "InstanceId=%s,MirrorId=%s,Host=%s,Port=%s,DataDir=%s," \
              "XlogDir=%s,SsdDir=%s,InstanceType=%s,Role=%s,ListenIps=%s," \
              "HaIps=%s" % (
                  self.instanceId, self.mirrorId, self.hostname, self.port,
                  self.datadir, self.xlogdir, self.ssdDir, self.instanceType,
                  self.instanceRole, self.listenIps, self.haIps)
        if self.rack:
            ret += ",rack=%s" % self.rack
        if self.replport:
            ret += ",replport=%s" % self.replport
        if self.sctpPort:
            ret += ",sctpPort=%s" % self.sctpPort
        if self.controlPort:
            ret += ",controlPort=%s" % self.controlPort
        if self.azName:
            ret += ",azName=%s" % self.azName
        # hasattr guard: the attribute may be absent on objects that were
        # not built through __init__
        if hasattr(self, 'azPriority') and self.azPriority > 0:
            ret += ",azPriority=%s" % self.azPriority
        if self.clusterName:
            ret += ",clusterName=%s" % self.clusterName
        if self.peerPort:
            ret += ",peerPort=%s" % self.peerPort
        if self.clientPort:
            ret += ",clientPort=%s" % self.clientPort
        if self.name:
            ret += ",name=%s" % self.name
        return ret

class dbNodeInfo():
    """
    Instance info on a node

    Holds the node's addresses and the lists of instances (cm_server,
    cm_agent, gtm, cn, dn, etcd) placed on it, and assigns default ports to
    newly appended instances. Nodes order by id.
    """

    def __init__(self, nodeId=0, name=""):
        """
        Constructor

        Args:
            nodeId: node id
            name: node (host) name
        """
        # node id
        self.id = nodeId
        # node name
        self.name = name
        self.backIps = []
        self.virtualIp = []
        self.sshIps = []
        # instance number
        self.cmsNum = 0
        self.cooNum = 0
        self.dataNum = 0
        self.gtmNum = 0
        self.etcdNum = 0
        # cm_servers instance
        self.cmservers = []
        # cn instance
        self.coordinators = []
        # DB instance
        self.datanodes = []
        # gtm instance
        self.gtms = []
        # cm_agent instance
        self.cmagents = []
        # etcd instance
        self.etcds = []
        # cm_server/cm_agent data directory
        self.cmDataDir = ""
        self.dummyStandbyBasePort = 0
        # Default base ports, indexed by INSTANCE_ROLE_* value because
        # __assignNewInstancePort reads them as basePorts[instRole].
        # The entries therefore have to follow the role numbering
        # (etcd=2, cn=3, dn=4), not the cms/gtm/cn/dn/etcd listing order.
        self.masterBasePorts = [
            MASTER_BASEPORT_CMS,      # INSTANCE_ROLE_CMSERVER (0)
            MASTER_BASEPORT_GTM,      # INSTANCE_ROLE_GTM (1)
            MASTER_BASEPORT_ETCD,     # INSTANCE_ROLE_ETCD (2)
            MASTER_BASEPORT_COO,      # INSTANCE_ROLE_COODINATOR (3)
            MASTER_BASEPORT_DATA,     # INSTANCE_ROLE_DATANODE (4)
            MASTER_BASEPORT_CMAGENT,  # INSTANCE_ROLE_CMAGENT (5)
        ]
        self.standbyBasePorts = [
            STANDBY_BASEPORT_CMS,      # INSTANCE_ROLE_CMSERVER (0)
            STANDBY_BASEPORT_GTM,      # INSTANCE_ROLE_GTM (1)
            STANDBY_BASEPORT_ETCD,     # INSTANCE_ROLE_ETCD (2)
            STANDBY_BASEPORT_COO,      # INSTANCE_ROLE_COODINATOR (3)
            STANDBY_BASEPORT_DATA,     # INSTANCE_ROLE_DATANODE (4)
            STANDBY_BASEPORT_CMAGENT,  # INSTANCE_ROLE_CMAGENT (5)
        ]
        # azName
        self.azName = ""
        self.azPriority = 1
        self.standbyDnNum = 0
        self.dummyStandbyDnNum = 0
        self.cascadeRole = "off"
        self.ssh_port = 0
        # gr
        self.grIp1 = ""
        self.listen_addr = ""
        self.listen_port = ""

    def __cmp__(self, target):
        """
        Type compare

        Python 2 style three-way comparison: returns 1 when target is not a
        comparable dbNodeInfo, otherwise the difference of the node ids.
        Python 3 never calls this implicitly (ordering goes through __lt__
        below); it is kept for code that calls it explicitly.
        """
        if (type(self) != type(target)):
            return 1
        if (not isinstance(target, dbNodeInfo)):
            return 1
        if (not hasattr(target, "id")):
            return 1
        else:
            return self.id - target.id

    def __lt__(self, target):
        """
        Order nodes by id so that sorted()/list.sort()/min()/max() work
        under Python 3, where __cmp__ is ignored.
        """
        if not isinstance(target, dbNodeInfo) or not hasattr(target, "id"):
            return NotImplemented
        return self.id < target.id

    def __str__(self):
        """
        function : Construct a printable string representation of a dbNodeInfo
        input : NA
        output : String - a header line followed by one line per instance,
                 in the order cm_server, cm_agent, gtm, cn, dn, etcd
        """
        parts = ["HostName=%s,backIps=%s" % (self.name, self.backIps)]
        for instList in (self.cmservers, self.cmagents, self.gtms,
                         self.coordinators, self.datanodes, self.etcds):
            parts.extend(str(inst) for inst in instList)
        return "\n".join(parts)

    def setDnDetailNum(self):
        """
        Refresh dataNum / standbyDnNum / dummyStandbyDnNum from the
        instance types currently present in self.datanodes.
        """
        self.dataNum = self.getDnNum(MASTER_INSTANCE)
        self.standbyDnNum = self.getDnNum(STANDBY_INSTANCE)
        self.dummyStandbyDnNum = self.getDnNum(DUMMY_STANDBY_INSTANCE)

    def getDnNum(self, dntype):
        """
        function: get DB num
        input: dntype - instance type to count
        output:dn num - number of datanodes whose instanceType equals dntype
        """
        return sum(1 for dnInst in self.datanodes
                   if dnInst.instanceType == dntype)

    def appendInstance(self, instId, mirrorId, instRole, instanceType,
                       listenIps=None, haIps=None, datadir="", ssddir="", level=1,
                       xlogdir="", syncNum=-1, syncNumFirst="", dcf_data="", float_ips=None):
        """
        function : Create an instance and add it to the list matching its
                   role (cm_server, gtm, dn, cm_agent or etcd), assigning
                   default ports where the role has them.
        input : instance id, mirror id, role (INSTANCE_ROLE_*), type, and
                optional addresses / directories / datanode settings.
                An empty listenIps/haIps list means "use the node's backIps";
                None leaves the instance's list empty.
        output : NA
        raises : Exception when datadir conflicts with an existing instance
                 or is empty for a role that requires one.
        NOTE: a role without a branch below (e.g. INSTANCE_ROLE_COODINATOR)
        is built but not stored in any list.
        """
        if not self.__checkDataDir(datadir, instRole):
            raise Exception(ErrorCode.GAUSS_516["GAUSS_51638"] % \
                            self.name + " Data directory[%s] is "
                                        "conflicting." % datadir)

        dbInst = instanceInfo(instId, mirrorId)
        dbInst.hostname = self.name
        dbInst.datadir = os.path.realpath(datadir)

        # only datanodes have an xlog directory
        if (instRole == INSTANCE_ROLE_DATANODE):
            dbInst.xlogdir = xlogdir
        else:
            dbInst.xlogdir = ""
        dbInst.instanceType = instanceType
        dbInst.instanceRole = instRole
        if (listenIps is not None):
            if (len(listenIps) == 0):
                dbInst.listenIps = self.backIps[:]
            else:
                dbInst.listenIps = listenIps[:]

        if float_ips is not None:
            if len(float_ips) != 0:
                dbInst.float_ips = float_ips

        if (haIps is not None):
            if (len(haIps) == 0):
                dbInst.haIps = self.backIps[:]
            else:
                dbInst.haIps = haIps[:]
        # cm_server
        if (instRole == INSTANCE_ROLE_CMSERVER):
            dbInst.datadir = os.path.join(self.cmDataDir, "cm_server")
            dbInst.port = self.__assignNewInstancePort(self.cmservers,
                                                       instRole, instanceType)
            dbInst.level = level
            dbInst.haPort = dbInst.port + 1
            self.cmservers.append(dbInst)
        # gtm
        elif (instRole == INSTANCE_ROLE_GTM):
            dbInst.port = self.__assignNewInstancePort(self.gtms, instRole,
                                                       instanceType)
            dbInst.haPort = dbInst.port + 1
            self.gtms.append(dbInst)
        # dn
        elif (instRole == INSTANCE_ROLE_DATANODE):
            dbInst.port = self.__assignNewInstancePort(self.datanodes,
                                                       instRole, instanceType)
            dbInst.haPort = dbInst.port + 1
            dbInst.ssdDir = ssddir
            dbInst.syncNum = syncNum
            dbInst.syncNumFirst = syncNumFirst
            dbInst.dcf_data_path = dcf_data
            self.datanodes.append(dbInst)
        # cm_agent
        elif (instRole == INSTANCE_ROLE_CMAGENT):
            dbInst.datadir = os.path.join(self.cmDataDir, "cm_agent")
            self.cmagents.append(dbInst)
        # etcd
        elif (instRole == INSTANCE_ROLE_ETCD):
            dbInst.port = self.__assignNewInstancePort(self.etcds, instRole,
                                                       instanceType)
            dbInst.haPort = self.__assignNewInstancePort(self.etcds, instRole,
                                                         STANDBY_INSTANCE)
            self.etcds.append(dbInst)

    def __checkDataDir(self, datadir, instRole):
        """
        function : Check that datadir is usable for a new instance, i.e. no
                   instance already on this node uses the same directory.
        input : String,String
        output : boolean - False on conflict. An empty datadir is accepted
                 only for cm_server and cm_agent.
        """
        if (datadir == ""):
            return (
                    instRole == INSTANCE_ROLE_CMSERVER or instRole ==
                    INSTANCE_ROLE_CMAGENT)
        checkPathVaild(datadir)
        # appendInstance stores os.path.realpath(datadir), so compare against
        # the resolved form as well as the raw one; otherwise the same
        # directory given with a trailing slash, "..", or through a symlink
        # would escape conflict detection.
        candidates = {datadir, os.path.realpath(datadir)}
        for instList in (self.cmservers, self.coordinators, self.datanodes,
                         self.gtms, self.etcds, self.cmagents):
            for inst in instList:
                if inst.datadir in candidates:
                    return False

        return True

    def __assignNewInstancePort(self, instList, instRole, instanceType):
        """
        function : Assign a new port for the instance
        input : instList - instances of this role already on the node,
                instRole - INSTANCE_ROLE_* value (index into the base-port
                lists), instanceType - instance type constant
        output : int - base port for the role/type plus 2 for every instance
                 of the same type already in instList. For
                 INSTANCE_TYPE_UNDEFINED the master base port is returned
                 unchanged. Types without a branch below (e.g.
                 CASCADE_STANDBY) start from 0.
        """
        port = 0
        # master instance
        if instanceType == MASTER_INSTANCE:
            port = self.masterBasePorts[instRole]
        # standby instance
        elif instanceType == STANDBY_INSTANCE:
            port = self.standbyBasePorts[instRole]
        # DB dummy standby instance
        elif instanceType == DUMMY_STANDBY_INSTANCE:
            port = self.dummyStandbyBasePort
        # cn and cm_agent instance
        elif instanceType == INSTANCE_TYPE_UNDEFINED:
            port = self.masterBasePorts[instRole]
            return port
        # each instance takes two consecutive ports (port and port + 1)
        for inst in instList:
            if (inst.instanceType == instanceType):
                port += 2

        return port
class dbClusterInfo():
    """
    Cluster info
    """

    def __init__(self, checkSctpPort=False):
        """
        Constructor

        Creates an empty cluster description. checkSctpPort is stored
        unchanged; every other attribute starts empty or zero.
        """
        # identity
        self.name = ""
        self.clusterName = ""
        self.clusterType = CLUSTER_TYPE_SINGLE_INST
        self.checkSctpPort = checkSctpPort
        self.azName = ""
        self.cascadeRole = "off"

        # paths
        self.appPath = ""
        self.appSoftPath = ""
        self.logPath = ""
        self.xmlFile = ""
        self.toolPath = ""
        self.agentPath = ""
        self.agentLogPath = ""
        self.tmpPath = ""
        self.managerPath = ""

        # nodes and network
        self.dbNodes = []
        self.newNodes = []
        self.clusterRings = []
        self.cmsFloatIp = ""
        self.float_ips = {}
        self.ips_type = []
        self.cluster_back_ip1s = []
        self.node_num = 0
        self.replicaNum = 0

        # id generators; the list holds one starting id per instance kind
        self.__newInstanceId = [BASE_ID_CMSERVER, BASE_ID_GTM, BASE_ID_ETCD,
                                BASE_ID_COORDINATOR, BASE_ID_DATANODE,
                                BASE_ID_CMAGENT]
        self.__newDummyStandbyId = BASE_ID_DUMMYDATANODE
        self.__newMirrorId = 0
        self.__newGroupId = 0

        # header fields of the static configuration file
        self.version = 0
        self.installTime = 0
        self.localNodeId = 0
        self.nodeCount = 0

        # cluster properties
        self.replicationCount = 0
        self.quorumMode = ""
        self.gtmcount = 0
        self.etcdcount = 0
        self.cmscount = 0
        self.cncount = 0
        self.masterDnCount = 0
        self.standbyDnCount = 0
        self.dummyStandbyDnCount = 0
        self.cm_state_list = []

        # add for dcf
        self.enable_dcf = ""
        self.dcf_config = ""

        # oGRecorder
        self.gr_nodes_list = ""
        self.grPath = ""
        self.caPath = ""
        self.wormPath = ""
        self.installPath = ""
        # oGRecorder rest
        self.rest_nodes_list = ""

    def __str__(self):
        """
        function : Construct a printable string representation of a
        dbClusterInfo
        input : NA
        output : String
        """
        retStr = "ClusterName=%s,AppPath=%s,LogPath=%s,ClusterType=%s" % \
                 (self.name, self.appPath, self.logPath, self.clusterType)

        for dbNode in self.dbNodes:
            retStr += "\n%s" % str(dbNode)

        return retStr

    def initLogger(self, mode):
        """
        function : Create the gr_om log directory below self.logPath and
                   attach a CMLog logger to this object as self.logger
        input : mode  passed through to CMLog unchanged
        output : NA
        """
        logPath = os.path.join(self.logPath, "om", "gr_om")
        # exist_ok avoids the check-then-create race when several processes
        # initialise their logger at the same time
        os.makedirs(logPath, exist_ok=True)
        self.logger = CMLog(logPath, "gr_om", mode)

    def check_conf_cm_state(self):
        """
        Save CM instance state
        """
        if not self.cm_state_list:
            return True
        state_result = self.cm_state_list[0]
        for state in self.cm_state_list[1:]:
            state_result ^= state
            if state_result:
                return False
        return True

    def __getDnRole(self, instanceType):
        """
        function : Translate an instance type into its one-letter role mark
        input : Int
        output : String ("P", "S", "C" or "R"; "" for any other type)
        """
        roleMarks = (
            (MASTER_INSTANCE, "P"),
            (STANDBY_INSTANCE, "S"),
            (CASCADE_STANDBY, "C"),
            (DUMMY_STANDBY_INSTANCE, "R"),
        )
        for knownType, mark in roleMarks:
            if instanceType == knownType:
                return mark
        return ""

    def __getDnInstanceNum(self):
        dnInsNum = 0
        for dbNode in self.dbNodes:
            dnInsNum += len(dbNode.datanodes)
        return dnInsNum

    def __fprintContent(self, content, fileName):
        """
        function : Write content to stdout and, when fileName is not empty,
                   append it to that file as well
        input : String,String
        output : NA
        """
        appendToFile = (fileName != "")
        if appendToFile:
            createFileInSafeMode(fileName)
            with open(fileName, "a") as outFile:
                outFile.write(content)
                outFile.flush()
        sys.stdout.write(content)

    def __checkOsUser(self, user):
        """
        function : Check os user
        input : String
        output : NA
        """
        try:
            user = pwd.getpwnam(user).pw_gid
        except Exception as e:
            raise Exception(ErrorCode.GAUSS_503["GAUSS_50300"] % user)

    def __getStaticConfigFilePath(self, user, ignore_err=False):
        """
        function : Locate the static configuration file of the given user.
                   Candidates are tried in this order: the file below the
                   real path of GAUSSHOME, the file below
                   GAUSSHOME_<commitid>, then the backup file below
                   GAUSSHOME_<commitid>.
        input : user        String
                ignore_err  return '' instead of raising when nothing exists
        output : String
        """
        gaussHome = self.__getEnvironmentParameterValue("GAUSSHOME", user)
        if gaussHome == "":
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % \
                            ("installation path of designated user [%s]" %
                             user))
        checkPathVaild(gaussHome)

        # if under upgrade, and use chose strategy, we may get a wrong path,
        # so the realpath of gausshome is tried first
        versionedHome = gaussHome + "_" + VersionInfo.getCommitid()
        realHomeConfig = "%s/bin/cluster_static_config" % os.path.realpath(
            gaussHome)
        candidates = (
            realHomeConfig,
            "%s/bin/cluster_static_config" % versionedHome,
            "%s/bin/cluster_static_config_bak" % versionedHome,
        )
        for candidate in candidates:
            if os.path.exists(candidate):
                return candidate

        if ignore_err:
            return ''
        raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % \
                        ("static configuration file [%s] of "
                         "designated user [%s]" % (realHomeConfig, user)))

    def get_staic_conf_path(self, user, ignore_err=False):
        """
        function : Public wrapper that returns the static configuration
                   file path found by __getStaticConfigFilePath
        input : String, boolean
        output : String
        """
        staticConfigPath = self.__getStaticConfigFilePath(user, ignore_err)
        return staticConfigPath


    def __getEnvironmentParameterValue(self, environmentParameterName, user):
        """
        function : Read one environment variable of the given user by
        sourcing the user's profile in a shell and echoing the variable.
        !!!!Do not call this function in preinstall.py script.
        because we determine if we are using env separate version by the
        value of MPPDB_ENV_SEPARATE_PATH
        input : String,String
        output : String (first line printed by the shell command)
        """
        # pick the profile to source: the separate env file when it is
        # configured, the user's bashrc otherwise
        mpprcFile = getEnvironmentParameterValue('MPPDB_ENV_SEPARATE_PATH', user)
        if mpprcFile is None or mpprcFile == "":
            userProfile = "~/.bashrc"
        else:
            userProfile = mpprcFile.replace("\\", "\\\\").replace('"',
                                                                  '\\"\\"')
            checkPathVaild(userProfile)

        # build shell command; root has to switch to the target user first
        runningAsRoot = (os.getuid() == 0)
        if runningAsRoot:
            cmd = "su - %s -c 'source %s;echo $%s' 2>/dev/null" % (
                user, userProfile, environmentParameterName)
        else:
            cmd = "source %s;echo $%s 2>/dev/null" % (userProfile,
                                                      environmentParameterName)

        status, output = subprocess.getstatusoutput(cmd)
        if status != 0:
            raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"]
                            % cmd + " Error: \n%s" % output)

        # only the first output line is taken as the value
        firstLine = output.split("\n")[0]
        checkPathVaild(firstLine)
        return firstLine

    def __getStatusByOM(self, user):
        """
        function : Run "gr_om -t status --detail" in the environment of the
        given user and return its output.
        !!!!Do not call this function in preinstall.py script.
        because we determine if we are using env separate version by the
        value of MPPDB_ENV_SEPARATE_PATH
        input : String
        output : list of the non-empty output lines
        raises : Exception when the command exits with a non-zero status
        """
        # get mpprc file
        mpprcFile = EnvUtil.getEnvironmentParameterValue('MPPDB_ENV_SEPARATE_PATH', user)
        if mpprcFile is not None and mpprcFile != "":
            mpprcFile = mpprcFile.replace("\\", "\\\\").replace('"', '\\"\\"')
            checkPathVaild(mpprcFile)
            userProfile = mpprcFile
        else:
            userProfile = ClusterConstants.BASHRC
        # build shell command
        if os.getuid() == 0:
            # the quoted -c argument must be closed again, otherwise the
            # shell rejects the command as an unterminated string
            cmd = "su - %s -c 'source %s;gr_om -t status --detail'" % (
                user, userProfile)
        else:
            cmd = "source %s;gr_om -t status --detail" % (userProfile)
        (status, output) = subprocess.getstatusoutput(cmd)
        if status != 0:
            raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"]
                            % cmd + " Error: \n%s" % output)
        return [line for line in output.strip().split("\n") if line]

    def __readStaticConfigFile(self, staticConfigFile, user, isLCCluster=False,
                               ignoreLocalEnv=False):
        """
        function : read cluster information from static configuration file
        input : staticConfigFile  path of the binary static config file
                user              OS user whose environment is queried
                isLCCluster       passed on to the per-node unpacker
                ignoreLocalEnv    skip the checks for empty env values
        output : NA (fills name, appPath, logPath, version, installTime,
                 localNodeId, nodeCount, clusterType and dbNodes)
        raises : Exception carrying the message of whatever step failed
        """
        fp = None
        try:
            # get env parameter
            gauss_env = self.__getEnvironmentParameterValue("GAUSS_ENV", user)
            self.name = self.__getEnvironmentParameterValue("GS_CLUSTER_NAME",
                                                            user)
            self.appPath = self.__getEnvironmentParameterValue("GAUSSHOME",
                                                               user)
            logPathWithUser = self.__getEnvironmentParameterValue("GAUSSLOG",
                                                                  user)

            if not ignoreLocalEnv:
                if gauss_env == "2" and self.name == "":
                    raise Exception(ErrorCode.GAUSS_503["GAUSS_50300"]
                                    % ("cluster name of designated user"
                                       " [%s]" % user))
                if self.appPath == "":
                    raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % \
                                    ("installation path of designated user "
                                     "[%s]" % user))
                if logPathWithUser == "":
                    raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % \
                                    ("log path of designated user [%s]" %
                                     user))

            splitMark = "/%s" % user
            # set log path without user
            # find the path from right to left
            markIndex = logPathWithUser.rfind(splitMark)
            if markIndex >= 0:
                self.logPath = logPathWithUser[0:markIndex]
            else:
                # rfind() returns -1 when the marker is missing; slicing
                # with it would silently cut off the last character
                self.logPath = logPathWithUser
            staticConfigFilePath = os.path.split(staticConfigFile)[0]
            versionFile = os.path.join(
                staticConfigFilePath, "upgrade_version")
            version, number, commitid = VersionInfo.get_version_info(
                versionFile)
            try:
                # read static_config_file
                fp = open(staticConfigFile, "rb")
                # the file header starts with an 8-byte crc up to version
                # number 92.200 and with a 4-byte crc afterwards
                if float(number) <= 92.200:
                    info = fp.read(32)
                    (crc, lenth, version, currenttime, nodeNum,
                     localNodeId) = struct.unpack("=qIIqiI", info)
                else:
                    info = fp.read(28)
                    (crc, lenth, version, currenttime, nodeNum,
                     localNodeId) = struct.unpack("=IIIqiI", info)
                self.version = version
                self.installTime = currenttime
                self.localNodeId = localNodeId
                self.nodeCount = nodeNum
            except Exception as e:
                raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"]
                                % staticConfigFile + " Error:\n" + str(e))
            # the version read from the file header selects the cluster
            # type and must match the version expected for that type
            if version <= 100:
                raise Exception(ErrorCode.GAUSS_516["GAUSS_51637"] % \
                                ("cluster static config version[%s]" % version,
                                 "the new version[%s]" % BIN_CONFIG_VERSION))
            elif version >= 101 and version <= 200:
                self.clusterType = CLUSTER_TYPE_SINGLE
                if BIN_CONFIG_VERSION_SINGLE != version:
                    raise Exception(ErrorCode.GAUSS_516["GAUSS_51637"] % \
                                    ("cluster static config version[%s]"
                                     % version, "the new version[%s]"
                                     % BIN_CONFIG_VERSION_SINGLE))
            elif version >= 201 and version <= 300:
                # single primary multi standy
                self.clusterType = CLUSTER_TYPE_SINGLE_PRIMARY_MULTI_STANDBY
                if (BIN_CONFIG_VERSION_SINGLE_PRIMARY_MULTI_STANDBY
                        != version):
                    raise Exception(
                        ErrorCode.GAUSS_516["GAUSS_51637"]
                        % ("cluster static config version[%s]" % version,
                           "the new version[%s]"
                           % BIN_CONFIG_VERSION_SINGLE_PRIMARY_MULTI_STANDBY))
            elif version >= 301 and version <= 400:
                # single inst
                self.clusterType = CLUSTER_TYPE_SINGLE_INST
                if BIN_CONFIG_VERSION_SINGLE_INST != version:
                    raise Exception(ErrorCode.GAUSS_516["GAUSS_51637"] % \
                                    ("cluster static config version[%s]"
                                     % version, "the new version[%s]"
                                     % BIN_CONFIG_VERSION_SINGLE_INST))

            self.dbNodes = []
            try:
                for i in range(nodeNum):
                    # every node record starts on the next page boundary
                    offset = (fp.tell() // PAGE_SIZE + 1) * PAGE_SIZE
                    fp.seek(offset)
                    dbNode = self.__unPackNodeInfo(fp, number, isLCCluster)
                    self.dbNodes.append(dbNode)
            except Exception as e:
                raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] % \
                                staticConfigFile + " Error:\nThe content is "
                                                   "not correct.")
        except Exception as e:
            raise Exception(str(e))
        finally:
            # single place that releases the file, on success and on error
            if fp:
                fp.close()

    def __unPackNodeInfo(self, fp, number, isLCCluster=False):
        """
        function : unpack one node record from the static config file
        input : fp           file opened in binary mode, positioned at the
                             start of the record
                number       version number; selects the record header
                             layout
                isLCCluster  when True the cm_server, cm_agent, gtm and
                             etcd sections are not read
        output : Object (dbNodeInfo)
        """
        # up to version number 92.200 the record starts with an 8-byte crc,
        # later versions use a 4-byte crc
        if float(number) <= 92.200:
            headerFormat, headerSize = "=qI64s", 76
        else:
            headerFormat, headerSize = "=II64s", 72
        (_crc, nodeId, rawNodeName) = struct.unpack(headerFormat,
                                                    fp.read(headerSize))
        dbNode = dbNodeInfo(nodeId, rawNodeName.decode().strip('\x00'))

        (rawAzName, azPriority) = struct.unpack("=64sI", fp.read(68))
        dbNode.azName = rawAzName.decode().strip('\x00')
        dbNode.azPriority = azPriority

        # get backIps
        self.__unPackIps(fp, dbNode.backIps)
        # get sshIps
        self.__unPackIps(fp, dbNode.sshIps)
        if not isLCCluster:
            # get cm_server information
            self.__unPackCmsInfo(fp, dbNode)
            # get cm_agent information
            self.__unpackAgentInfo(fp, dbNode)
            # get gtm information
            self.__unpackGtmInfo(fp, dbNode)
            # these 404 bytes are read but not interpreted here
            fp.read(404)
        # get DB information
        self.__unpackDataNode(fp, dbNode)
        if not isLCCluster:
            # get etcd information
            self.__unpackEtcdInfo(fp, dbNode)
            # these 8 bytes are read but not interpreted here
            fp.read(8)
        # set DB azName for OLAP
        for dnInst in dbNode.datanodes:
            dnInst.azName = dbNode.azName
            dnInst.azPriority = dbNode.azPriority

        return dbNode

    def __unpackEtcdInfo(self, fp, dbNode):
        """
        function : unpack the etcd section of a node record; the instance
                   is attached to the node only when the stored etcd count
                   is 1
        input : file,Object
        output : NA
        """
        etcd = instanceInfo()
        etcd.instanceRole = INSTANCE_ROLE_ETCD
        etcd.hostname = dbNode.name
        etcd.instanceType = INSTANCE_TYPE_UNDEFINED

        # the stored host name is unpacked but not used
        (etcdNum, etcd.instanceId, etcd.mirrorId, _etcdHostname,
         rawDataDir) = struct.unpack("=IIi64s1024s", fp.read(1100))
        etcd.datadir = rawDataDir.decode().strip('\x00')

        self.__unPackIps(fp, etcd.listenIps)
        (etcd.port,) = struct.unpack("=I", fp.read(4))
        self.__unPackIps(fp, etcd.haIps)
        (etcd.haPort,) = struct.unpack("=I", fp.read(4))

        if etcdNum != 1:
            dbNode.etcdNum = 0
            dbNode.etcds = []
            return
        dbNode.etcdNum = 1
        dbNode.etcds.append(etcd)
        self.etcdcount += 1

    def __unPackIps(self, fp, ips):
        """
        function : Unpack an IP list. The list is stored as a 4-byte count
                   followed by MAX_IP_NUM slots of 128 bytes each; the used
                   slots are appended to ips and the unused ones skipped.
        input : fp   file opened in binary mode
                ips  list that receives the decoded addresses
        output : NA
        raises : Exception when the stored count does not fit the slots
        """
        (ipCount,) = struct.unpack("=i", fp.read(4))
        # A count outside 0..MAX_IP_NUM would make the padding read below
        # negative (read to end of file) or too long, leaving the stream
        # misaligned for everything that follows.
        if ipCount < 0 or ipCount > MAX_IP_NUM:
            raise Exception("Invalid IP count [%s] in static configuration "
                            "file." % ipCount)
        for _ in range(ipCount):
            (rawIp,) = struct.unpack("=128s", fp.read(128))
            ips.append(str(rawIp.decode().strip('\x00').strip()))
        # skip the unused slots
        fp.read(128 * (MAX_IP_NUM - ipCount))

    def __unPackCmsInfo(self, fp, dbNode):
        """
        function : Unpack the cm_server section of a node record. Also
                   sets dbNode.cmDataDir and self.cmsFloatIp. The instance
                   is attached to the node only when its id is non-zero.
        input : file Object
        output : NA
        """
        cmServer = instanceInfo()
        cmServer.instanceRole = INSTANCE_ROLE_CMSERVER
        cmServer.hostname = dbNode.name

        (cmServer.instanceId, cmServer.mirrorId, rawCmDataDir,
         cmServer.level, rawFloatIp) = struct.unpack("=II1024sI128s",
                                                     fp.read(1164))
        dbNode.cmDataDir = rawCmDataDir.decode().strip('\x00')
        self.cmsFloatIp = rawFloatIp.decode().strip('\x00')
        cmServer.datadir = "%s/cm_server" % dbNode.cmDataDir

        self.__unPackIps(fp, cmServer.listenIps)
        (cmServer.port,) = struct.unpack("=I", fp.read(4))
        self.__unPackIps(fp, cmServer.haIps)
        (cmServer.haPort, cmServer.instanceType) = struct.unpack(
            "=II", fp.read(8))

        if cmServer.instanceType == MASTER_INSTANCE:
            dbNode.cmsNum = 1
        elif cmServer.instanceType == STANDBY_INSTANCE:
            dbNode.cmsNum = 0
        else:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51204"]
                            % ("CMServer", cmServer.instanceType))

        # the rest of the section is read but not interpreted here
        fp.read(4 + 128 * MAX_IP_NUM + 4)

        if cmServer.instanceId:
            dbNode.cmservers.append(cmServer)
            self.cmscount += 1
        else:
            dbNode.cmservers = []

    def __unpackAgentInfo(self, fp, dbNode):
        """
        function : Unpack the cm_agent section of a node record. Must run
                   after __unPackCmsInfo, because the agent data directory
                   is derived from dbNode.cmDataDir, which is set there.
        input : file Object
        output : NA
        """
        agent = instanceInfo()
        agent.instanceRole = INSTANCE_ROLE_CMAGENT
        agent.hostname = dbNode.name
        agent.instanceType = INSTANCE_TYPE_UNDEFINED

        (agent.instanceId, agent.mirrorId) = struct.unpack("=Ii", fp.read(8))
        self.__unPackIps(fp, agent.listenIps)
        agent.datadir = "%s/cm_agent" % dbNode.cmDataDir

        dbNode.cmagents.append(agent)

    def __unpackGtmInfo(self, fp, dbNode):
        """
        function : Unpack the gtm section of a node record; the instance is
                   attached to the node only when the stored gtm count is 1
        input : file Object
        output : NA
        """
        gtm = instanceInfo()
        gtm.instanceRole = INSTANCE_ROLE_GTM
        gtm.hostname = dbNode.name

        (gtm.instanceId, gtm.mirrorId, gtmNum,
         rawDataDir) = struct.unpack("=III1024s", fp.read(1036))
        gtm.datadir = rawDataDir.decode().strip('\x00')

        self.__unPackIps(fp, gtm.listenIps)
        (gtm.port, gtm.instanceType) = struct.unpack("=II", fp.read(8))
        if gtm.instanceType == MASTER_INSTANCE:
            dbNode.gtmNum = 1
        elif gtm.instanceType == STANDBY_INSTANCE:
            dbNode.gtmNum = 0
        else:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51204"] % (
                "GTM", gtm.instanceType))

        self.__unPackIps(fp, gtm.haIps)
        (gtm.haPort,) = struct.unpack("=I", fp.read(4))
        # the rest of the section is read but not interpreted here
        fp.read(1024 + 4 + 128 * MAX_IP_NUM + 4)

        if gtmNum == 1:
            dbNode.gtms.append(gtm)
            self.gtmcount += 1
        else:
            dbNode.gtms = []

    def __unpackDataNode(self, fp, dbNode):
        """
        function : Unpack the datanode section of a node record. Resets
                   dbNode.datanodes, appends one instance per stored
                   datanode and counts the master instances in
                   dbNode.dataNum.
        input : file Object
        output : NA
        raises : Exception when an instance has an unknown instance type
        """
        info = fp.read(4)
        (dataNodeNums,) = struct.unpack("=I", info)
        dbNode.dataNum = 0

        dbNode.datanodes = []
        for i in range(dataNodeNums):
            dnInst = instanceInfo()
            dnInst.instanceRole = INSTANCE_ROLE_DATANODE
            dnInst.hostname = dbNode.name
            # In the upgrade scenario, there are two different read methods
            # for static config file.
            # First, use the new read mode, and judge that if the new read
            # mode is not correct,
            # then rollback by fp.seek(), and exchange its(xlogdir) value
            # with ssddir.
            info = fp.read(2056)
            (dnInst.instanceId, dnInst.mirrorId, rawDataDir,
             rawXlogDir) = struct.unpack("=II1024s1024s", info)
            dnInst.datadir = rawDataDir.decode().strip('\x00')
            dnInst.xlogdir = rawXlogDir.decode().strip('\x00')

            info = fp.read(1024)
            (rawSsdDir,) = struct.unpack("=1024s", info)
            dnInst.ssdDir = rawSsdDir.decode().strip('\x00')
            # if notsetXlog,ssdDir should not be null.use by upgrade.
            # A non-empty value that is not an absolute path means the
            # record has no xlogdir field: rewind the 1024 bytes just read
            # and treat the second path as ssdDir.
            if dnInst.ssdDir != "" and dnInst.ssdDir[0] != '/':
                fp.seek(fp.tell() - 1024)
                dnInst.ssdDir = dnInst.xlogdir
                dnInst.xlogdir = ""

            self.__unPackIps(fp, dnInst.listenIps)
            info = fp.read(8)
            (dnInst.port, dnInst.instanceType) = struct.unpack("=II", info)
            if dnInst.instanceType == MASTER_INSTANCE:
                dbNode.dataNum += 1
            elif dnInst.instanceType not in [STANDBY_INSTANCE,
                                             DUMMY_STANDBY_INSTANCE,
                                             CASCADE_STANDBY]:
                raise Exception(ErrorCode.GAUSS_512["GAUSS_51204"]
                                % ("DN", dnInst.instanceType))
            self.__unPackIps(fp, dnInst.haIps)
            info = fp.read(4)
            (dnInst.haPort,) = struct.unpack("=I", info)
            if (self.clusterType ==
                    CLUSTER_TYPE_SINGLE_PRIMARY_MULTI_STANDBY or
                    self.clusterType == CLUSTER_TYPE_SINGLE_INST):
                # one peer entry per possible standby
                maxStandbyCount = MIRROR_COUNT_REPLICATION_MAX - 1
                for j in range(maxStandbyCount):
                    peerDbInst = peerInstanceInfo()
                    info = fp.read(1024)
                    (rawPeerPath,) = struct.unpack("=1024s", info)
                    peerDbInst.peerDataPath = \
                        rawPeerPath.decode().strip('\x00')
                    self.__unPackIps(fp, peerDbInst.peerHAIPs)
                    info = fp.read(8)
                    (peerDbInst.peerHAPort,
                     peerDbInst.peerRole) = struct.unpack("=II", info)
                    dnInst.peerInstanceInfos.append(peerDbInst)
            else:
                # a single peer entry that carries two peers
                peerDbInst = peerInstanceInfo()
                info = fp.read(1024)
                (rawPeerPath,) = struct.unpack("=1024s", info)
                peerDbInst.peerDataPath = \
                    rawPeerPath.decode().strip('\x00')
                self.__unPackIps(fp, peerDbInst.peerHAIPs)
                info = fp.read(8)
                (peerDbInst.peerHAPort, peerDbInst.peerRole) = \
                    struct.unpack("=II", info)
                info = fp.read(1024)
                (rawPeer2Path,) = struct.unpack("=1024s", info)
                # decode the bytes of the second peer; decoding the first
                # peer's already-decoded path would raise on str and would
                # be the wrong value anyway
                peerDbInst.peerData2Path = \
                    rawPeer2Path.decode().strip('\x00')
                self.__unPackIps(fp, peerDbInst.peer2HAIPs)
                info = fp.read(8)
                (peerDbInst.peer2HAPort, peerDbInst.peer2Role) = \
                    struct.unpack("=II", info)
                dnInst.peerInstanceInfos.append(peerDbInst)
            dbNode.datanodes.append(dnInst)

    def setInstId(self, instList, nodeIdInstIdDict, newNodeId, newInstId):
        """
        function : Assign instance ids to the instances of one node. When
                   the node is present in nodeIdInstIdDict every instance
                   gets the id recorded there; otherwise the instances get
                   consecutive ids starting at newInstId.
        input : instList          instance list
                nodeIdInstIdDict  node id and instance id dict
                newNodeId         new node id
                newInstId         next free instance id
        output : the next free instance id after the assignment
        """
        # the lookup does not depend on the instance, so do it once instead
        # of rebuilding the key list for every instance
        if newNodeId in nodeIdInstIdDict:
            knownInstId = nodeIdInstIdDict[newNodeId]
            for inst in instList:
                inst.instanceId = knownInstId
            return newInstId

        # the New agent instance
        for inst in instList:
            inst.instanceId = newInstId
            newInstId += 1
        return newInstId

    def refreshInstIdByInstType(self, oldNodesList, newNodesList,
                                instType="cmagent"):
        """
        """
        nodeIdInstanceIdDict = {}
        # get the node id and cmagent/cmserver/gtm/etcd/cn instance id dict
        for oldNode in oldNodesList:
            if (instType == "cmagent"):
                for cmaInst in oldNode.cmagents:
                    nodeIdInstanceIdDict[oldNode.id] = cmaInst.instanceId
            elif (instType == "cmserver"):
                for cmsInst in oldNode.cmservers:
                    nodeIdInstanceIdDict[oldNode.id] = cmsInst.instanceId
            elif (instType == "gtm"):
                for gtmInst in oldNode.gtms:
                    nodeIdInstanceIdDict[oldNode.id] = gtmInst.instanceId
            elif (instType == "etcd"):
                for etcdInst in oldNode.etcds:
                    nodeIdInstanceIdDict[oldNode.id] = etcdInst.instanceId
            elif (instType == "cn"):
                for cnInst in oldNode.coordinators:
                    # warm-standby: the number of nodes is same,so refrush
                    # by id
                    # addcn out cluster:refrush by id or nodename
                    # addcn in cluster:refrush by id or nodename
                    # deletecn out cluster:refrush by nodename
                    # deletecn in cluster:refrush by id or nodename
                    # expand:refrush by id or nodename
                    # shink in tail:refrush by id or nodename
                    # shink in mid:refrush by nodename
                    if (len(oldNodesList) == len(newNodesList)):
                        nodeIdInstanceIdDict[oldNode.id] = cnInst.instanceId
                    else:
                        nodeIdInstanceIdDict[oldNode.name] = cnInst.instanceId

        # sort instance id lists and set newInstId = the max ID num + 1
        instIDList = list(nodeIdInstanceIdDict.values())
        instIDList.sort()
        if (len(instIDList) > 0):
            newInstId = instIDList[-1] + 1
        else:
            newInstId = 1

        # refresh instance id by oldClusterInfo
        for newNode in newNodesList:
            if (instType == "cmagent"):
                newInstId = self.setInstId(newNode.cmagents,
                                           nodeIdInstanceIdDict, newNode.id,
                                           newInstId)
            elif (instType == "cmserver"):
                newInstId = self.setInstId(newNode.cmservers,
                                           nodeIdInstanceIdDict, newNode.id,
                                           newInstId)
            elif (instType == "gtm"):
                newInstId = self.setInstId(newNode.gtms, nodeIdInstanceIdDict,
                                           newNode.id, newInstId)
            elif (instType == "etcd"):
                newInstId = self.setInstId(newNode.etcds, nodeIdInstanceIdDict,
                                           newNode.id, newInstId)
            elif (instType == "cn"):
                if (len(oldNodesList) == len(newNodesList)):
                    newInstId = self.setInstId(newNode.coordinators,
                                               nodeIdInstanceIdDict,
                                               newNode.id, newInstId)
                else:
                    newInstId = self.setInstId(newNode.coordinators,
                                               nodeIdInstanceIdDict,
                                               newNode.name, newInstId)

    def __check_cms_config(self):
        """
        Check cm_server config
        """

        if self.cmscount > 0 and len(self.dbNodes) < 2:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51236"] +
                                "The cm_server instance can be "
                                "configured only on three or more nodes.")
        if 0 < self.cmscount < 2:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51236"] +
                            "At least three cm_server instances are required.")


    def checkXMLFile(self, xml_file):
        """
        function : reject XML files that contain a DTD
        input : String
        output : NA
        """
        # Security requirement: a file containing "<!DOCTYPE" or
        # "<!ENTITY" is refused with "File have security risks."
        forbidden_markers = ("<!DOCTYPE", "<!ENTITY")
        try:
            with open(xml_file, "r", encoding='utf-8') as xml_fp:
                all_lines = xml_fp.readlines()
            has_dtd = any(marker in line
                          for line in all_lines
                          for marker in forbidden_markers)
            if has_dtd:
                raise Exception("File have security risks.")
        except Exception as e:
            raise Exception(str(e))

    def initParserXMLFile(self, xml_file_path):
        """
        function : Init parser xml file
        input : String
        output : Object
        """
        try:
            # check xml for security requirements
            self.checkXMLFile(xml_file_path)
            dom_tree = ETree.parse(xml_file_path)
            root_node = dom_tree.getroot()
        except Exception as e:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51236"] + " Error: \n%s." % str(e))

        return root_node
    @staticmethod
    def readOneClusterConfigItem(root_node, para_name, input_element_name,
                                 nodeName=""):
        """
        function : Look up one configuration item in the parsed XML, either
                   in the <CLUSTER> section or in the <DEVICE> entry of one
                   node inside <DEVICELIST>.
        input : root_node: root element of the parsed XML configuration
                para_name: name of the PARAM to look up
                input_element_name: lookup level, "cluster" or "node"
                                    (compared case-insensitively)
                nodeName: node whose DEVICE entry is searched; required
                          when the level is "node"
        output : (status, value) as returned by the lookup helper; status is
                 0 when the item was found, otherwise 2 with an empty value
        raises : Exception when the level is unknown, when a node level
                 lookup has no node name, or when the needed XML section
                 is missing
        """
        level = input_element_name.upper()

        # A node level lookup cannot work without knowing which node to use.
        if level == 'NODE' and nodeName == "":
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51201"] +
                            " Need node name for node configuration level.")

        if level == 'CLUSTER':
            cluster_sections = root_node.findall('CLUSTER')
            if not cluster_sections:
                raise Exception(ErrorCode.GAUSS_512["GAUSS_51200"] % level)
            # Only the first <CLUSTER> section is consulted.
            cluster_params = cluster_sections[0].findall('PARAM')
            (status, value) = dbClusterInfo.findParamInCluster(para_name, cluster_params)
            return (status, value)

        if level == 'NODE':
            device_lists = root_node.findall('DEVICELIST')
            if not device_lists:
                raise Exception(ErrorCode.GAUSS_512["GAUSS_51200"] % 'DEVICELIST')
            # Only the first <DEVICELIST> section is consulted.
            devices = device_lists[0].findall('DEVICE')
            (status, value) = dbClusterInfo.findParamByName(nodeName, para_name, devices)
            return (status, value)

        # Neither "cluster" nor "node": report the unknown level.
        raise Exception(ErrorCode.GAUSS_512["GAUSS_51200"] % level)

    def findParamInCluster(para_name, node_array):
        """
        function : Find parameter in cluster
        input : String,[]
        output : String,String
        """
        return_value = ""
        return_status = 2
        for node in node_array:
            name = node.attrib['name']
            if name == para_name:
                return_status = 0
                return_value = str(node.attrib['value'])
                break
        return return_status, return_value

    def findParamByName(node_name, para_name, device_node):
        """
        function : Find parameter by name
        input : String,String,Object
        output : String,String
        """
        return_value = ""
        return_status = 2
        for dev in device_node:
            param_list = dev.findall('PARAM')
            for param in param_list:
                thisname = param.attrib['name']
                if thisname == 'name':
                    value = param.attrib['value']
                    if node_name == value:
                        for param in param_list:
                            name = param.attrib['name']
                            if name == para_name:
                                return_status = 0
                                return_value = str(param.attrib['value'].strip())
                                if ((name.find("Dir") > 0 or name.find(
                                        "dataNode") == 0) and return_value != ""):
                                    return_value = os.path.normpath(return_value)
        return return_status, return_value

    def initFromXml(self, xmlFile):
        """
        function : Initialise this cluster object from an XML configuration
                   file: parse the file, read the global cluster settings
                   and the node information, then run the consistency
                   checks.
        input : xmlFile: path of the XML configuration file
        output : the value returned by the instance port/IP check
        raises : Exception when the file does not exist or cannot be parsed;
                 errors raised by the reader and checker helpers propagate
        """
        if not os.path.exists(xmlFile):
            raise Exception("XML configuration file not exist")

        self.xmlFile = xmlFile

        # Export the path through the environment so that the readcluster
        # command can pick it up.
        os.putenv(ENV_CLUSTERCONFIG, xmlFile)

        # The parsed tree is stored in a module level global that the
        # reader helpers consult.
        global xmlRootNode
        try:
            xmlRootNode = self.initParserXMLFile(xmlFile)
        except Exception as parse_error:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51234"]
                            % xmlFile + " Error:\n%s" % str(parse_error))

        self.__readClusterGlobalInfo()
        self.get_cluster_back_ip1s()

        # clusterType is optional here (errors are ignored); only the
        # "single-inst-one-node" value selects the single node reader.
        cluster_type = self.__read_and_check_config_item(
            xmlRootNode, "clusterType", "cluster", True)
        if cluster_type == "single-inst-one-node":
            self.__read_cluster_node_info_for_one()
        else:
            self.__readClusterNodeInfo()

        self.__checkAZForSingleInst()
        ip_port_map = self.__checkInstancePortandIP()
        self.__check_cms_config()
        return ip_port_map

    def __read_cluster_node_info_for_one(self):
        """
        function : Read the node information of a cluster whose nodeNames
                   item names a single host: create the node, one datanode
                   instance per configured datanode and one CM agent
                   instance.
        input : NA
        output : NA (self.dbNodes is rebuilt)
        raises : Exception when nodeNames does not match the DEVICE list or
                 when the first cluster back IP is invalid
        """
        # The status is not checked; only the value is used.
        (_, node_name) = self.readOneClusterConfigItem(xmlRootNode, "nodeNames", "cluster")

        # The DEVICE list must contain exactly this one host name.
        if [node_name] != self.__getAllHostnamesFromDEVICELIST():
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51236"] +
                            " The number of nodeNames and DEVICE are not same.")

        if not self.__isIpValid(self.cluster_back_ip1s[0]):
            raise Exception(ErrorCode.GAUSS_506["GAUSS_50603"] +
                            "The IP address is: %s." % self.cluster_back_ip1s[0] +
                            " Please check it.")

        # Basic node information: name, IPs, instance counts and so on.
        self.dbNodes = []
        db_node = dbNodeInfo(1, node_name)
        self.__readNodeBasicInfo(db_node, [node_name])
        self.dbNodes.append(db_node)

        # One datanode instance per configured datanode; the first one is
        # the master, every following one a standby.
        for index in range(db_node.dataNum):
            sequence = index + 1
            datanode = instanceInfo(BASE_ID_DATANODE + index, 1)
            datanode.hostname = node_name
            datanode.datadir = self.__readNodeStrValue(node_name, "dataNode%s" % sequence)
            if index == 0:
                datanode.instanceType = MASTER_INSTANCE
            else:
                datanode.instanceType = STANDBY_INSTANCE
            datanode.instanceRole = INSTANCE_ROLE_DATANODE
            # Listen and HA addresses are independent copies of the node's back IPs.
            datanode.listenIps = db_node.backIps[:]
            datanode.haIps = db_node.backIps[:]
            datanode.port = self.__readNodeIntValue(node_name, "dataPortBase%s" % sequence)
            datanode.haPort = datanode.port + 1
            datanode.ssdDir = ""
            datanode.syncNum = -1
            datanode.syncNumFirst = ""
            datanode.azName = db_node.azName
            datanode.azPriority = db_node.azPriority
            datanode.ssh_port = db_node.ssh_port
            self.dbNodes[0].datanodes.append(datanode)

        self.dbNodes[0].appendInstance(1, MIRROR_ID_AGENT, INSTANCE_ROLE_CMAGENT,
                                       INSTANCE_TYPE_UNDEFINED, [], None, "")

    def getClusterNodeNames(self):
        """
        function : Collect the names of all nodes of the cluster.
        input : NA
        output : list of node names, in the order of self.dbNodes
        """
        node_names = []
        for node in self.dbNodes:
            node_names.append(node.name)
        return node_names

    def getClusterNodeIds(self):
        """
        function : Collect the ids of all nodes of the cluster.
        input : NA
        output : list of node ids, in the order of self.dbNodes
        """
        node_ids = []
        for node in self.dbNodes:
            node_ids.append(node.id)
        return node_ids

    def get_cluster_node_ssh_port_by_ip(self, ip):
        """
        function : Get the ssh port of the node whose first ssh IP equals
                   the given address.
        input : ip: address compared with sshIps[0] of every node
        output : ssh_port of the first matching node, 22 when none matches
        """
        matching_ports = (node.ssh_port for node in self.dbNodes
                          if ip == node.sshIps[0])
        return next(matching_ports, 22)
            
    def get_cluster_nodes_ssh_port_by_ips(self, ips):
        """
        function : Get the ssh ports for a list of IPs.
        input : ips: iterable of addresses
        output : dict mapping every given ip to the port returned by
                 get_cluster_node_ssh_port_by_ip for it
        """
        return {ip: self.get_cluster_node_ssh_port_by_ip(ip) for ip in ips}

    def getdataNodeInstanceType(self, nodeId=-1):
        """
        function : Get the instance type of the first datanode of the node
                   with the given id.
        input : nodeId: id of the node to look at
        output : instanceType of that node's first datanode; None when no
                 node with that id has a datanode
        """
        for node in self.dbNodes:
            if nodeId != node.id:
                continue
            # Only the first datanode of the node is considered; a matching
            # node without datanodes does not end the search.
            first_datanode = next(iter(node.datanodes), None)
            if first_datanode is not None:
                return first_datanode.instanceType
        return None

    def getHostNameByNodeId(self, nodeId=-1):
        """
        function : Get the name of the node with the given id.
        input : nodeId: id of the node to look for
        output : name of the first node with that id, None when there is
                 no such node
        """
        return next((node.name for node in self.dbNodes if nodeId == node.id), None)

    def get_cluster_directory_dict(self):
        """
        function : Get the directories of the cluster as a nested dict.
        input : NA
        output : dict with the keys "appPath" and "logPath" (one element
                 lists) plus one entry per node name. A node entry holds
                 the data directory of the first cm_server, cm_agent, gtm,
                 coordinator ("cn") and etcd instance, the lists of all
                 datanode ("dn") data, xlog and ssd directories and an
                 always empty "ssd" item.
        """
        cluster_dirs = {"appPath": [self.appPath], "logPath": [self.logPath]}

        for db_node in self.dbNodes:
            # Coordinator: only the first instance is reported.
            cn_entry = {"data_dir": "", "ssd": ""}
            if db_node.coordinators:
                first_cn = db_node.coordinators[0]
                cn_entry["data_dir"] = first_cn.datadir
                if first_cn.ssdDir:
                    cn_entry["ssd"] = first_cn.ssdDir

            # Datanodes: every instance is reported; empty ssd dirs are skipped.
            dn_entry = {"data_dir": [], "ssd": [], "xlog_dir": []}
            for datanode in db_node.datanodes:
                dn_entry["data_dir"].append(datanode.datadir)
                dn_entry["xlog_dir"].append(datanode.xlogdir)
                if datanode.ssdDir:
                    dn_entry["ssd"].append(datanode.ssdDir)

            cluster_dirs[db_node.name] = {
                "cm_server": db_node.cmservers[0].datadir if db_node.cmservers else "",
                "cm_agent": db_node.cmagents[0].datadir if db_node.cmagents else "",
                "cn": cn_entry,
                "dn": dn_entry,
                "gtm": db_node.gtms[0].datadir if db_node.gtms else "",
                "etcd": db_node.etcds[0].datadir if db_node.etcds else "",
                "ssd": "",
            }
        return cluster_dirs


    def getClusterDirectorys(self, hostName="", ignore=True):
        """
        function : Get the directories of the cluster as flat lists.
        input : hostName: when not empty, only the node with this name is
                          reported
                ignore: when true, the "logPath" entry is included
        output : dict with "appPath" (and optionally "logPath") as one
                 element lists, plus one list per reported node holding its
                 cm data dir followed by the directories of its cm_server,
                 cm_agent, gtm, coordinator, datanode and etcd instances
        """
        clusterDirs = {"appPath": [self.appPath]}
        if ignore:
            clusterDirs["logPath"] = [self.logPath]

        for node in self.dbNodes:
            # An empty hostName selects every node.
            if hostName != "" and hostName != node.name:
                continue

            directories = [node.cmDataDir]
            directories.extend(inst.datadir for inst in node.cmservers)
            directories.extend(inst.datadir for inst in node.cmagents)
            directories.extend(inst.datadir for inst in node.gtms)
            for coordinator in node.coordinators:
                directories.append(coordinator.datadir)
                if len(coordinator.ssdDir) != 0:
                    directories.append(coordinator.ssdDir)
            for datanode in node.datanodes:
                directories.append(datanode.datadir)
                directories.append(datanode.xlogdir)
                if len(datanode.ssdDir) != 0:
                    directories.append(datanode.ssdDir)
            directories.extend(inst.datadir for inst in node.etcds)

            clusterDirs[node.name] = directories
        return clusterDirs

    def getDbNodeByName(self, name):
        """
        function : Get a node by its name.
        input : name: node name to look for
        output : the first node of self.dbNodes with that name, None when
                 there is no such node
        """
        return next((node for node in self.dbNodes if node.name == name), None)
    
    def setDbNodeInstancdIdByName(self, name, instanceId):
        """
        function : Set the instance id of the first datanode of the first
                   node that has the given name and at least one datanode.
        input : name: node name to look for
                instanceId: id to store in that datanode
        output : NA (nothing is changed when no such node exists)
        """
        target = next((node for node in self.dbNodes
                       if node.name == name and len(node.datanodes) > 0), None)
        if target is not None:
            target.datanodes[0].instanceId = instanceId

    def getPeerInstance(self, dbInst):
        """
        function : Get the peer datanode instances of the given instance.
        input : dbInst: instance whose peers are wanted
        output : list of all datanodes in the cluster that share dbInst's
                 mirrorId but have a different instanceId, sorted by
                 instanceId
        """
        peers = [candidate
                 for node in self.dbNodes
                 for candidate in node.datanodes
                 if candidate.mirrorId == dbInst.mirrorId
                 and candidate.instanceId != dbInst.instanceId]
        return sorted(peers, key=lambda peer: peer.instanceId)

    def getClusterBackIps(self):
        """
        function : Get the back IPs of all nodes as one flat list.
        input : NA
        output : flat list of IPs passed through compress_ips, ordered by
                 IP index first and node second (the first back IP of every
                 node, then the second back IP of every node, ...);
                 an empty list when the cluster has no nodes
        raises : Exception when the nodes do not all have the same number
                 of back IPs
        """
        # Without nodes there is nothing to collect (and max()/min() of an
        # empty sequence would raise ValueError).
        if not self.dbNodes:
            return []

        # Every node must configure the same number of back IPs.
        ip_counts = {len(dbNode.backIps) for dbNode in self.dbNodes}
        if len(ip_counts) != 1:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51227"] % "backIps")

        ip_count = len(self.dbNodes[0].backIps)
        backIps = [dbNode.backIps[index]
                   for index in range(ip_count)
                   for dbNode in self.dbNodes]
        return self.compress_ips(backIps)

    def getClusterSshIps(self):
        """
        function : Get the ssh IPs of all nodes, grouped by IP index.
        input : NA
        output : list of lists; element i holds the i-th ssh IP of every
                 node (in node order) passed through compress_ips;
                 an empty list when the cluster has no nodes
        raises : Exception when the nodes do not all have the same number
                 of ssh IPs
        """
        # Without nodes there is nothing to collect (and max()/min() of an
        # empty sequence would raise ValueError).
        if not self.dbNodes:
            return []

        # Every node must configure the same number of ssh IPs.
        ip_counts = {len(dbNode.sshIps) for dbNode in self.dbNodes}
        if len(ip_counts) != 1:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51227"] % "sshIps")

        ip_count = len(self.dbNodes[0].sshIps)
        return [self.compress_ips([dbNode.sshIps[index] for dbNode in self.dbNodes])
                for index in range(ip_count)]

    def getazNames(self):
        """
        function : Get the AZ names of the cluster ordered by priority.
        input : NA
        output : list of distinct AZ names, sorted ascending by the highest
                 azPriority found among the nodes of each AZ; AZs with
                 equal values keep the order of their first appearance
        """
        priorities_by_az = {}
        for dbNode in self.dbNodes:
            priorities_by_az.setdefault(dbNode.azName, []).append(dbNode.azPriority)

        highest_priority = {az_name: max(priorities)
                            for az_name, priorities in priorities_by_az.items()}
        return sorted(highest_priority, key=lambda az_name: highest_priority[az_name])

    def getNodeNameByBackIp(self, backIp):
        """
        function : Get the name of the node that owns a back IP.
        input : backIp: address to look for in the backIps of every node
        output : name of the first node whose backIps contain the address,
                 an empty string when no node does
        """
        return next((node.name for node in self.dbNodes if backIp in node.backIps), "")

    def __checkInstancePortandIP(self):
        """
        function : Collect the IPs and ports configured for every node and
                   check them.
                   For each node this gathers the node's back/ssh IPs and
                   the listen/HA IPs and ports of its cm_server, gtm,
                   coordinator, datanode and etcd instances, verifies the
                   consistency rules below and finally hands the collected
                   values to __checkPortandIP.
        input : NA
        output : dict mapping each node name to [list of IPs, list of ports]
        raises : Exception when the node's first back IP differs from the
                 matching entry of self.cluster_back_ip1s, when a second
                 etcd listen IP differs from the first back IP, when the
                 cm_server and cm_agent listen IPs differ, or (network
                 type 1) when the cm/etcd IPs are not all identical
        note : self.node_num is used as the index into
               self.cluster_back_ip1s and is advanced once per node; it is
               not reset in this method.
        """
        nodeipport = {}
        for dbNode in self.dbNodes:
            nodeips = []
            nodeports = []
            cmsListenIPs = []
            # Named IPs that must all be equal when g_networkType is 1.
            ipCheckMap = {}
            backIP1 = dbNode.backIps[0]
            nodeips.extend(dbNode.backIps)
            nodeips.extend(dbNode.sshIps)
            # Check whether the ip addresses of the cluster block and device are consistent
            if backIP1 != self.cluster_back_ip1s[self.node_num]:
                raise Exception(ErrorCode.GAUSS_506["GAUSS_50625"] +
                                "These ip addresses are %s and %s" % (self.cluster_back_ip1s[self.node_num], backIP1)
                                + ". Please check it.")
            # Move on to the cluster back IP entry of the next node.
            self.node_num += 1
            # get node ip and node port from cmserver
            for cmsInst in dbNode.cmservers:
                nodeips.extend(cmsInst.listenIps)
                nodeips.extend(cmsInst.haIps)
                # When a node has several cm_server instances, the values of
                # the last one are kept for the checks below.
                cmsListenIPs = cmsInst.listenIps
                ipCheckMap["cmServerListenIp1"] = cmsInst.listenIps[0]
                ipCheckMap["cmServerHaIp1"] = cmsInst.haIps[0]
                nodeports.append(cmsInst.port)
                nodeports.append(cmsInst.haPort)
            # get node ip and node port from gtm
            for gtmInst in dbNode.gtms:
                nodeips.extend(gtmInst.listenIps)
                nodeips.extend(gtmInst.haIps)
                nodeports.append(gtmInst.port)
                nodeports.append(gtmInst.haPort)
            # get node ip and node port from cn
            for cooInst in dbNode.coordinators:
                nodeips.extend(cooInst.listenIps)
                nodeips.extend(cooInst.haIps)
                nodeports.append(cooInst.port)
                nodeports.append(cooInst.haPort)
            # get node ip and node port from dn
            for dnInst in dbNode.datanodes:
                nodeips.extend(dnInst.listenIps)
                nodeips.extend(dnInst.haIps)
                nodeports.append(dnInst.port)
                nodeports.append(dnInst.haPort)
                # With checkSctpPort set, one more port derived from the
                # number of datanodes of this instance type is validated too.
                if (self.checkSctpPort):
                    nodeports.append(dnInst.port +
                                     dbNode.getDnNum(dnInst.instanceType) * 2)
            # get node ip and node port from etcd
            for etcdInst in dbNode.etcds:
                nodeips.extend(etcdInst.listenIps)
                nodeips.extend(etcdInst.haIps)
                nodeports.append(etcdInst.port)
                nodeports.append(etcdInst.haPort)
                ipCheckMap["etcdListenIp1"] = etcdInst.listenIps[0]
                ipCheckMap["etcdHaIp1"] = etcdInst.haIps[0]
                # A second etcd listen IP, when present, must be the node's
                # first back IP.
                if (len(etcdInst.listenIps) > 1):
                    etcdListenIp2 = etcdInst.listenIps[1]
                    if (etcdListenIp2 != backIP1):
                        raise Exception(ErrorCode.GAUSS_512["GAUSS_51220"] % (
                                "%s with etcdListenIp2" % etcdListenIp2) +
                                        " Error: \nThe IP address must be "
                                        "the same as the backIP1 %s." %
                                        backIP1)

            # CMS IP must be consistent with CMA IP
            # (every node is expected to have a cm_agent instance here;
            # an empty cmagents list raises IndexError)
            cmaListenIPs = dbNode.cmagents[0].listenIps
            if (cmsListenIPs and cmsListenIPs != cmaListenIPs):
                raise Exception(ErrorCode.GAUSS_512["GAUSS_51220"] % (
                        "%s with cm_server" % cmsListenIPs) +
                                " Error: \nThe IP address must be the same "
                                "as the cm_agent %s." % cmaListenIPs)
            if (g_networkType == 1):
                # With network type 1 the cm_agent, cm_server and etcd IPs
                # recorded in ipCheckMap must all be the same address.
                ipCheckMap["cmAgentConnectIp1"] = cmaListenIPs[0]
                if (len(set(ipCheckMap.values())) != 1):
                    errMsg = " Error: \nThe following IPs must be consistent:"
                    for ipConfigItem in list(ipCheckMap.keys()):
                        errMsg += "\n%s: %s" % (
                            ipConfigItem, ipCheckMap[ipConfigItem])
                    raise Exception(ErrorCode.GAUSS_512["GAUSS_51220"] % (
                        "with cm and etcd") + errMsg)
            # create a dictionary
            nodeipport[dbNode.name] = [nodeips, nodeports]
            # check port and ip
            self.__checkPortandIP(nodeips, nodeports, dbNode.name)
        return nodeipport

    def __checkPortandIP(self, ips, ports, name):
        """
        function : Check the ports and IPs collected for one node.
        input : ips: IPs of the node (duplicates are checked once)
                ports: ports of the node (duplicates are checked once)
                name: node name, used in the port error message
        output : NA
        raises : Exception for an invalid port, an invalid IP, or when the
                 IP versions recorded so far in self.ips_type are mixed or
                 contain an empty version
        note : The version of every checked IP is appended to
               self.ips_type, so the version check also covers values
               recorded by earlier calls.
        """
        distinct_ips = set(ips)
        distinct_ports = set(ports)

        for port in distinct_ports:
            if not self.__isPortValid(port):
                raise Exception(ErrorCode.GAUSS_512["GAUSS_51233"]
                                % (port, name) + " Please check it.")

        for ip in distinct_ips:
            if not self.__isIpValid(ip):
                raise Exception(ErrorCode.GAUSS_506["GAUSS_50603"] +
                                "The IP address is: %s." % ip + " Please check it.")
            self.ips_type.append(get_ip_version(ip))
            # All recorded addresses must share one, non-empty, IP version.
            recorded_versions = set(self.ips_type)
            if len(recorded_versions) > 1 or "" in recorded_versions:
                raise Exception(ErrorCode.GAUSS_506["GAUSS_50624"] +
                                "The types of these ip addresses are %s" % self.ips_type +
                                ". Please check it.")

    @staticmethod
    def __read_and_check_config_item(root_node, para, root_type, error_ignore=False):
        """
        function : Read one configuration item and validate its value as a
                   path.
        input : root_node: root element of the parsed XML configuration
                para: name of the item to read
                root_type: lookup level handed to readOneClusterConfigItem;
                           no node name is passed along, so a "node" level
                           lookup is rejected there
                error_ignore: when true, a failed lookup does not raise
        output : the stripped value (additionally normalized with
                 os.path.normpath for "installPath"); an empty string when
                 the item has no value
        raises : Exception when the lookup fails and error_ignore is false;
                 whatever checkPathVaild raises for a non-empty value
        """
        status, raw_value = dbClusterInfo.readOneClusterConfigItem(
            root_node, para, root_type)
        if not error_ignore and status != 0:
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] % para + " Error: \n%s" % raw_value)

        value = raw_value.strip()
        # Only the install path is normalized; other items are kept as written.
        if value and para == "installPath":
            value = os.path.normpath(value)

        if value:
            checkPathVaild(value)
        return value

    def __readClusterGlobalInfo(self):
        """
        Read the global cluster settings from the <CLUSTER> tag of the parsed
        XML configuration (nodeNames is read elsewhere).

        Sets the cluster type, name and install path, derives the
        application, tool, tmp, log and gr paths from the install path,
        reads wormPath, caPath, enable_dcf (plus dcf_config when DCF is
        on), the network type (stored in the global g_networkType) and the
        gr_nodes_list / rest_nodes_list items.

        :raises Exception: when a mandatory item is missing, when the log
            path is not absolute, or when enable_dcf, dcf_config or
            networkType hold an unsupported value
        :return: NA
        """
        global g_networkType
        self.clusterType = CLUSTER_TYPE_SINGLE_INST

        # Read cluster name
        self.name = self.__read_and_check_config_item(xmlRootNode, "clusterName", "cluster")

        # All standard directories live below the install path.
        self.installPath = self.__read_and_check_config_item(xmlRootNode, "installPath", "cluster")
        self.appPath = os.path.join(self.installPath, "APP")
        self.appSoftPath = self.appPath
        self.toolPath = os.path.join(self.installPath, "tool")
        self.tmpPath = os.path.join(self.installPath, "tmp")
        self.logPath = os.path.join(self.installPath, "log")
        self.grPath = os.path.join(self.installPath, "gr")
        # NOTE(review): error_ignore is passed as False, so a missing
        # wormPath item raises inside __read_and_check_config_item; the
        # default below is only reached when the item exists with an empty
        # value. Confirm whether error_ignore=True was intended.
        self.wormPath = self.__read_and_check_config_item(xmlRootNode, "wormPath", "cluster", False)
        if not self.wormPath:
            self.wormPath = os.path.join(self.installPath, "data")
        # NOTE(review): same remark as for wormPath applies to caPath.
        self.caPath = self.__read_and_check_config_item(xmlRootNode, "caPath", "cluster", False)
        if not self.caPath:
            self.caPath = os.path.join(self.wormPath, "CA")

        # self.logPath was built with os.path.join(..., "log") above, so it
        # is never empty at this point and this default is not applied.
        if not self.logPath:
            self.logPath = "/var/log/gaussdb"
        if not os.path.isabs(self.logPath):
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50213"] % \
                            ("%s log path(%s)" % (
                                VersionInfo.PRODUCT_NAME, self.logPath)))

        # Read enable_dcf; the lookup status is not checked, a missing item
        # yields an empty string which is accepted below.
        ret_status, self.enable_dcf = self.readOneClusterConfigItem(xmlRootNode,
                                                               "enable_dcf",
                                                               "cluster")
        if self.enable_dcf not in ['', 'on', 'off']:
            raise Exception(ErrorCode.GAUSS_500["GAUSS_50011"] %
                                ('enable_dcf', self.enable_dcf))

        # dcf_config is mandatory once DCF is switched on.
        if self.enable_dcf == 'on':
            (ret_status, ret_value) = self.readOneClusterConfigItem(
                xmlRootNode, "dcf_config", "CLUSTER")
            if ret_status == 0:
                self.dcf_config = ret_value.strip()
                # The number of 'role' occurrences minus the number of
                # 'PASSIVE' occurrences must be at least 3
                # (presumably: at least three non-passive members - confirm).
                if self.dcf_config.count('role') - self.dcf_config.count('PASSIVE') < 3:
                    raise Exception(ErrorCode.GAUSS_500["GAUSS_50011"] %
                                    ('dcf_config', self.dcf_config))
            else:
                raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] %
                                'dcf_config' + " Error: \n%s" % ret_value)

        # Read network type: must be 0 or 1 when present, defaults to 0
        # when the item is not configured (status 2).
        (retStatus, retValue) = self.readOneClusterConfigItem(
            xmlRootNode, "networkType", "cluster")
        if retStatus == 0:
            if retValue.isdigit() and int(retValue) in [0, 1]:
                g_networkType = int(retValue)
            else:
                raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] % \
                  "cluster network type" + " Error: \nThe parameter value must be 0 or 1.")
        elif retStatus == 2:
            g_networkType = 0
        else:
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] % \
                            "cluster network type" + " Error: \n%s" % retValue)

        # Read gr info; both items are optional, the lookup status is ignored.
        status, self.gr_nodes_list = self.readOneClusterConfigItem(
            xmlRootNode, "gr_nodes_list", "cluster")
        status, self.rest_nodes_list = self.readOneClusterConfigItem(
            xmlRootNode, "rest_nodes_list", "cluster")

    def get_cluster_back_ip1s(self):
        """
        function : Read the "backIp1s" item of the <CLUSTER> section and
                   store its comma separated addresses, passed through
                   compress_ips, in self.cluster_back_ip1s.
        input : NA
        output : NA (self.cluster_back_ip1s is left untouched when the
                 item has an empty value)
        raises : Exception when the item cannot be read; errors raised by
                 compress_ips propagate
        """
        status, raw_value = self.readOneClusterConfigItem(
            xmlRootNode, "backIp1s", "cluster")
        if status != 0:
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] % "backIp1s" + " Error: \n%s" % raw_value)

        trimmed_value = raw_value.strip()

        # The emptiness test looks at the value as read, before trimming.
        if not raw_value:
            return

        # The raw list is stored first, then replaced by the compressed one.
        self.cluster_back_ip1s = trimmed_value.split(",")
        self.cluster_back_ip1s = self.compress_ips(self.cluster_back_ip1s)

    def __getAllHostnamesFromDEVICELIST(self):
        """
        function : Read all host names from the first <DEVICELIST> section
                   of the parsed XML configuration.
        input : NA
        output : list with the value of every PARAM named "name" found in
                 the DEVICE entries, in document order
        raises : Exception when there is no <DEVICELIST> section
        """
        device_lists = xmlRootNode.findall('DEVICELIST')
        if not device_lists:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51200"] % 'DEVICELIST')

        return [param.attrib['value']
                for device in device_lists[0].findall('DEVICE')
                for param in device.findall('PARAM')
                if param.attrib['name'] == 'name']

    def __readClusterNodeInfo(self):
        """
        function : Read the node information of the cluster: validate the
                   cluster back IPs and the node names, then build
                   self.dbNodes with the basic, cm_server, datanode and
                   cm_agent configuration of every node.
        input : NA
        output : NA (self.dbNodes is rebuilt)
        raises : Exception when nodeNames cannot be read, when a cluster
                 back IP is invalid or the IP versions are mixed, when node
                 names are missing or repeated, or when they do not match
                 the DEVICE list
        """
        (status, raw_names) = self.readOneClusterConfigItem(xmlRootNode,
                                                            "nodeNames",
                                                            "cluster")
        if status != 0:
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"]
                            % "node names" + " Error: \n%s" % raw_names)
        raw_name_list = raw_names.split(",")

        # Every cluster back IP must be valid and of the same IP version.
        seen_ip_versions = set()
        for back_ip in self.cluster_back_ip1s:
            if not self.__isIpValid(back_ip):
                raise Exception(ErrorCode.GAUSS_506["GAUSS_50603"] +
                                "The IP address is: %s." % back_ip + " Please check it.")
            seen_ip_versions.add(get_ip_version(back_ip))
            if len(seen_ip_versions) > 1 or "" in seen_ip_versions:
                raise Exception(ErrorCode.GAUSS_506["GAUSS_50624"] +
                                "The types of these ip addresses are %s" % seen_ip_versions +
                                ". Please check it.")

        node_names = [raw_name.strip() for raw_name in raw_name_list]
        if len(node_names) == 0:
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] %
                            "cluster configuration" +
                            " There is no node in cluster configuration file.")

        if len(set(node_names)) != len(node_names):
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] %
                            "cluster configuration" +
                            " There contains repeated node in cluster configuration file.")

        # The node names must match the DEVICE entries one to one.
        device_names = self.__getAllHostnamesFromDEVICELIST()
        if len(device_names) != len(node_names):
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51236"] +
                            " The number of nodeNames and DEVICE are not same.")
        for node_name in node_names:
            if node_name not in device_names:
                raise Exception(ErrorCode.GAUSS_512["GAUSS_51236"] +
                                " Can not found DEVICE for [%s]." % node_name)

        # Basic info of every node (name, ip, master instance number, ...);
        # node ids are assigned from 1 in nodeNames order.
        self.dbNodes = []
        for node_id, node_name in enumerate(node_names, start=1):
            new_node = dbNodeInfo(node_id, node_name)
            self.__readNodeBasicInfo(new_node, node_names)
            self.dbNodes.append(new_node)

        # Each kind of instance is read for all nodes before the next kind.
        for db_node in self.dbNodes:
            self.__readCmsConfig(db_node)

        for db_node in self.dbNodes:
            self.__readDataNodeConfig(db_node)

        for db_node in self.dbNodes:
            self.__readCmaConfig(db_node)

        # Datanodes take over the AZ settings of their node before the
        # ports are assigned.
        for db_node in self.dbNodes:
            for datanode in db_node.datanodes:
                datanode.azName = db_node.azName
                datanode.azPriority = db_node.azPriority
        self.__setNodePortForSinglePrimaryMultiStandby()

    def compress_ips(self, ips):
        """
        function : Convert IP addresses to their compressed notation.
        input : ips: iterable of IP address strings; surrounding
                     whitespace is ignored
        output : list with the compressed form of every address, in input
                 order
        raises : Exception for the first address rejected by the IP
                 validity check
        """
        result = []
        for raw_ip in ips:
            candidate = raw_ip.strip()
            if not self.__isIpValid(candidate):
                raise Exception(ErrorCode.GAUSS_506["GAUSS_50603"] +
                                "The IP address is: %s." % candidate + " Please check it.")
            # ipaddress yields the canonical short form of the address.
            result.append(ipaddress.ip_address(candidate).compressed)
        return result

    def __getPeerInstance(self, dbInst):
        """
        function : Get the peer instances of the given instance among the
                   instances of the same role.
        input : dbInst: cm_server, gtm, coordinator or datanode instance
        output : list of all instances of dbInst's role in the cluster
                 that share its mirrorId but have a different instanceId,
                 in node order; an empty list for any other role
        """
        # Node attribute that holds the instances of each supported role.
        instance_lists_by_role = (
            (INSTANCE_ROLE_CMSERVER, "cmservers"),
            (INSTANCE_ROLE_GTM, "gtms"),
            (INSTANCE_ROLE_COODINATOR, "coordinators"),
            (INSTANCE_ROLE_DATANODE, "datanodes"),
        )
        for role, attribute in instance_lists_by_role:
            if dbInst.instanceRole == role:
                return [candidate
                        for node in self.dbNodes
                        for candidate in getattr(node, attribute)
                        if candidate.mirrorId == dbInst.mirrorId
                        and candidate.instanceId != dbInst.instanceId]
        return []

    def __setNodePortForSinglePrimaryMultiStandby(self):
        """
        function : Assign the ports of the master datanode, cm_server and
                   gtm instances of every node and propagate them to their
                   peer instances.
                   The n-th master of a role on a node gets the node's
                   master base port of that role plus n * PORT_STEP_SIZE;
                   the HA port is always the port plus one.
        input : NA
        output : NA
        """
        for dbNode in self.dbNodes:
            # Datanodes: peers take over the port of their master.
            master_index = 0
            for dbInst in dbNode.datanodes:
                if dbInst.instanceType != MASTER_INSTANCE:
                    continue
                dbInst.port = (dbNode.masterBasePorts[INSTANCE_ROLE_DATANODE] +
                               master_index * PORT_STEP_SIZE)
                dbInst.haPort = dbInst.port + 1
                for peer in self.__getPeerInstance(dbInst):
                    peer.port = dbInst.port
                    peer.haPort = peer.port + 1
                master_index += 1

            # CM servers: peers get the base port itself.
            master_index = 0
            for dbInst in dbNode.cmservers:
                if dbInst.instanceType != MASTER_INSTANCE:
                    continue
                cms_base_port = dbNode.masterBasePorts[INSTANCE_ROLE_CMSERVER]
                dbInst.port = cms_base_port + master_index * PORT_STEP_SIZE
                dbInst.haPort = dbInst.port + 1
                for peer in self.__getPeerInstance(dbInst):
                    peer.port = cms_base_port
                    peer.haPort = peer.port + 1
                master_index += 1

            # GTMs: peers get the base port itself.
            master_index = 0
            for dbInst in dbNode.gtms:
                if dbInst.instanceType != MASTER_INSTANCE:
                    continue
                gtm_base_port = dbNode.masterBasePorts[INSTANCE_ROLE_GTM]
                dbInst.port = gtm_base_port + master_index * PORT_STEP_SIZE
                dbInst.haPort = dbInst.port + 1
                for peer in self.__getPeerInstance(dbInst):
                    peer.port = gtm_base_port
                    peer.haPort = peer.port + 1
                master_index += 1

    def set_cm_info_for_node(self, node, node_names):
        """
        Set CM information for node

        Rejects a cmDir value that contains one of the given host names
        enclosed by commas (blanks ignored), then, when the node hosts
        cm_server instances, loads the cm_server base port.
        """
        for host_name in node_names:
            compact_cm_dir = node.cmDataDir.replace(" ", "")
            host_marker = "," + host_name.replace(" ", "") + ","
            if host_marker in compact_cm_dir:
                raise Exception(ErrorCode.GAUSS_512["GAUSS_51235"] %
                                node.cmDataDir +
                                " The cmDir only need one path while you configure "
                                "it with primary and standby cmDir, "
                                "please modify it and try again. "
                                "You can examine the install guide "
                                "for more information to configure xml file.")

        # Get base port (only meaningful when cm_server is configured here)
        if not node.cmsNum > 0:
            return
        cms_base_port = self.__readNodeIntValue(node.name, "cmServerPortBase",
                                                True, MASTER_BASEPORT_CMS)
        node.masterBasePorts[INSTANCE_ROLE_CMSERVER] = cms_base_port
        if self.isSingleInstCluster():
            # standby cm_server shares the primary's base port
            node.standbyBasePorts[INSTANCE_ROLE_CMSERVER] = \
                node.masterBasePorts[INSTANCE_ROLE_CMSERVER]

    def get_local_node_info(self):
        """
        Get local node info

        Returns the first entry of self.dbNodes whose id equals
        self.localNodeId, or None when no node matches.
        """
        matching_nodes = (candidate for candidate in self.dbNodes
                          if candidate.id == self.localNodeId)
        return next(matching_nodes, None)

    def __readNodeBasicInfo(self, dbNode, nodenames):
        """
        function : Read basic info of specified node.
        input : dbNode - node object whose attributes are filled in
                nodenames - host names used to validate the cmDir value
        output : NA
        """
        hostName = dbNode.name

        # get backIp (mandatory)
        dbNode.backIps = self.compress_ips(
            self.__readNodeIps(hostName, "backIp"))
        if len(dbNode.backIps) == 0:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51207"] % dbNode.name)

        # get sshIp, falling back to a copy of the back IPs
        dbNode.sshIps = self.compress_ips(
            self.__readNodeIps(hostName, "sshIp"))
        if len(dbNode.sshIps) == 0:
            dbNode.sshIps = dbNode.backIps[:]

        # get virtualIp
        dbNode.virtualIp = self.__readVirtualIp(hostName, "virtualIp")

        # get ssh_port, then the cm_server / gtm / etcd / cn / DB numbers;
        # each key is read with its own default value
        optionalInts = (("ssh_port", "sshPort", 22),
                        ("cmsNum", "cmsNum", 0),
                        ("gtmNum", "gtmNum", 0),
                        ("etcdNum", "etcdNum", 0),
                        ("cooNum", "cooNum", 0),
                        ("dataNum", "dataNum", 0))
        for attrName, xmlKey, fallback in optionalInts:
            setattr(dbNode, attrName,
                    self.__readNodeIntValue(hostName, xmlKey, True, fallback))

        # read cm directory for server and agent; record per node whether
        # cmDir could be read so the cluster-wide state can be checked
        try:
            dbNode.cmDataDir = self.__readNodeStrValue(hostName, "cmDir")
            self.cm_state_list.append(True)
        except Exception as _:
            self.cm_state_list.append(False)
            if not self.check_conf_cm_state():
                raise Exception(ErrorCode.GAUSS_512["GAUSS_51230"] %
                                ("CM", "has same configure."))
        if self.check_conf_cm_state():
            self.set_cm_info_for_node(dbNode, nodenames)

        # check dataNum
        if dbNode.dataNum < 0:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51208"]
                            % ("dn", dbNode.dataNum))

        # Get base port; the standby base port mirrors the primary one
        if dbNode.dataNum > 0:
            dataBasePort = self.__readNodeIntValue(hostName, "dataPortBase",
                                                   True, MASTER_BASEPORT_DATA)
            dbNode.masterBasePorts[INSTANCE_ROLE_DATANODE] = dataBasePort
            dbNode.standbyBasePorts[INSTANCE_ROLE_DATANODE] = \
                dbNode.masterBasePorts[INSTANCE_ROLE_DATANODE]

        # Get az name, az priority and cascadeRole before validating them
        dbNode.azName = self.__readNodeStrValue(hostName, "azName")
        dbNode.azPriority = self.__readNodeIntValue(hostName, "azPriority",
                                                    True, 0)
        dbNode.cascadeRole = self.__readNodeStrValue(hostName, "cascadeRole",
                                                     True, "off")
        if not (AZPRIORITY_MIN <= dbNode.azPriority <= AZPRIORITY_MAX):
            raise Exception(ErrorCode.GAUSS_532["GAUSS_53206"] % "azPriority")
        if not dbNode.azName:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51212"] % ("azName"))
        if dbNode.azPriority < 1:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51208"]
                            % ("azPriority", dbNode.azPriority))

        dbNode.grIp1 = self.__readNodeStrValue(hostName, "grIp1")
        dbNode.listen_addr = self.__readNodeStrValue(hostName, "listen_addr")
        # Read listen_port; when it is absent, derive it from listen_addr
        # (compatibility with the legacy "ip:port" format).
        dbNode.listen_port = self.__readNodeStrValue(hostName, "listen_port",
                                                     True, "")
        if dbNode.listen_port:
            return
        legacyAddr = dbNode.listen_addr
        if legacyAddr and ":" in legacyAddr:
            # Legacy format: everything after the first colon is the port,
            # and listen_addr is reduced to the part before it.
            ipPart, _, portPart = legacyAddr.partition(":")
            dbNode.listen_port = portPart
            dbNode.listen_addr = ipPart

    def __getCmsCountFromWhichConfiguredNode(self, masterNode):
        """
        function : get the count of cmservers if current node configured
        cmserver
        input : masterNode - node whose "cmServerRelation" value is read
        output : cmsCount - number of host names listed in cmServerRelation
        raises : Exception (GAUSS_50204) when cmServerRelation is blank,
                 when its number of host names differs from the number of
                 hosts in DEVICELIST, or when it names a host that is not
                 in DEVICELIST
        """
        relation = self.__readNodeStrValue(masterNode.name,
                                           "cmServerRelation", True, "")
        # str.split(",") yields [''] for a blank string, so the list length
        # alone can never be 0. Treat a blank value as "nothing configured"
        # so that the dedicated error below is actually reachable.
        cms_list = relation.split(",") if relation.strip() else []
        cms_count = len(cms_list)
        # Query DEVICELIST once; the result serves both checks below.
        device_hostnames = self.__getAllHostnamesFromDEVICELIST()
        device_count = len(device_hostnames)
        if cms_count == 0:
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"]
                            % ("CMServer configuration on host [%s]"
                               % str(masterNode.name))
                            + " The information of %s is wrong."
                            % "cmServerRelation")

        if cms_count != device_count:
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"]
                            % "CMServer configuration, "
                               "The num of cmServerRelation's hostname is wrong, "
                               "Please check it.")

        # every listed host must be a known device (surrounding blanks are
        # ignored)
        for name_node in cms_list:
            name_node = name_node.strip()
            if name_node not in device_hostnames:
                raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"]
                                % ("The information of %s:%s is wrong.")
                                % ("cmServerRelation", name_node))
        return cms_count

    def __readCmsConfig(self, masterNode):
        """
        function : Read cm server config on node.
        input : masterNode - node whose cm_server settings are read
        output : NA
        """
        # Thin wrapper: all work is delegated to the multi-AZ reader.
        self.__readCmsConfigForMutilAZ(masterNode)

    def __readCmsConfigForMutilAZ(self, masterNode):
        """
        function : Create the cm_server instances configured on masterNode:
                   a primary on masterNode itself and a standby on each
                   further host listed in cmServerRelation.
        input : masterNode
        output : NA
        """
        listenIpTable = None
        haIpTable = None
        if masterNode.cmsNum > 0:
            self.cmscount = self.__getCmsCountFromWhichConfiguredNode(
                masterNode)
            listenIpTable = self.__readInstanceIps(masterNode.name,
                                                   "cmServerListenIp",
                                                   self.cmscount)
            haIpTable = self.__readInstanceIps(masterNode.name,
                                               "cmServerHaIp",
                                               self.cmscount)

        for groupNo in range(masterNode.cmsNum):
            # the level key is looked up under two spellings
            level = self.__readNodeIntValue(masterNode.name, "cmServerlevel")
            if level == "":
                level = self.__readNodeIntValue(masterNode.name,
                                                "cmServerLevel")
            relation = self.__readNodeStrValue(masterNode.name,
                                               "cmServerRelation")
            hostNames = [host.strip() for host in relation.split(",")]

            primaryId = self.__assignNewInstanceId(INSTANCE_ROLE_CMSERVER)
            mirrorId = self.__assignNewMirrorId()
            # each group owns self.cmscount consecutive IP entries; the
            # first one belongs to the primary
            firstIndex = groupNo * self.cmscount
            masterNode.appendInstance(primaryId, mirrorId,
                                      INSTANCE_ROLE_CMSERVER,
                                      MASTER_INSTANCE,
                                      listenIpTable[firstIndex],
                                      haIpTable[firstIndex], "", "", level)

            # hostNames[0] is skipped: the primary was created above
            for memberNo in range(1, self.cmscount):
                standbyName = hostNames[memberNo]
                standbyNode = self.getDbNodeByName(standbyName)
                if standbyNode is None:
                    raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"]
                                    % ("CMServer configuration on host [%s]"
                                       % masterNode.name)
                                    + " There is no host named %s."
                                    % standbyName)
                standbyId = self.__assignNewInstanceId(INSTANCE_ROLE_CMSERVER)
                ipIndex = firstIndex + memberNo
                standbyNode.appendInstance(standbyId, mirrorId,
                                           INSTANCE_ROLE_CMSERVER,
                                           STANDBY_INSTANCE,
                                           listenIpTable[ipIndex],
                                           haIpTable[ipIndex], "", "", level)

    def __getDataNodeCount(self, masterNode):
        """
        function : get the count of data nodes
        input : masterNode - node whose "dataNode1" value is inspected
        output : dataNodeCount - (number of comma separated fields + 1) // 2
        """
        dn_config = self.__readNodeStrValue(masterNode.name, "dataNode1",
                                            True, "")
        # N separators mean N + 1 fields, hence (fields + 1) // 2 equals
        # (separators + 2) // 2.
        separator_total = dn_config.count(",")
        return (separator_total + 2) // 2

    def __readDataNodeConfig(self, masterNode):
        """
        function : Read datanode config on node.
        input : masterNode - node whose datanode settings are read
        output : NA
        """
        # Thin wrapper: all work is delegated to the multi-AZ reader.
        self.__readDataNodeConfigForMutilAZ(masterNode)

    def __readDataNodeConfigForMutilAZ(self, masterNode):
        """
        function : Read the datanode configuration of masterNode and create
                   the datanode instances it describes: one primary on
                   masterNode per "dataNode<N>" entry plus one standby on
                   each peer host named in that entry.
        input : masterNode - node whose dataNode* settings are read
        output : NA
        raises : Exception when a dataNode<N> / dataNodeXlogPath<N> /
                 dataNode<N>_syncNum value is inconsistent, when the number
                 of listen/HA IPs does not match the instance number, when
                 a peer host is unknown, when DCF is enabled with fewer
                 than three cm_server instances, or when cascadeRole is
                 "on" in DCF mode.
        """
        dnListenIps = None
        dnHaIps = None
        dn_float_ips = None
        # Number of instances per datanode group, derived from "dataNode1".
        mirror_count_data = self.__getDataNodeCount(masterNode)
        if masterNode.dataNum > 0:
            # One IP entry is requested per instance of every group.
            dnListenIps = self.__readInstanceIps(masterNode.name,
                                                 "dataListenIp",
                                                 masterNode.dataNum *
                                                 mirror_count_data)
            dnHaIps = self.__readInstanceIps(masterNode.name, "dataHaIp",
                                             masterNode.dataNum *
                                             mirror_count_data)
            dn_float_ips = self.__readInstanceIps(masterNode.name,
                                                  "floatIpMap",
                                                  masterNode.dataNum *
                                                  mirror_count_data)
        if dn_float_ips is not None:
            self.__read_cluster_float_ips(dn_float_ips)
        # Per-group containers, indexed by the datanode group number.
        dnInfoLists = [[] for row in range(masterNode.dataNum)]
        xlogInfoLists = [[] for row in range(masterNode.dataNum)]
        dcf_data_lists = [[] for row in range(masterNode.dataNum)]
        ssdInfoList = [[] for row in range(masterNode.dataNum)]
        syncNumList = [-1 for row in range(masterNode.dataNum)]
        syncNumFirstList = [[] for row in range(masterNode.dataNum)]
        totalDnInstanceNum = 0
        # Whether the primary and standby have SET XLOG PATH , must be
        # synchronized
        has_xlog_path = 0
        # Pass 1: read and validate the settings of every datanode group.
        for i in range(masterNode.dataNum):
            dnInfoList = []
            key = "dataNode%d" % (i + 1)
            dnInfoList_tmp = self.__readNodeStrValue(masterNode.name,
                                                     key).split(",")
            for dnInfo in dnInfoList_tmp:
                dnInfoList.append(dnInfo.strip())
            dnInfoListLen = len(dnInfoList)
            # Every group must have the same number of fields as the one
            # the instance count was derived from (2 * count - 1).
            if dnInfoListLen != 2 * mirror_count_data - 1:
                raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] % \
                                ("database node configuration on host [%s]"
                                 % masterNode.name)
                                + " The information of [%s] is wrong." % key)
            totalDnInstanceNum += (dnInfoListLen + 1) // 2
            dnInfoLists[i].extend(dnInfoList)

            # If not set dataNodeXlogPath in xmlfile,just set
            # xlogInfoListLen = 0,Used for judgement.
            # If set dataNodeXlogPath in xmlfile,each datanode needs to have
            # a corresponding xlogdir.
            xlogInfoList = []
            xlogkey = "dataNodeXlogPath%d" % (i + 1)
            xlogInfoList_tmp = self.__readNodeStrValue(masterNode.name,
                                                       xlogkey).split(",")
            for xlogInfo in xlogInfoList_tmp:
                xlogInfoList.append(xlogInfo.strip())

            # This judgment is necessary,if not set dataNodeXlogPath,
            # xlogInfoListLen will equal 1.
            # Because dninfolist must be set, it does not need extra judgment.
            if xlogInfoList_tmp == ['']:
                xlogInfoListLen = 0
            else:
                xlogInfoListLen = len(xlogInfoList)

            # The first group decides how many xlog paths every other group
            # must provide.
            if i == 0:
                has_xlog_path = xlogInfoListLen

            if xlogInfoListLen != has_xlog_path:
                raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] % \
                                ("database node configuration on host [%s]"
                                 % masterNode.name)
                                + " The information of [%s] is wrong."
                                % xlogkey)

            # When xlog paths are given there must be one per instance.
            if (xlogInfoListLen != 0 and xlogInfoListLen != (dnInfoListLen + 1) // 2):
                raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] % \
                                ("database node configuration on host [%s]"
                                 % masterNode.name)
                                + " The information of [%s] is wrong."
                                % xlogkey)
            xlogInfoLists[i].extend(xlogInfoList)
            dcf_data_list = []
            if self.enable_dcf == "on":
                if self.cmscount < 3:
                    raise Exception(ErrorCode.GAUSS_512["GAUSS_51236"] +
                                    "At least three cm_server instances are required.")
                # The even-indexed fields of the (unstripped) dataNode<N>
                # value each get a "/dcf_data" suffix.
                for dcf_info in range(0, mirror_count_data * 2, 2):
                    dcf_data_list.append(dnInfoList_tmp[dcf_info] + '/dcf_data')
                dcf_data_lists[i].extend(dcf_data_list)
            else:
                # The comprehension variable is local to the comprehension
                # in Python 3, so the loop index i is not affected here.
                dcf_data_list = ['' for i in range(mirror_count_data)]

            # NOTE(review): this key is never read; it is overwritten by
            # the syncNum key below before any use.
            key = "ssdDNDir%d" % (i + 1)
            # ssd doesn't supply ,so set ssddir value to empty
            ssddirList = []
            # NOTE(review): this rebinds the loop variable i, so for the
            # rest of this iteration ssdInfoList, the syncNum key and
            # syncNumList all use index 0 when enable_dcf is "" - confirm
            # that this is intended.
            if self.enable_dcf == "":
                i = 0
            # ssddirList is always empty, so ssdInfoList[i] stays empty.
            ssdInfoList[i].extend(ssddirList)

            # dataNode syncNum
            key = "dataNode%d_syncNum" % (i + 1)
            syncNum_temp = self.__readNodeStrValue(masterNode.name, key)
            if syncNum_temp is not None and syncNum_temp != "":
                syncNum = int(syncNum_temp)
                # The bound is the instance total accumulated so far, not
                # the final total over all groups.
                if syncNum < 0 or syncNum >= totalDnInstanceNum:
                    raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] % \
                                    ("database node configuration on host [%s]"
                                    % masterNode.name)
                                    + " The information of [%s] is wrong."
                                    % key)
                syncNumList[i] = syncNum

        # check ip num
        # For every column, count the leading rows holding a non-empty IP;
        # that count must equal the total number of datanode instances.
        if dnListenIps is not None and len(dnListenIps[0]) != 0:
            colNum = len(dnListenIps[0])
            rowNum = len(dnListenIps)
            for col in range(colNum):
                ipNum = 0
                for row in range(rowNum):
                    if dnListenIps[row][col] != "":
                        ipNum += 1
                    else:
                        break
                if ipNum != totalDnInstanceNum:
                    raise Exception(ErrorCode.GAUSS_516["GAUSS_51637"] % \
                                    ("IP number of dataListenIp",
                                     "instance number"))

        # Same check for the HA IPs.
        if dnHaIps is not None and len(dnHaIps[0]) != 0:
            colNum = len(dnHaIps[0])
            rowNum = len(dnHaIps)
            for col in range(colNum):
                ipNum = 0
                for row in range(rowNum):
                    if dnHaIps[row][col] != "":
                        ipNum += 1
                    else:
                        break
                if ipNum != totalDnInstanceNum:
                    raise Exception(ErrorCode.GAUSS_516["GAUSS_51637"] % \
                                    ("IP number of dataHaIps",
                                     "instance number"))

        # Pass 2: create the instances. instIndex walks the IP lists in
        # creation order (primary first, then its standbys, group by group).
        # NOTE(review): xlogInfoListLen and dcf_data_list still hold the
        # values left by the last iteration of pass 1.
        instIndex = 0
        for i in range(masterNode.dataNum):
            dnInfoList = dnInfoLists[i]
            key = "syncNode_%s" % (masterNode.name)
            if self.__readNodeStrValue(masterNode.name, key) is not None:
                syncNumFirst_temp = self.__readNodeStrValue(masterNode.name, key)
                if syncNumFirst_temp is not None:
                    syncNumFirst = syncNumFirst_temp
                syncNumFirstList[i] = syncNumFirst

            # Because xlog may not be set to prevent the array from crossing
            # the boundary
            if xlogInfoListLen != 0:
                xlogInfoList = xlogInfoLists[i]
            # All instances of this group share one group id.
            groupId = self.__assignNewGroupId()
            if len(ssdInfoList[i]) > 1:
                ssddirList = ssdInfoList[i]
            # master datanode
            instId = self.__assignNewInstanceId(INSTANCE_ROLE_DATANODE)
            # ssd doesn't supply ,this branch will not arrive when len(
            # ssdInfoList[i])  is 0
            if len(ssdInfoList[i]) > 1:
                if xlogInfoListLen == 0:
                    if self.enable_dcf == "on":
                        masterNode.appendInstance(instId, groupId,
                                              INSTANCE_ROLE_DATANODE,
                                              MASTER_INSTANCE,
                                              dnListenIps[instIndex],
                                              dnHaIps[instIndex],
                                              dnInfoList[0], ssddirList[0],
                                              syncNum=syncNumList[i],
                                              syncNumFirst=syncNumFirstList[i],
                                              dcf_data=dcf_data_list[0])
                    else:
                        masterNode.appendInstance(instId, groupId,
                                                  INSTANCE_ROLE_DATANODE,
                                                  MASTER_INSTANCE,
                                                  dnListenIps[instIndex],
                                                  dnHaIps[instIndex],
                                                  dnInfoList[0], ssddirList[0],
                                                  syncNum=syncNumList[i],
                                                  syncNumFirst=syncNumFirstList[i])
                else:
                    masterNode.appendInstance(instId, groupId,
                                              INSTANCE_ROLE_DATANODE,
                                              MASTER_INSTANCE,
                                              dnListenIps[instIndex],
                                              dnHaIps[instIndex],
                                              dnInfoList[0], ssddirList[0],
                                              xlogdir=xlogInfoList[0],
                                              syncNum=syncNumList[i],
                                              syncNumFirst=syncNumFirstList[i])
            else:
                # Path taken in practice: the primary uses the first field
                # of dataNode<N>, plus the xlog dir / dcf data / float IPs
                # when they are configured.
                if xlogInfoListLen == 0:
                    if self.enable_dcf == "on":
                        masterNode.appendInstance(instId, groupId,
                                              INSTANCE_ROLE_DATANODE,
                                              MASTER_INSTANCE,
                                              dnListenIps[instIndex],
                                              dnHaIps[instIndex],
                                              dnInfoList[0],
                                              syncNum=syncNumList[i],
                                              syncNumFirst=syncNumFirstList[i],
                                              dcf_data=dcf_data_list[0],
                                              float_ips=dn_float_ips[instIndex] \
                                              if dn_float_ips else [])
                    else:
                        masterNode.appendInstance(instId, groupId,
                                                  INSTANCE_ROLE_DATANODE,
                                                  MASTER_INSTANCE,
                                                  dnListenIps[instIndex],
                                                  dnHaIps[instIndex],
                                                  dnInfoList[0],
                                                  syncNum=syncNumList[i],
                                                  syncNumFirst=syncNumFirstList[i],
                                                  float_ips=dn_float_ips[instIndex] \
                                                  if dn_float_ips else [])
                else:
                    masterNode.appendInstance(instId, groupId,
                                              INSTANCE_ROLE_DATANODE,
                                              MASTER_INSTANCE,
                                              dnListenIps[instIndex],
                                              dnHaIps[instIndex],
                                              dnInfoList[0],
                                              xlogdir=xlogInfoList[0],
                                              syncNum=syncNumList[i],
                                              syncNumFirst=syncNumFirstList[i],
                                              float_ips=dn_float_ips[instIndex] \
                                              if dn_float_ips else [])

            instIndex += 1

            # Standbys: after the primary's field, dnInfoList alternates
            # host name (odd index) and that host's field (even index).
            for nodeLen in range((len(dnInfoList) + 1) // 2 - 1):
                dbNode = self.getDbNodeByName(dnInfoList[nodeLen * 2 + 1])
                if dbNode is None:
                    raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"]
                                    % ("database node configuration on "
                                       "host [%s]" % str(masterNode.name))
                                    + " There is no host named %s."
                                    % dnInfoList[nodeLen * 2 + 1])
                instId = self.__assignNewInstanceId(INSTANCE_ROLE_DATANODE)

                # Reset, then reload the syncNode value for the standby host.
                syncNumFirstList[i] = ""
                key = "syncNode_%s" % (dbNode.name)
                if self.__readNodeStrValue(dbNode.name, key) is not None:
                    syncNumFirst_temp = self.__readNodeStrValue(dbNode.name, key)
                    if syncNumFirst_temp is not None:
                        syncNumFirst = syncNumFirst_temp
                    syncNumFirstList[i] = syncNumFirst

                # ssd doesn't supply ,this branch will not arrive when len(
                # ssdInfoList[i])  is 0
                if len(ssdInfoList[i]) > 1:
                    if xlogInfoListLen == 0:
                        if self.enable_dcf == "on":
                            dbNode.appendInstance(instId, groupId,
                                              INSTANCE_ROLE_DATANODE,
                                              STANDBY_INSTANCE,
                                              dnListenIps[instIndex],
                                              dnHaIps[instIndex],
                                              dnInfoList[nodeLen * 2 + 2],
                                              ssddirList[nodeLen * 2 + 1],
                                              syncNum=syncNumList[i],
                                              syncNumFirst=syncNumFirstList[i],
                                              dcf_data=dcf_data_list[0])
                        else:
                            dbNode.appendInstance(instId, groupId,
                                                  INSTANCE_ROLE_DATANODE,
                                                  STANDBY_INSTANCE,
                                                  dnListenIps[instIndex],
                                                  dnHaIps[instIndex],
                                                  dnInfoList[nodeLen * 2 + 2],
                                                  ssddirList[nodeLen * 2 + 1],
                                                  syncNum=syncNumList[i],
                                                  syncNumFirst=syncNumFirstList[i])
                    else:
                        if self.enable_dcf == "on":
                            dbNode.appendInstance(instId, groupId,
                                              INSTANCE_ROLE_DATANODE,
                                              STANDBY_INSTANCE,
                                              dnListenIps[instIndex],
                                              dnHaIps[instIndex],
                                              dnInfoList[nodeLen * 2 + 2],
                                              ssddirList[nodeLen * 2 + 1],
                                              xlogdir=xlogInfoList[nodeLen + 1],
                                              syncNum=syncNumList[i],
                                              syncNumFirst=syncNumFirstList[i],
                                              dcf_data=dcf_data_list[0])
                        else:
                            dbNode.appendInstance(instId, groupId,
                                                  INSTANCE_ROLE_DATANODE,
                                                  STANDBY_INSTANCE,
                                                  dnListenIps[instIndex],
                                                  dnHaIps[instIndex],
                                                  dnInfoList[nodeLen * 2 + 2],
                                                  ssddirList[nodeLen * 2 + 1],
                                                  xlogdir=xlogInfoList[nodeLen + 1],
                                                  syncNum=syncNumList[i],
                                                  syncNumFirst=syncNumFirstList[i])
                else:
                    # Path taken in practice for standbys; the xlog dir of
                    # standby number nodeLen is xlogInfoList[nodeLen + 1].
                    if xlogInfoListLen == 0:
                        if self.enable_dcf == "on":
                            dbNode.appendInstance(instId, groupId,
                                              INSTANCE_ROLE_DATANODE,
                                              STANDBY_INSTANCE,
                                              dnListenIps[instIndex],
                                              dnHaIps[instIndex],
                                              dnInfoList[nodeLen * 2 + 2],
                                              syncNum=syncNumList[i],
                                              syncNumFirst=syncNumFirstList[i],
                                              dcf_data=dcf_data_list[0],
                                              float_ips=dn_float_ips[instIndex] \
                                              if dn_float_ips else [])
                        else:
                            dbNode.appendInstance(instId, groupId,
                                                  INSTANCE_ROLE_DATANODE,
                                                  STANDBY_INSTANCE,
                                                  dnListenIps[instIndex],
                                                  dnHaIps[instIndex],
                                                  dnInfoList[nodeLen * 2 + 2],
                                                  syncNum=syncNumList[i],
                                                  syncNumFirst=syncNumFirstList[i],
                                                  float_ips=dn_float_ips[instIndex] \
                                                  if dn_float_ips else [])
                    else:
                        if self.enable_dcf == "on":
                            dbNode.appendInstance(instId, groupId,
                                              INSTANCE_ROLE_DATANODE,
                                              STANDBY_INSTANCE,
                                              dnListenIps[instIndex],
                                              dnHaIps[instIndex],
                                              dnInfoList[nodeLen * 2 + 2],
                                              xlogdir=xlogInfoList[nodeLen + 1],
                                              syncNum=syncNumList[i],
                                              syncNumFirst=syncNumFirstList[i],
                                              dcf_data=dcf_data_list[0],
                                              float_ips=dn_float_ips[instIndex] \
                                              if dn_float_ips else [])
                        else:
                            dbNode.appendInstance(instId, groupId,
                                                  INSTANCE_ROLE_DATANODE,
                                                  STANDBY_INSTANCE,
                                                  dnListenIps[instIndex],
                                                  dnHaIps[instIndex],
                                                  dnInfoList[nodeLen * 2 + 2],
                                                  xlogdir=xlogInfoList[nodeLen + 1],
                                                  syncNum=syncNumList[i],
                                                  syncNumFirst=syncNumFirstList[i],
                                                  float_ips=dn_float_ips[instIndex] \
                                                  if dn_float_ips else [])
                # A node flagged cascadeRole "on" has all of its datanodes
                # marked as cascade standby; this is rejected in DCF mode.
                if dbNode.cascadeRole == "on":
                    if self.enable_dcf != "on":
                        for inst in dbNode.datanodes:
                            inst.instanceType = CASCADE_STANDBY
                    else:
                        raise Exception(ErrorCode.GAUSS_512["GAUSS_51244"] %
                                        "In DCF mode cascadeRole")
                instIndex += 1

        # Datanodes hosted on masterNode inherit its AZ name.
        for inst in masterNode.datanodes:
            inst.azName = masterNode.azName

    @staticmethod
    def append_map_ip_into_global(strem_ip_map):
        """
        function : Parse a stream ip map string into a list of ip pairs.
                   The string is cut at every "(" and each non-empty piece
                   must hold exactly two comma separated values
                   (presumably written as "(ip,dataIp),(ip,dataIp)" -
                   confirm against the callers).
        input : String
        output : list of dict, each with the keys "ip" and "dataIp"
        """
        pairs = []
        for piece in strem_ip_map.split("("):
            if not piece:
                continue
            # drop the closing bracket and the separator left by the split
            fields = piece.strip().strip("),").strip(",(").split(",")
            if len(fields) != 2:
                raise Exception(ErrorCode.GAUSS_512["GAUSS_51236"] +
                                " check localStreamIpmap is correct")
            entry = {}
            # each value is validated right after it is stored
            for field_name, raw_value in zip(("ip", "dataIp"), fields):
                entry[field_name] = raw_value.strip()
                SecurityChecker.check_ip_valid(entry[field_name],
                                               entry[field_name])
            pairs.append(entry)
        return pairs

    def __readCmaConfig(self, dbNode):
        """
        function : Read cm agent config on node and append the cm agent
                   instance to the node.
        input : dbNode
        output : NA
        """
        # one agent per node, so a single row of ips is requested
        connectIps = self.__readInstanceIps(dbNode.name,
                                            "cmAgentConnectIp", 1)
        agentId = self.__assignNewInstanceId(INSTANCE_ROLE_CMAGENT)
        agentListenIps = connectIps[0]
        dbNode.appendInstance(agentId,
                              MIRROR_ID_AGENT,
                              INSTANCE_ROLE_CMAGENT,
                              INSTANCE_TYPE_UNDEFINED,
                              agentListenIps,
                              None,
                              "")

    def __readGrConfig(self, dbNode):
        """
        function : Read oGRecorder config on node.
                   Currently a no-op: the body holds no executable
                   statement, so dbNode is left untouched.
        input : dbNode
        output : NA
        """
        # NOTE: the reads of "grIp1" and "listen_addr" below are disabled.
        # dbNode.grIp1 = self.compress_ips(self.__readNodeIps(dbNode.name, "grIp1"))
        # dbNode.listen_addr = self.compress_ips(self.__readNodeIps(dbNode.name, "listen_addr"))

    def __assignNewInstanceId(self, instRole):
        """
        function : Hand out the current id of the given instance role and
                   advance the counter kept for that role.
        input : int, instance role
        output : int, the id assigned to the new instance
        """
        assignedId = self.__newInstanceId[instRole]
        step = 1
        # When the datanode counter sits on
        # OLD_LAST_PRIMARYSTANDBY_BASEID_NUM the next value becomes
        # NEW_FIRST_PRIMARYSTANDBY_BASEID_NUM + 1 instead of the plain +1.
        if (instRole == INSTANCE_ROLE_DATANODE
                and assignedId == OLD_LAST_PRIMARYSTANDBY_BASEID_NUM):
            step += (NEW_FIRST_PRIMARYSTANDBY_BASEID_NUM
                     - OLD_LAST_PRIMARYSTANDBY_BASEID_NUM)
        self.__newInstanceId[instRole] = assignedId + step
        return assignedId

    def __assignNewMirrorId(self):
        """   
        function : Assign a new mirror id.
        input : NA
        output : NA
        """
        self.__newMirrorId += 1

        return self.__newMirrorId

    def __assignNewGroupId(self):
        """"""
        self.__newGroupId += 1
        return self.__newGroupId

    def __readNodeIps(self, nodeName, prefix):
        """
        function : Collect the numbered ip items of a node, such as
                   backIp1, sshIp1 etc.. Reading stops at the first item
                   that is empty.
        input : String,String
        output : list of String, one entry per item found
        """
        # "cooListenIp" and "etcdListenIp" may carry more numbered items
        # than the other prefixes
        extraSlots = {"cooListenIp": 3, "etcdListenIp": 2}.get(prefix, 1)

        foundIps = []
        for index in range(1, CONFIG_IP_NUM + extraSlots):
            itemName = "%s%d" % (prefix, index)
            item = self.__readNodeStrValue(nodeName, itemName, True, "")
            if item == "":
                break
            foundIps.append(item)

        return foundIps

    def __readVirtualIp(self, nodeName, prefix):
        """
        function : Read the comma separated virtual ip item of a node and
                   drop duplicates while keeping the first-seen order.
        input : String,String
        output : result of compress_ips on the de-duplicated list
        """
        rawValue = self.__readNodeStrValue(nodeName, prefix, True, "")
        uniqueIps = []
        if rawValue != "":
            # dict keys keep insertion order, which gives an ordered de-dup
            uniqueIps = list(dict.fromkeys(
                part.strip() for part in rawValue.split(",")))
        return self.compress_ips(uniqueIps)

    def __isIpValid(self, ip):
        """  
        function : check if the input ip address is valid
        input : String
        output : NA
        """
        try:
            ipaddress.ip_address(ip)
            return True
        except ValueError:
            return False

    def __isPortValid(self, port):
        """   
        function :Judge if the port is valid
        input : int
        output : boolean
        """
        if (port < 0 or port > 65535):
            return False
        elif (port >= 0 and port <= 1023):
            return False
        else:
            return True

    def __readInstanceIps(self, nodeName, prefix, InstCount):
        """
        function : Read the numbered ip items of a node and regroup them
                   per instance: result[j][i] is the ip of instance j taken
                   from the i-th item. Every item must hold exactly
                   InstCount comma separated ips.
        input : String,String,int
        output : list of InstCount lists (empty lists if no item exists)
        """
        rawItems = self.__readNodeIps(nodeName, prefix)
        if not rawItems:
            return [[] for _ in range(InstCount)]

        perInstance = [[""] * len(rawItems) for _ in range(InstCount)]
        for column, rawItem in enumerate(rawItems):
            ips = [part.strip() for part in rawItem.split(",")]
            # "floatIpMap" values skip compress_ips
            if prefix != "floatIpMap":
                ips = self.compress_ips(ips)
            if len(ips) != InstCount:
                raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"]
                                % ("[%s] of node [%s]" % (prefix, nodeName))
                                + " The count of IP is wrong.")
            for row, ip in enumerate(ips):
                perInstance[row][column] = ip

        return perInstance

    def __readNodeIntValue(self, nodeName, key, nullable=False, defValue=0):
        """
        function : Read an item of the specified node as an integer
        input : String,String,boolean,int
        output : int, or defValue when the item is empty or cannot be
                 converted
        """
        text = self.__readNodeStrValue(nodeName, key, nullable, "")
        if text is None or text == "":
            return defValue
        try:
            return int(text)
        except Exception:
            # anything that is not a number falls back to the default
            return defValue

    def __readNodeStrValue(self, nodeName, key, nullable=False, defValue=""):
        """
        function : Read a string item of the specified node through
                   readOneClusterConfigItem.
                   Status 0 returns the stripped value. Status 2
                   (presumably "item not found" - confirm against
                   readOneClusterConfigItem) returns defValue when the item
                   is nullable or is one of the optional keys. Everything
                   else raises.
        input : String,String,boolean,String
        output : String
        """
        (retStatus, retValue) = self.readOneClusterConfigItem(
            xmlRootNode, key, "node", nodeName)
        if retStatus == 0:
            return str(retValue).strip()

        if retStatus == 2:
            # these keys may be left out of the xml even when not nullable
            if (nullable
                    or any(tag in key for tag in
                           ("dataNodeXlogPath", "syncNum", "syncNode"))
                    or key == "cmServerlevel"):
                return defValue

        where = ErrorCode.GAUSS_502["GAUSS_50204"] % (
            "[%s] of node [%s]" % (key, nodeName))
        detail = (" Return status: %d. value: %s. Check whether "
                  "the dataNum is correct first." % (retStatus, retValue))
        raise Exception(where + detail)

    def __checkAZForSingleInst(self):
        """
        function : check DB replication: every primary DB instance must
                   have the same number of peer instances, and that number
                   must not exceed 8
        input : NA
        output : NA
        raises : Exception(GAUSS_53200) when two primaries differ in their
                 peer count, Exception(GAUSS_51230) when there are more
                 than 8 peers
        """
        # None (not 0) marks "no primary seen yet". With 0 as the marker a
        # primary without any peer was indistinguishable from "nothing seen",
        # so the check passed or failed depending on the node order.
        peerNum = None
        for dbNode in self.dbNodes:
            for inst in dbNode.datanodes:
                if inst.instanceType != MASTER_INSTANCE:
                    continue
                currentNum = len(self.getPeerInstance(inst))
                if peerNum is None:
                    peerNum = currentNum
                elif peerNum != currentNum:
                    raise Exception(ErrorCode.GAUSS_532["GAUSS_53200"])

        if peerNum is not None and peerNum > 8:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51230"] % (
                "database node standbys", "be less than 9") + " Please set it.")

    def __getDNPeerInstance(self, dbInst):
        """
        function : Get DB peer instance of specified instance when write
                   static configuration file. Peers are the datanodes of
                   all nodes that share the mirror id of dbInst, dbInst
                   itself excluded.
        input : instance
        output : [] of peer instances; sorted by instanceId when dbInst is
                 a primary, otherwise in node traversal order
        """
        peers = [inst
                 for dbNode in self.dbNodes
                 for inst in dbNode.datanodes
                 if inst.mirrorId == dbInst.mirrorId
                 and inst.instanceId != dbInst.instanceId]

        if dbInst.instanceType != MASTER_INSTANCE:
            return peers

        # In a primary multi-standby cluster,
        # since the CM update system table depends on the DB read/write
        # sequence in the static configuration file,
        # we must sort the DN's standby list by instanceId.
        # sorted() is stable and returns every peer exactly once, which
        # replaces the former nested-loop sort.
        return sorted(peers, key=lambda inst: inst.instanceId)

    def saveToStaticConfig(self, filePath, localNodeId, dbNodes=None,
                           upgrade=False):
        """
        function : Save cluster info into the binary static config file.
                   The file starts with a crc-prefixed header; every node
                   record begins on the next PAGE_SIZE boundary and the
                   file is padded to a whole number of pages.
        input : String,int,[],boolean
        output : NA
        """
        number = None
        if not self.dbNodes and dbNodes:
            self.dbNodes = dbNodes
        if upgrade:
            # the version number decides how wide the crc field is
            versionFile = os.path.join(os.path.split(filePath)[0],
                                       "upgrade_version")
            _version, number, _commitid = VersionInfo.get_version_info(
                versionFile)
        try:
            if dbNodes is None:
                dbNodes = self.dbNodes
            createFileInSafeMode(filePath)
            with open(filePath, "wb") as fp:
                header = b"".join([
                    struct.pack("I", 28),                              # len
                    struct.pack("I", BIN_CONFIG_VERSION_SINGLE_INST),  # version
                    struct.pack("q", int(time.time())),                # time
                    struct.pack("I", len(dbNodes)),                    # node count
                    struct.pack("I", localNodeId),                     # local node
                ])
                crc = binascii.crc32(header)
                # versions up to 92.200 store the crc as 8 bytes
                crcFormat = "q" if upgrade and float(number) <= 92.200 else "I"
                fp.write(struct.pack(crcFormat, crc) + header)

                for dbNode in dbNodes:
                    fp.seek((fp.tell() // PAGE_SIZE + 1) * PAGE_SIZE)
                    fp.write(self.__packNodeInfo(dbNode, number,
                                                 upgrade=upgrade))

                tailBytes = PAGE_SIZE - fp.tell() % PAGE_SIZE
                if tailBytes != PAGE_SIZE:
                    fp.write(struct.pack("%dx" % tailBytes))
                fp.flush()
            os.chmod(filePath, DIRECTORY_PERMISSION)
        except Exception as e:
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50205"] % \
                            "static configuration file"
                            + " Error: \n%s" % str(e))

    def __packNodeInfo(self, dbNode, number, upgrade=False):
        """
        function : Pack the info of node: all sections in their fixed
                   order, prefixed with the crc32 of the packed sections
        input : dbNode, version number, boolean
        output : bytes
        """
        sections = [
            struct.pack("I", dbNode.id),                        # node id
            struct.pack("64s", dbNode.name.encode("utf-8")),    # node name
            struct.pack("64s", dbNode.azName.encode("utf-8")),  # az info
            struct.pack("I", dbNode.azPriority),
            self.__packIps(dbNode.backIps),                     # backIp
            self.__packIps(dbNode.sshIps),                      # sshIp
            self.__packCmsInfo(dbNode),                         # cm_server
            self.__packAgentInfo(dbNode),                       # cm_agent
            self.__packGtmInfo(dbNode),                         # gtm
            # gtmProxy info is no longer saved, only a placeholder
            self.__packGtmProxyInfo(dbNode),
            self.__packCooInfo(dbNode),                         # cn
            self.__packDataNode(dbNode),                        # dn
            self.__packEtcdInfo(dbNode),                        # etcd
            # sctp begin/end port are no longer saved, only placeholders
            struct.pack("I", 0),
            struct.pack("I", 0),
        ]
        info = b"".join(sections)
        crc = binascii.crc32(info)

        # versions up to 92.200 store the crc as 8 bytes
        crcFormat = "q" if upgrade and float(number) <= 92.200 else "I"
        return struct.pack(crcFormat, crc) + info

    def __packEtcdInfo(self, dbNode):
        """
        function : Pack the info of etcd. A node without etcd gets a
                   zero-filled record, a node with one etcd gets its data,
                   any other count yields empty bytes.
        input : dbNode
        output : bytes
        """
        etcdCount = len(dbNode.etcds)

        if etcdCount == 0:
            return b"".join([
                struct.pack("I", 0),      # etcd count
                struct.pack("I", 0),      # etcd id
                struct.pack("i", 0),      # etcd mirror id
                struct.pack("64x"),       # etcd name
                struct.pack("1024x"),     # datadir
                self.__packIps([]),       # listen ip
                struct.pack("I", 0),      # listen port
                self.__packIps([]),       # ha ip
                struct.pack("I", 0),      # ha port
            ])

        if etcdCount == 1:
            etcdInst = dbNode.etcds[0]
            return b"".join([
                struct.pack("I", 1),                        # etcd count
                struct.pack("I", etcdInst.instanceId),      # etcd id
                struct.pack("i", etcdInst.mirrorId),        # etcd mirror id
                # etcd name
                struct.pack("64s", b"etcd_%d" % etcdInst.instanceId),
                # datadir
                struct.pack("1024s", etcdInst.datadir.encode("utf-8")),
                self.__packIps(etcdInst.listenIps),         # listen ip
                struct.pack("I", etcdInst.port),            # listen port
                self.__packIps(etcdInst.haIps),             # ha ip
                struct.pack("I", etcdInst.haPort),          # ha port
            ])

        return b""

    def __packCmsInfo(self, dbNode):
        """
        function : Pack the info of cm server. A node without cm server
                   gets a record that only carries the cm data dir, a node
                   with one cm server gets its data plus the ha address of
                   its first peer, any other count yields empty bytes.
        input : dbNode
        output : bytes
        """
        cmsCount = len(dbNode.cmservers)

        if cmsCount == 0:
            return b"".join([
                struct.pack("I", 0),      # cm server id
                struct.pack("I", 0),      # cm_server mirror id
                # datadir
                struct.pack("1024s", dbNode.cmDataDir.encode("utf-8")),
                struct.pack("I", 0),      # cm server level
                struct.pack("128x"),      # float ip
                self.__packIps([]),       # listen ip
                struct.pack("I", 0),      # listen port
                self.__packIps([]),       # local ha ip
                struct.pack("I", 0),      # local ha port
                struct.pack("I", 0),      # is primary
                self.__packIps([]),       # peer ha ip
                struct.pack("I", 0),      # peer ha port
            ])

        if cmsCount == 1:
            cmsInst = dbNode.cmservers[0]
            fields = [
                struct.pack("I", cmsInst.instanceId),   # cm server id
                struct.pack("I", cmsInst.mirrorId),     # cm_server mirror id
                # datadir
                struct.pack("1024s", dbNode.cmDataDir.encode("utf-8")),
                struct.pack("I", cmsInst.level),        # cm server level
                # float ip
                struct.pack("128s", self.cmsFloatIp.encode("utf-8")),
                self.__packIps(cmsInst.listenIps),      # listen ip
                struct.pack("I", cmsInst.port),         # listen port
                self.__packIps(cmsInst.haIps),          # local ha ip
                struct.pack("I", cmsInst.haPort),       # local ha port
                struct.pack("I", cmsInst.instanceType),  # instance type
            ]
            # NOTE(review): assumes getPeerInstance returns at least one
            # peer; an empty result makes the index below raise IndexError.
            firstPeer = self.getPeerInstance(cmsInst)[0]
            fields.append(self.__packIps(firstPeer.haIps))      # peer ha ip
            fields.append(struct.pack("I", firstPeer.haPort))   # peer ha port
            return b"".join(fields)

        return b""

    def __packAgentInfo(self, dbNode):
        """
        function : Pack the info of agent. Only a node with exactly one cm
                   agent produces data, any other count yields empty bytes.
        input : dbNode
        output : bytes
        """
        if len(dbNode.cmagents) != 1:
            return b""

        cmaInst = dbNode.cmagents[0]
        return b"".join([
            struct.pack("I", cmaInst.instanceId),   # Agent id
            struct.pack("i", cmaInst.mirrorId),     # Agent mirror id
            self.__packIps(cmaInst.listenIps),      # agent ips
        ])

    def __packGtmInfo(self, dbNode):
        """
        function : Pack the info of gtm. A node without gtm gets a
                   zero-filled record, a node with one gtm gets its data
                   with zero-filled peer fields, any other count yields
                   empty bytes.
        input : dbNode
        output : bytes
        """
        gtmCount = len(dbNode.gtms)

        if gtmCount == 0:
            return b"".join([
                struct.pack("I", 0),      # gtm id
                struct.pack("I", 0),      # gtm mirror id
                struct.pack("I", 0),      # gtm count
                struct.pack("1024x"),     # datadir
                self.__packIps([]),       # listen ip
                struct.pack("I", 0),      # listen port
                struct.pack("I", 0),      # instance type
                self.__packIps([]),       # local ha ip
                struct.pack("I", 0),      # local ha port
                struct.pack("1024x"),     # peer gtm datadir
                self.__packIps([]),       # peer ha ip
                struct.pack("I", 0),      # peer ha port
            ])

        if gtmCount == 1:
            gtmInst = dbNode.gtms[0]
            return b"".join([
                struct.pack("I", gtmInst.instanceId),       # gtm id
                struct.pack("I", gtmInst.mirrorId),         # gtm mirror id
                struct.pack("I", 1),                        # gtm count
                # datadir
                struct.pack("1024s", gtmInst.datadir.encode("utf-8")),
                self.__packIps(gtmInst.listenIps),          # listen ip
                struct.pack("I", gtmInst.port),             # listen port
                struct.pack("I", gtmInst.instanceType),     # instance type
                self.__packIps(gtmInst.haIps),              # local ha ip
                struct.pack("I", gtmInst.haPort),           # local ha port
                struct.pack("1024x"),                       # peer gtm datadir
                self.__packIps([]),                         # peer ha ip
                struct.pack("I", 0),                        # peer ha port
            ])

        return b""

    def __packGtmProxyInfo(self, dbNode):
        """
        function : Pack the placeholder of gtm proxy: three zero integers,
                   an empty ip list and one more zero integer. Nothing is
                   read from dbNode.
        input : dbNode
        output : bytes
        """
        zero = struct.pack("I", 0)
        return b"".join([zero, zero, zero, self.__packIps([]), zero])

    def __packCooInfo(self, dbNode):
        """
        function : Pack the info of coordinator. A node without
                   coordinator gets a zero-filled record, a node with one
                   coordinator gets its data, any other count yields empty
                   bytes.
        input : dbNode
        output : bytes
        """
        cooCount = len(dbNode.coordinators)

        if cooCount == 0:
            return b"".join([
                struct.pack("I", 0),      # coordinator id
                struct.pack("i", 0),      # coordinator mirror id
                struct.pack("I", 0),      # coordinator count
                struct.pack("1024x"),     # datadir
                struct.pack("1024x"),     # ssdDir
                self.__packIps([]),       # listen ip
                struct.pack("I", 0),      # listen port
                struct.pack("I", 0),      # ha port
            ])

        if cooCount == 1:
            cooInst = dbNode.coordinators[0]
            return b"".join([
                struct.pack("I", cooInst.instanceId),   # coordinator id
                struct.pack("i", cooInst.mirrorId),     # coordinator mirror id
                struct.pack("I", 1),                    # coordinator count
                # datadir
                struct.pack("1024s", cooInst.datadir.encode("utf-8")),
                # ssdDir
                struct.pack("1024s", cooInst.ssdDir.encode("utf-8")),
                self.__packIps(cooInst.listenIps),      # listen ip
                struct.pack("I", cooInst.port),         # listen port
                struct.pack("I", cooInst.haPort),       # ha port
            ])

        return b""

    def __packDataNode(self, dbNode):
        """
        function : Pack the info of datanode: the datanode count followed
                   by one record per datanode. Each record holds the
                   instance data, one slot per peer instance and
                   zero-filled slots up to
                   MIRROR_COUNT_REPLICATION_MAX - 1 peers.
        input : dbNode
        output : bytes
        """
        chunks = [struct.pack("I", len(dbNode.datanodes))]
        for dnInst in dbNode.datanodes:
            peers = self.__getDNPeerInstance(dnInst)
            chunks += [
                struct.pack("I", dnInst.instanceId),    # datanode id
                struct.pack("I", dnInst.mirrorId),      # datanode mirror id
                # datadir
                struct.pack("1024s", dnInst.datadir.encode("utf-8")),
                # xlogdir
                struct.pack("1024s", dnInst.xlogdir.encode("utf-8")),
                # ssdDir
                struct.pack("1024s", dnInst.ssdDir.encode("utf-8")),
                self.__packIps(dnInst.listenIps),       # listen ip
                struct.pack("I", dnInst.port),          # port
                struct.pack("I", dnInst.instanceType),  # instance type
                self.__packIps(dnInst.haIps),           # local ha ip
                struct.pack("I", dnInst.haPort),        # local ha port
            ]

            slotCount = MIRROR_COUNT_REPLICATION_MAX - 1

            for peerInst in peers:
                chunks += [
                    # peer datadir
                    struct.pack("1024s", peerInst.datadir.encode("utf-8")),
                    self.__packIps(peerInst.haIps),             # peer ha ip
                    struct.pack("I", peerInst.haPort),          # peer ha port
                    struct.pack("I", peerInst.instanceType),    # instance type
                ]
            # the unused peer slots are written as zeros
            for _ in range(len(peers), slotCount):
                chunks += [
                    struct.pack("1024x"),     # peer datadir
                    self.__packIps([]),       # peer ha ip
                    struct.pack("I", 0),      # peer ha port
                    struct.pack("I", 0),      # instance type
                ]
        return b"".join(chunks)

    def __packIps(self, ips):
        """
        function : Pack the info of ips: the ip count, one 128 byte field
                   per ip and zero-filled fields up to MAX_IP_NUM entries
        input : []
        output : bytes
        """
        ipCount = len(ips)

        packed = [struct.pack("I", ipCount)]
        packed.extend(struct.pack("128s", ip.encode("utf-8")) for ip in ips)
        packed.extend(struct.pack("128x")
                      for _ in range(ipCount, MAX_IP_NUM))

        return b"".join(packed)

    def isSingleInstCluster(self):
        """
        function : Tell whether clusterType equals CLUSTER_TYPE_SINGLE_INST
        input : NA
        output : boolean
        """
        clusterType = self.clusterType
        return clusterType == CLUSTER_TYPE_SINGLE_INST

    def isSingleNode(self):
        """
        function : Tell whether the cluster has at most one DN instance
        input : NA
        output : boolean
        """
        dnInstanceNum = self.__getDnInstanceNum()
        return dnInstanceNum <= 1

    def doRefreshConf(self, user, localHostName, sshtool, logger=None):
        """
        function : Run the three refresh steps in order:
                   __createDynamicConfig, __create_simple_datanode_config
                   and __reset_replconninfo
        input : String,String,object,object
        output : NA
        """
        self.__createDynamicConfig(user, localHostName, sshtool,
                                   logger=logger)
        self.__create_simple_datanode_config(user, localHostName, sshtool)
        self.__reset_replconninfo(user, sshtool)

    def __createDynamicConfig(self, user, localHostName, sshtool, logger=None):
        """
        function : Save cluster info into the dynamic config file and send
                   the file to all nodes. The file is removed again when
                   writing or sending fails. Exactly one primary dn must be
                   found over all nodes.
        input : String,String,object,object
        output : NA
        """
        dynamicConfigFile = self.__getDynamicConfig(user)
        removeCmd = "rm -f %s" % dynamicConfigFile
        # a file left over from an earlier run is dropped first
        if os.path.exists(dynamicConfigFile):
            (status, output) = subprocess.getstatusoutput(removeCmd)
            if status != 0:
                raise Exception(ErrorCode.GAUSS_504["GAUSS_50407"] +
                                " Error: \n%s." % str(output) +
                                "The cmd is %s" % removeCmd)
        try:
            FileUtil.createFileInSafeMode(dynamicConfigFile)
            with open(dynamicConfigFile, "wb") as fp:
                header = b"".join([
                    struct.pack("I", 24),                              # len
                    struct.pack("I", BIN_CONFIG_VERSION_SINGLE_INST),  # version
                    struct.pack("q", int(time.time())),                # time
                    struct.pack("I", len(self.dbNodes)),               # node count
                ])
                fp.write(struct.pack("I", binascii.crc32(header)) + header)

                primaryTotal = 0
                for dbNode in self.dbNodes:
                    # every node record starts on the next page boundary
                    fp.seek((fp.tell() // PAGE_SIZE + 1) * PAGE_SIZE)
                    (primaryOnNode, nodeInfo) = self.__packDynamicNodeInfo(
                        dbNode, localHostName, sshtool, logger)
                    primaryTotal += primaryOnNode
                    fp.write(nodeInfo)
                if primaryTotal != 1:
                    raise Exception(ErrorCode.GAUSS_512["GAUSS_51230"] %
                                    ("master dn", "equal to 1"))

                tailBytes = PAGE_SIZE - fp.tell() % PAGE_SIZE
                if tailBytes != PAGE_SIZE:
                    fp.write(struct.pack("%dx" % tailBytes))
                fp.flush()
            os.chmod(dynamicConfigFile, ConstantsBase.KEY_FILE_PERMISSION)
        except Exception as e:
            # do not leave a half written file behind
            subprocess.getstatusoutput(removeCmd)
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50205"] % \
                            "dynamic configuration file"
                            + " Error: \n%s" % str(e))
        try:
            self.__sendDynamicCfgToAllNodes(localHostName,
                                            dynamicConfigFile,
                                            dynamicConfigFile)
        except Exception as e:
            # remove the file on every node when the distribution failed
            sshtool.getSshStatusOutput(removeCmd, self.getClusterNodeNames())
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50205"] % \
                            "dynamic configuration file" +
                            " Error: \n%s" % str(e))

    def __create_simple_datanode_config(self, user, localhostname, sshtool):
        """
        function : Write one "name=role code" line per datanode status
                   entry into the simple datanode config file and send the
                   file to all nodes. The file is removed again when
                   writing or sending fails.
        input : String,String,object
        output : NA
        """
        simpleDNConfig = self.__getDynamicSimpleDNConfig(user)
        removeCmd = "rm -f %s" % simpleDNConfig
        if os.path.exists(simpleDNConfig):
            (status, output) = subprocess.getstatusoutput(removeCmd)
            if status != 0:
                raise Exception(ErrorCode.GAUSS_504["GAUSS_50407"] +
                                " Error: \n%s." % str(output) +
                                "The cmd is %s" % removeCmd)
        statusLines = self.__getStatusByOM(user)
        # Matches text lines that contain an IPv4 or IPv6 address.
        dnLinePattern = re.compile(r'(\d+) (.*) ((?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})|(?:[0-9a-fA-F]{0,4}(?::[0-9a-fA-F]{0,4})*::?(?:[0-9a-fA-F]{0,4})?)) (.*)')
        if not self.hasNoCm():
            # with cm, the entries are the '|' separated parts of the
            # last non-empty line
            nonEmptyLines = [line for line in statusLines if line]
            statusLines = nonEmptyLines[-1].split('|')
        matchedCount = sum(1 for line in statusLines
                           if dnLinePattern.search(line))
        # The trailing matchedCount entries are taken as the datanode lines.
        # NOTE(review): with matchedCount == 0 the slice below is [-0:],
        # i.e. the whole list - confirm this fallback is intended.
        dnLines = statusLines[-matchedCount:]
        roleCodes = {'Primary': 0, 'Standby': 1, 'Cascade': 3, 'Unknown': 9}
        try:
            with open(simpleDNConfig, "w") as fp:
                for dnLine in dnLines:
                    tokens = dnLine.split()
                    dnState = tokens[7]
                    dnName = tokens[1]
                    # states missing from the table are written as Unknown
                    roleCode = roleCodes.get(dnState, roleCodes['Unknown'])
                    fp.write("%s=%d\n" % (dnName, roleCode))
        except Exception as e:
            subprocess.getstatusoutput(removeCmd)
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50205"] %
                            "dynamic configuration file"
                            + " Error: \n%s" % str(e))
        try:
            self.__sendDynamicCfgToAllNodes(localhostname,
                                            simpleDNConfig,
                                            simpleDNConfig)
        except Exception as e:
            # remove the file on every node when the distribution failed
            sshtool.getSshStatusOutput(removeCmd, self.getClusterNodeNames())
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50205"] %
                            "dynamic configuration file" +
                            " Error: \n%s" % str(e))

    def __reset_replconninfo(self, user, sshtool):
        """
        function : Run local/Resetreplconninfo.py with "-t reset" on every
                   cluster node through sshtool, one node after the other
                   (added for cascade). The ssh timeout is set to 120.
        input : String,object
        output : NA
        """
        scriptDir = os.path.dirname(os.path.realpath(__file__))
        resetScript = scriptDir + '/../../local/Resetreplconninfo.py'
        cmd = "python3 %s -U %s -t reset" % (resetScript, user)
        sshtool.setTimeOut(120)
        for nodeName in self.getClusterNodeNames():
            (status, output) = sshtool.getSshStatusOutput(cmd, [nodeName])
            if status[nodeName] == 'Success':
                continue
            raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"]
                            % cmd + "Error:\n%s" % output)

    def __packDynamicNodeInfo(self, dbNode, localHostName, sshtool, logger=None):
        """
        function : Pack the dynamic info of node: node id, node name,
                   datanode count and one record per datanode, prefixed
                   with the crc32 of the packed data. The state of every
                   datanode is refreshed through __getDnState first.
        input : dbNode,String,object,object
        output : (number of primary datanodes on the node, bytes)
        """
        chunks = [
            struct.pack("I", dbNode.id),                        # node id
            struct.pack("64s", dbNode.name.encode("utf-8")),    # node name
            struct.pack("I", len(dbNode.datanodes)),
        ]
        primaryNum = 0
        for dnInst in dbNode.datanodes:
            self.__getDnState(dnInst, dbNode, localHostName, sshtool, logger)
            # every role other than the two named ones is stored as standby
            localRole = dnInst.localRole
            if localRole == "Primary":
                instanceType = MASTER_INSTANCE
                primaryNum += 1
            elif localRole == "Cascade Standby":
                instanceType = CASCADE_STANDBY
            else:
                instanceType = STANDBY_INSTANCE
            if logger:
                logger.debug(f"""instanceInfo: name: {dnInst.hostname}, \
                         role: {dnInst.localRole}, \
                         state: {dnInst.state}""")
            chunks += [
                struct.pack("I", dnInst.instanceId),    # datanode id
                struct.pack("I", dnInst.mirrorId),      # datanode mirror id
                # instanceType such as master, standby, dumpstandby
                struct.pack("I", instanceType),
                # datadir
                struct.pack("1024s", dnInst.datadir.encode("utf-8")),
            ]
        chunks.append(struct.pack("I", 0))
        chunks.append(struct.pack("I", 0))
        info = b"".join(chunks)
        crc = binascii.crc32(info)
        return (primaryNum, struct.pack("I", crc) + info)

    def __getClusterSwitchTime(self, dynamicConfigFile):
        """
        function : get the switch time stored in the header of a
                   dynamic configuration file
        input : String (path of the dynamic configuration file)
        output : switchTime (the 8-byte integer of the 24-byte header)
        """
        try:
            # the context manager closes the file on every exit path
            with open(dynamicConfigFile, "rb") as fp:
                header = fp.read(24)
            # header fields: crc, length, version, switch time, node count
            switchTime = struct.unpack("=IIIqi", header)[3]
        except Exception as e:
            # open failures and short/corrupt headers are reported alike
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51236"]
                            + " Error: \n%s." % str(e)) from e
        return switchTime

    def __getDynamicConfig(self, user):
        """
        function : get the path of the cluster dynamic configuration file
        input : String (user name)
        output : String (file path below the resolved GAUSSHOME/bin)
        """
        installPath = self.__getEnvironmentParameterValue("GAUSSHOME", user)
        if installPath == "":
            raise Exception(
                ErrorCode.GAUSS_502["GAUSS_50201"]
                % ("installation path of designated user [%s]" % user))
        # if under upgrade, and use chose strategy, we may get a wrong path,
        # so the real path of GAUSSHOME is used
        return "%s/bin/cluster_dynamic_config" % os.path.realpath(installPath)
    def __getDynamicSimpleDNConfig(self, user):
        """
        function : get the path of the cluster datanode role configuration
                   file
        input : String (user name)
        output : String (file path below the resolved GAUSSHOME/bin)
        """
        installPath = self.__getEnvironmentParameterValue("GAUSSHOME", user)
        if installPath == "":
            raise Exception(
                ErrorCode.GAUSS_502["GAUSS_50201"]
                % ("installation path of designated user [%s]" % user))
        # if under upgrade, and use chose strategy, we may get a wrong path,
        # so the real path of GAUSSHOME is used
        return "%s/bin/cluster_dnrole_config" % os.path.realpath(installPath)

    def dynamicConfigExists(self, user):
        """
        function : check whether the dynamic configuration file of the
                   given user exists
        input : String (user name)
        output : boolean
        """
        return os.path.exists(self.__getDynamicConfig(user))

    def checkClusterDynamicConfig(self, user, localHostName):
        """
        function : make sure the dynamic config file on every node is the
                   newest one. The file of every remote node is fetched with
                   scp, the copy with the largest switch time is selected and
                   sent to all nodes when the copies are not consistent.
        input : String (user name), String (local host name)
        output : none
        """
        if self.__getDnInstanceNum() <= 1:
            return
        gaussHome = self.__getEnvironmentParameterValue("GAUSSHOME", user)
        if gaussHome == "":
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % \
                            ("installation path of designated user [%s]"
                             % user))
        # if under upgrade, and use chose strategy, we may get a wrong path,
        # so we will use the realpath of gausshome
        gaussHome = os.path.realpath(gaussHome)
        dynamicConfigFile = "%s/bin/cluster_dynamic_config" % gaussHome
        lastSwitchTime = 0
        lastDynamicConfigFile = ""
        fileConsistent = False
        fileExist = False
        # becomes True as soon as a per-node copy may have been written
        copyFetched = False
        try:
            if os.path.exists(dynamicConfigFile):
                lastSwitchTime = self.__getClusterSwitchTime(dynamicConfigFile)
                lastDynamicConfigFile = dynamicConfigFile
                fileExist = True
                fileConsistent = True
            for dbNode in self.dbNodes:
                if dbNode.name == localHostName:
                    continue
                remoteDynamicConfigFile = "%s/bin/cluster_dynamic_config_%s" \
                                          % (gaussHome, dbNode.name)
                node_ip = dbNode.sshIps[0]
                if get_ip_version(node_ip) == NET_IPV6:
                    # scp file is to the ipv6 address, needs to add [] to ipaddress:
                    # scp a.txt [2407:c080:1200:22a0:613f:8d3b:caa:2335]:/data
                    node_ip = "[" + node_ip + "]"
                cmd = "export LD_LIBRARY_PATH=/usr/lib64;/usr/bin/scp %s:%s %s" % (
                    node_ip, dynamicConfigFile, remoteDynamicConfigFile)
                copyFetched = True
                status, output = subprocess.getstatusoutput(cmd)
                if status:
                    # a node without the file only makes the set inconsistent
                    if output.find("No such file or directory") >= 0:
                        fileConsistent = False
                        continue
                    raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd
                                    + " Error:\n" + output)
                if os.path.exists(remoteDynamicConfigFile):
                    fileExist = True
                    switchTime = self.__getClusterSwitchTime(
                        remoteDynamicConfigFile)
                    if switchTime > lastSwitchTime:
                        # a newer copy replaces the current candidate
                        lastSwitchTime = switchTime
                        lastDynamicConfigFile = remoteDynamicConfigFile
                        fileConsistent = False
                    elif switchTime < lastSwitchTime:
                        fileConsistent = False
            # if dynamic config file exist, but file time is not same,
            # send the valid file to all nodes
            if fileExist and not fileConsistent:
                self.__sendDynamicCfgToAllNodes(localHostName,
                                                lastDynamicConfigFile,
                                                dynamicConfigFile)
        finally:
            # Remove the fetched per-node copies on every exit path, so a
            # failed scp, a corrupt header or a failed distribution does not
            # leave them behind. The removal itself stays best effort.
            if fileExist or copyFetched:
                cleanCmd = "rm -f %s/bin/cluster_dynamic_config_*" % gaussHome
                subprocess.getstatusoutput(cleanCmd)

    def __sendDynamicCfgToAllNodes(self,
                                   localHostName,
                                   sourceFile,
                                   targetFile):
        """
        function : copy the selected dynamic config file to every node:
                   cp on the local node (skipped when source and target are
                   the same path), scp to every other node
        input : String (local host name), String (source file),
                String (target file)
        output : none
        """
        for dbNode in self.dbNodes:
            if dbNode.name == localHostName:
                if sourceFile == targetFile:
                    # the local file already is the selected one
                    continue
                cmd = "cp -f  %s %s" % (sourceFile, targetFile)
            else:
                peer = self.getDbNodeByName(dbNode.name)
                address = peer.sshIps[0]
                if get_ip_version(address) == NET_IPV6:
                    # scp file is to the ipv6 address, needs to add [] to ipaddress:
                    # scp a.txt [2407:c080:1200:22a0:613f:8d3b:caa:2335]:/data
                    address = "[" + address + "]"
                cmd = "export LD_LIBRARY_PATH=/usr/lib64;/usr/bin/scp %s %s:%s" % (sourceFile, address, targetFile)
            status, output = subprocess.getstatusoutput(cmd)
            if status:
                raise Exception(ErrorCode.GAUSS_514["GAUSS_51400"] % cmd +
                                " Error:\n" + output)

    def readDynamicConfig(self, user):
        """
        function : read cluster information from dynamic configuration file
                   only used for start cluster after switchover.
                   Sets self.name, self.appPath and self.logPath from the
                   user's environment and appends one node object per node
                   record to self.dbNodes.
        input : String (user name)
        output : NA
        """
        # The error handler formats this name into its message. Binding it
        # before the try block keeps a failure that happens ahead of the real
        # assignment from being replaced by an UnboundLocalError.
        dynamicConfigFile = "cluster_dynamic_config"
        try:
            self.name = self.__getEnvironmentParameterValue("GS_CLUSTER_NAME",
                                                            user)
            self.appPath = self.__getEnvironmentParameterValue("GAUSSHOME",
                                                               user)
            logPathWithUser = self.__getEnvironmentParameterValue("GAUSSLOG",
                                                                  user)
            splitMark = "/%s" % user
            # set log path without user
            # find the path from right to left
            self.logPath = \
                logPathWithUser[0:(logPathWithUser.rfind(splitMark))]
            dynamicConfigFile = self.__getDynamicConfig(user)
            # the version file lives next to the dynamic config file
            dynamicConfigFilePath = os.path.split(dynamicConfigFile)[0]
            versionFile = os.path.join(
                dynamicConfigFilePath, "upgrade_version")
            # only the version number is needed to pick the file layout
            _, number, _ = VersionInfo.get_version_info(versionFile)
            totalMaterDnNum = 0
            with open(dynamicConfigFile, "rb") as fp:
                if float(number) <= 92.200:
                    # layout up to 92.200: the crc takes 8 bytes
                    info = fp.read(28)
                    nodeNum = struct.unpack("=qIIqi", info)[4]
                else:
                    info = fp.read(24)
                    nodeNum = struct.unpack("=IIIqi", info)[4]
                for _ in range(nodeNum):
                    # every node record starts on the next page boundary
                    offset = (fp.tell() // PAGE_SIZE + 1) * PAGE_SIZE
                    fp.seek(offset)
                    (dbNode, materDnNum) = self.__unpackDynamicNodeInfo(
                        fp, number)
                    totalMaterDnNum += materDnNum
                    self.dbNodes.append(dbNode)
            if totalMaterDnNum != 1:
                raise Exception(ErrorCode.GAUSS_512["GAUSS_51230"] %
                                ("master dn", "1"))
        except Exception as e:
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] %
                            dynamicConfigFile + " Error:\n" + str(e)) from e

    def __unpackDynamicNodeInfo(self, fp, number):
        """
        function : read one node record from an opened dynamic config file
        input : file object positioned at the record, version number
        output : (dbNode, number of master datanodes found in the record)
        """
        # records of versions up to 92.200 carry an 8-byte crc, later ones
        # a 4-byte crc
        if float(number) <= 92.200:
            headFormat, headSize = "=qI64s", 76
        else:
            headFormat, headSize = "=II64s", 72
        (_crc, nodeId, rawName) = struct.unpack(headFormat, fp.read(headSize))
        # the name field is padded with NUL bytes
        nodeName = rawName.decode().strip('\x00')
        dbNode = dbNodeInfo(nodeId, nodeName)
        (dataNodeNums,) = struct.unpack("=I", fp.read(4))
        dbNode.datanodes = []
        masterCount = 0
        for _ in range(dataNodeNums):
            dnInst = instanceInfo()
            dnInst.hostname = nodeName
            idFields = struct.unpack("=III", fp.read(12))
            dnInst.instanceId, dnInst.mirrorId, dnInst.instanceType = idFields
            if dnInst.instanceType == MASTER_INSTANCE:
                masterCount += 1
            elif dnInst.instanceType not in (STANDBY_INSTANCE,
                                             DUMMY_STANDBY_INSTANCE,
                                             CASCADE_STANDBY):
                # any other value is not a known datanode type
                raise Exception(ErrorCode.GAUSS_512["GAUSS_51204"] %
                                ("DN", dnInst.instanceType))
            (rawDir,) = struct.unpack("=1024s", fp.read(1024))
            dnInst.datadir = rawDir.decode().strip('\x00')
            dbNode.datanodes.append(dnInst)
        return (dbNode, masterCount)

    def hasNoCm(self):
        """
        function : tell whether the cluster has no cm server
        input : NA
        output : True when the cm server count is below 1, otherwise False
        """
        cmServerCount = self.cmscount
        return cmServerCount < 1

    def getDbNodeByID(self, inputid):
        """
        function : get the first node whose id equals inputid
        input : node id
        output : the matching node, or None when no node has that id
        """
        matches = (node for node in self.dbNodes if node.id == inputid)
        return next(matches, None)

    def __read_cluster_float_ips(self, dn_float_ips):
        """
        Read the cluster level float IP of every resource name in
        dn_float_ips into self.float_ips; names that are already present
        are left untouched.
        """
        for name_group in dn_float_ips:
            for res_name in name_group:
                if res_name in self.float_ips:
                    continue
                ret_status, ret_value = self.readOneClusterConfigItem(
                    xmlRootNode, res_name, "CLUSTER")
                if ret_status != 0:
                    raise Exception(ErrorCode.GAUSS_502["GAUSS_50204"] % \
                                    "float IP." + " Error: \n%s" % ret_value)
                self.float_ips[res_name] = ret_value.strip()

    def printStaticConfig(self, xmlFile, fileName=""):
        """
        function : print the static configuration. The header values come
                   from the cluster described by xmlFile, the node sections
                   come from the nodes held by this object.
        input : String (XML file), String (output file name)
        output : NA
        """
        xmlCluster = dbClusterInfo()
        xmlCluster.initFromXml(xmlFile)
        try:
            divider = "=" * 60 + "\n"
            # header section
            parts = [
                "NodeHeader:\n",
                "version:%u\n" % xmlCluster.version,
                "time:%ld\n" % xmlCluster.installTime,
                "nodeCount:%u\n" % xmlCluster.nodeCount,
                "node:%u\n" % xmlCluster.localNodeId,
                divider,
            ]
            dnTotalNum = self.__getDnInstanceNum()
            for dbNode in self.dbNodes:
                parts.append("azName:%s\n" % dbNode.azName)
                parts.append("azPriority:%u\n" % dbNode.azPriority)
                parts.append("node :%u\n" % dbNode.id)
                parts.append("nodeName:%s\n" % dbNode.name)

                parts.append("ssh channel :\n")
                for channelNo, sshIp in enumerate(dbNode.sshIps, 1):
                    parts.append("sshChannel %u:%s\n" % (channelNo, sshIp))
                parts.append("datanodeCount :%u\n" % len(dbNode.datanodes))
                for dnNo, dnInst in enumerate(dbNode.datanodes, 1):
                    parts.append("datanodeInstanceType :%s\n" %
                                 DICT_INSTANCE[dnInst.instanceType])
                    parts.append("datanode %u:\n" % dnNo)
                    parts.append(
                        "datanodeLocalDataPath :%s\n" % dnInst.datadir)
                    parts.append("datanodeXlogPath :%s\n" % dnInst.xlogdir)
                    for ipNo, listenIp in enumerate(dnInst.listenIps, 1):
                        parts.append(
                            "datanodeListenIP %u:%s\n" % (ipNo, listenIp))
                    parts.append("datanodePort :%u\n" % dnInst.port)
                    for ipNo, haIp in enumerate(dnInst.haIps, 1):
                        parts.append(
                            "datanodeLocalHAIP %u:%s\n" % (ipNo, haIp))
                    parts.append("datanodeLocalHAPort :%u\n" % dnInst.haPort)
                    parts.append("dn_replication_num: %u\n" % dnTotalNum)
                    # peers are printed for at most
                    # MIRROR_COUNT_REPLICATION_MAX - 1 slots
                    maxPeerNum = min(self.nodeCount,
                                     MIRROR_COUNT_REPLICATION_MAX)
                    for peerNo in range(maxPeerNum - 1):
                        peer = dnInst.peerInstanceInfos[peerNo]
                        parts.append("datanodePeer%uDataPath :%s\n" %
                                     (peerNo, peer.peerDataPath))
                        for ipNo, peerHaIP in enumerate(peer.peerHAIPs, 1):
                            parts.append("datanodePeer%uHAIP %u:%s\n" % (peerNo, ipNo, peerHaIP))
                        parts.append("datanodePeer%uHAPort :%u\n" %
                                     (peerNo, peer.peerHAPort))

                    parts.append(divider)
            self.__fprintContent("".join(parts), fileName)
        except Exception as e:
            raise Exception(ErrorCode.GAUSS_516["GAUSS_51652"] % str(e))
    
    def doRebuildConf(self, xmlFile):
        """
        Generate the static configuration files and distribute one to the
        bin directory of every node described by xmlFile.

        input : String (cluster XML file)
        output : NA
        raises : Exception when the temporary directory cannot be created or
                 a static configuration file cannot be delivered
        """
        # bound before the try block so the error handler can test it
        tmpDirName = ""
        try:
            clusterInfo = dbClusterInfo()
            clusterInfo.initFromXml(xmlFile)

            dirName = os.path.dirname(os.path.realpath(__file__))
            tmpDirName = os.path.realpath("%s/static_config_files" % dirName)
            cmd = "mkdir -p -m %s '%s'" % (
                KEY_DIRECTORY_MODE, tmpDirName)
            (status, output) = subprocess.getstatusoutput(cmd)
            if (status != 0):
                raise Exception(
                    ErrorCode.GAUSS_502["GAUSS_50208"]
                    % "temporary directory" + "\nCommand:%s\nError: %s"
                    % (cmd, output))

            # NOTE(review): the files are generated for self.dbNodes but
            # distributed for clusterInfo.dbNodes - confirm both lists
            # always describe the same nodes.
            for dbNode in self.dbNodes:
                staticConfigPath = "%s/cluster_static_config_%s" % (
                    tmpDirName, dbNode.name)
                clusterInfo.saveToStaticConfig(staticConfigPath,
                                               dbNode.id)

            localName = GetHostIpOrName()
            for dbNode in clusterInfo.dbNodes:
                if (dbNode.name != localName):
                    sshIp = dbNode.sshIps[0]
                    cmd = 'ssh -q -o ConnectTimeout=5 %s mkdir -p %s/bin' % (sshIp, clusterInfo.appPath)
                    (status, output) = subprocess.getstatusoutput(cmd)
                    if status != 0:
                        # not fatal here: the scp below fails and raises
                        # when the directory really is missing
                        print("ERROR")
                    # scp needs an IPv6 literal wrapped in [], otherwise its
                    # colons are read as the host/path separator; ssh takes
                    # the bare address
                    scpHost = sshIp
                    if get_ip_version(sshIp) == NET_IPV6:
                        scpHost = "[" + sshIp + "]"
                    cmd = \
                        "scp %s/cluster_static_config_%s %s:%s/bin/cluster_static_config" % (
                            tmpDirName,
                            dbNode.name, scpHost, clusterInfo.appPath)
                else:
                    cmd = \
                        "mkdir -p %s/bin; cp %s/cluster_static_config_%s %s" \
                        "/bin/cluster_static_config" % (
                            clusterInfo.appPath, tmpDirName,
                            dbNode.name, clusterInfo.appPath)
                (status, output) = subprocess.getstatusoutput(cmd)
                if status != 0:
                    raise Exception(
                        ErrorCode.GAUSS_502["GAUSS_50216"]
                        % "static configuration file"
                        + "Node: %s.\nCommand: \n%s\nError: \n%s"
                        % (dbNode.name, cmd, output))

        except Exception as e:
            # only clean up when the directory name is known
            if tmpDirName:
                removeDirectory(tmpDirName)
            raise Exception(str(e)) from e

    def execute_on_node(self, node, cmd, local_name=None, ssh_ip=None):
        """
        Execute a command on the given node.

        The command runs directly when node equals local_name or the value
        returned by GetHostIpOrName(); otherwise it is sent through ssh.

        Args:
            node: node name
            cmd: command to execute (without any ssh prefix)
            local_name: local host name or IP
            ssh_ip: ssh IP of the remote node (used instead of node if set)

        Returns:
            (status, output): exit status and output of the command
        """
        import shlex

        if node == local_name or node == GetHostIpOrName():
            return subprocess.getstatusoutput(cmd)
        ssh_target = ssh_ip if ssh_ip else node
        # shlex.quote keeps the command one intact argument even when it
        # contains single quotes, so the remote shell receives the same
        # text the local branch would run
        ssh_cmd = f"ssh -q -o ConnectTimeout=5 {ssh_target} {shlex.quote(cmd)}"
        return subprocess.getstatusoutput(ssh_cmd)

    def distribute_file(self, source_path, dest_path, nodes, set_executable=False, recursive=False):
        """
        Distribute a file or directory to several nodes.

        Remote nodes receive the data through scp, the local node through cp.

        Args:
            source_path: local source file/directory; a value ending with
                '*' makes the permission step target dest_path/*
            dest_path: destination path on every node
            nodes: list of dbNode objects (name and sshIps[0] are used)
            set_executable: whether to add the executable permission
            recursive: whether to copy directories recursively (-r)

        Returns:
            Number of nodes the data was distributed to

        Raises:
            Exception: if the copy to any node fails
        """
        success_count = 0
        local_name = GetHostIpOrName()
        r_flag = "-r" if recursive else ""

        for dbNode in nodes:
            if dbNode.name != local_name:
                # remote node: use scp
                ssh_ip = dbNode.sshIps[0]
                # scp needs an IPv6 literal wrapped in [], otherwise its
                # colons are read as the host/path separator; ssh takes the
                # bare address
                scp_host = f"[{ssh_ip}]" if get_ip_version(ssh_ip) == NET_IPV6 else ssh_ip
                cmd = f"scp -q {r_flag} {source_path} {scp_host}:{dest_path}"
                status, output = subprocess.getstatusoutput(cmd)
                if status != 0:
                    raise Exception(f"Failed to distribute {source_path} to {dbNode.name}: {output}")

                if set_executable:
                    # add the executable permission; the result is ignored
                    # (best effort)
                    target_file = os.path.join(dest_path, os.path.basename(source_path)) if not source_path.endswith('*') else f"{dest_path}/*"
                    cmd = f"ssh -q -o ConnectTimeout=5 {ssh_ip} 'chmod +x {target_file}'"
                    subprocess.getstatusoutput(cmd)
            else:
                # local node: use cp
                cmd = f"cp {r_flag} {source_path} {dest_path}"
                status, output = subprocess.getstatusoutput(cmd)
                if status != 0:
                    raise Exception(f"Failed to copy {source_path} to {dest_path}: {output}")

                if set_executable:
                    if source_path.endswith('*'):
                        # glob source: let the shell expand the targets
                        cmd = f"chmod +x {dest_path}/*"
                        subprocess.getstatusoutput(cmd)
                    else:
                        target_file = os.path.join(dest_path, os.path.basename(source_path)) if os.path.isdir(dest_path) else dest_path
                        os.chmod(target_file, 0o755)

            success_count += 1

        return success_count

    def _verify_package_type(self, pkg_path, expected_type):
        """
        Verify that an installation package is of the expected type.

        The package is extracted with ``tar -zxf`` into a temporary
        directory, which is then probed for signature files:
        - GR package: bin/grcmd, bin/grserver, bin/gr_ctl, lib/libgr.so
        - CM package: bin/cm_server, bin/cm_agent, bin/cm_ctl,
          lib/libcm_common.so
        One signature file is enough to recognise a type.

        Args:
            pkg_path: path of the installation package
            expected_type: expected package type ('gr' or 'cm')

        Returns:
            True when the verification passes

        Raises:
            PackageVerificationError: the package is missing, cannot be
                extracted, does not match expected_type, or expected_type
                is neither 'gr' nor 'cm'
        """
        import shlex
        import shutil
        import tempfile

        # the package file must exist
        if not os.path.exists(pkg_path):
            raise PackageVerificationError(
                message=f"Package file not found: {pkg_path}",
                error_code=PackageVerificationError.ERR_PKG_NOT_FOUND,
                pkg_path=pkg_path,
                expected_type=expected_type
            )

        # GR signature files (at least one must be present)
        GR_SIGNATURE_FILES = [
            'bin/grcmd',
            'bin/grserver',
            'bin/gr_ctl',
            'lib/libgr.so',
        ]

        # CM signature files (at least one must be present)
        CM_SIGNATURE_FILES = [
            'bin/cm_server',
            'bin/cm_agent',
            'bin/cm_ctl',
            'lib/libcm_common.so',
        ]

        # extract into a temporary directory that is always removed
        temp_dir = tempfile.mkdtemp(prefix='pkg_verify_')
        try:
            # paths are quoted so that blanks or shell metacharacters in
            # them cannot break (or alter) the command
            cmd = f"tar -zxf {shlex.quote(pkg_path)} -C {shlex.quote(temp_dir)} 2>&1"
            status, output = subprocess.getstatusoutput(cmd)
            if status != 0:
                raise PackageVerificationError(
                    message=f"Failed to extract package: {output}",
                    error_code=PackageVerificationError.ERR_PKG_EXTRACT_FAILED,
                    pkg_path=pkg_path,
                    expected_type=expected_type
                )

            # collect the signature files present in the extracted tree
            gr_found_files = [
                sig_file for sig_file in GR_SIGNATURE_FILES
                if os.path.exists(os.path.join(temp_dir, sig_file))
            ]
            cm_found_files = [
                sig_file for sig_file in CM_SIGNATURE_FILES
                if os.path.exists(os.path.join(temp_dir, sig_file))
            ]
            has_gr_signature = bool(gr_found_files)
            has_cm_signature = bool(cm_found_files)

            # top-level listing of the package, only used in error messages
            cmd = f"ls -la {shlex.quote(temp_dir)}"
            _, content_list = subprocess.getstatusoutput(cmd)

            # compare what was found with what was expected
            if expected_type == 'gr':
                if has_cm_signature and not has_gr_signature:
                    raise PackageVerificationError(
                        message=f"Package type mismatch! Expected GR package but found CM package.\n"
                                f"  CM signature files found: {', '.join(cm_found_files)}",
                        error_code=PackageVerificationError.ERR_PKG_TYPE_MISMATCH,
                        pkg_path=pkg_path,
                        expected_type='gr',
                        detected_type='cm'
                    )
                if not has_gr_signature:
                    raise PackageVerificationError(
                        message=f"Invalid GR package! Package does not contain GR signature files.\n"
                                f"  Expected files: {', '.join(GR_SIGNATURE_FILES)}\n"
                                f"  Package contents:\n{content_list}",
                        error_code=PackageVerificationError.ERR_PKG_INVALID_GR,
                        pkg_path=pkg_path,
                        expected_type='gr'
                    )
            elif expected_type == 'cm':
                if has_gr_signature and not has_cm_signature:
                    raise PackageVerificationError(
                        message=f"Package type mismatch! Expected CM package but found GR package.\n"
                                f"  GR signature files found: {', '.join(gr_found_files)}",
                        error_code=PackageVerificationError.ERR_PKG_TYPE_MISMATCH,
                        pkg_path=pkg_path,
                        expected_type='cm',
                        detected_type='gr'
                    )
                if not has_cm_signature:
                    raise PackageVerificationError(
                        message=f"Invalid CM package! Package does not contain CM signature files.\n"
                                f"  Expected files: {', '.join(CM_SIGNATURE_FILES)}\n"
                                f"  Package contents:\n{content_list}",
                        error_code=PackageVerificationError.ERR_PKG_INVALID_CM,
                        pkg_path=pkg_path,
                        expected_type='cm'
                    )
            else:
                # an unknown type is reported only after a successful
                # extraction, as before
                raise PackageVerificationError(
                    message=f"Unknown package type: {expected_type}",
                    error_code=PackageVerificationError.ERR_PKG_UNKNOWN_TYPE,
                    pkg_path=pkg_path,
                    expected_type=expected_type
                )

            return True

        finally:
            # remove the temporary directory
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir, ignore_errors=True)

    def _check_gr_status(self, user, nodes_to_check, check_phase="pre-upgrade"):
        """
        Check the GR status of every node: ``grcmd getstatus`` has to
        succeed and its output has to contain "is open" (case-insensitive).

        Args:
            user: current user
            nodes_to_check: list of nodes to check
            check_phase: description of the check phase
                ("pre-upgrade" or "post-upgrade")

        Returns:
            True when every node passes the check

        Raises:
            Exception: the command failed on a node or its status is not open
        """
        self.logger.log(f"Checking GR status on all nodes ({check_phase})...")

        # source the separate environment file first when one is configured
        envfile_path = self.__getEnvironmentParameterValue("MPPDB_ENV_SEPARATE_PATH", user)
        env_source = ""
        if envfile_path:
            env_source = f"source {envfile_path} && "

        for dbNode in nodes_to_check:
            self.logger.log(f"Checking GR status on node {dbNode.name} ({check_phase})...")

            cmd = f"{env_source}grcmd getstatus"
            status, output = self.execute_on_node(
                dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])

            if status != 0:
                raise Exception(f"GR status check failed on node {dbNode.name} ({check_phase}): grcmd getstatus command failed with status {status}. Output: {output}")

            self.logger.log(f"Node {dbNode.name} grcmd getstatus output:\n{output}")

            # expected output looks like:
            # "Server status of instance 1 is open and READONLY."
            if output.lower().find("is open") < 0:
                raise Exception(f"GR status check failed on node {dbNode.name} ({check_phase}): Server status is not 'open'. Output: {output}")

            self.logger.log(f"GR status check passed on node {dbNode.name} ({check_phase}): Server status is open.")

        self.logger.log(f"All nodes passed GR status check ({check_phase}).")
        return True

    def _check_cm_status(self, user, check_phase="pre-upgrade"):
        """
        Check the CM cluster status: ``cm_ctl query -Cv`` has to succeed and
        its output has to contain a CMServer State section, a cluster_state
        of Normal and the word "primary" (all case-insensitive).

        Args:
            user: current user
            check_phase: description of the check phase
                ("pre-upgrade" or "post-upgrade")

        Returns:
            True when the check passes

        Raises:
            Exception: the command failed or the cluster state is not normal
        """
        def failure(reason):
            # every failure shares one prefix and echoes the raw output
            return Exception(f"CM status check failed ({check_phase}): {reason} Output: {output}")

        self.logger.log(f"Checking CM cluster status ({check_phase})...")

        # source the separate environment file first when one is configured
        envfile_path = self.__getEnvironmentParameterValue("MPPDB_ENV_SEPARATE_PATH", user)
        env_source = ""
        if envfile_path:
            env_source = f"source {envfile_path} && "

        status, output = subprocess.getstatusoutput(f"{env_source}cm_ctl query -Cv")
        if status != 0:
            raise failure(f"cm_ctl query -Cv command failed with status {status}.")

        self.logger.log(f"cm_ctl query -Cv output:\n{output}")

        # Expected output contains, for example:
        # [  CMServer State   ]
        # node           instance state
        # -------------------------------
        # 1  openGauss56 1        Primary
        # 2  openGauss54 2        Standby
        # ...
        # cluster_state   : Normal
        output_lower = output.lower()

        if "cmserver state" not in output_lower:
            raise failure("CMServer State section not found in output.")

        if "cluster_state" not in output_lower:
            raise failure("cluster_state not found in output.")

        # pull out the value that follows "cluster_state :"
        state_match = re.search(r'cluster_state\s*:\s*(\w+)', output, re.IGNORECASE)
        if state_match is None:
            raise failure("Unable to parse cluster_state.")

        cluster_state = state_match.group(1)
        if cluster_state.lower() != "normal":
            raise failure(f"cluster_state is '{cluster_state}', expected 'Normal'.")

        # at least one "primary" has to show up anywhere in the output
        if "primary" not in output_lower:
            raise failure("No Primary CMServer found.")

        self.logger.log(f"CM status check passed ({check_phase}): cluster_state is Normal, CMServer is running.")
        return True

    def gr_install(self, xmlFile, PkgPath, RestPkgPath, jsonFile=None):
        """
        Install oGRecorder (GR) on every node listed in the cluster XML file.

        The installation runs these steps in order and stops at the first
        mandatory step that fails:
          1. optionally back up the JSON config file into <grPath>/cfg
             (best effort: failures are only logged);
          2. create the install directories, distribute the package and
             unpack it into the tool path on every node;
          3. copy bin/lib (and share, if present) into a versioned directory
             APP_<version> and point the APP symlink at it;
          4. write gr_inst.ini / gr_cli_inst.ini on every node;
          5. create the cluster_manual_walrecord flag file and the gr_res.sh
             script that holds the cm_ctl resource commands;
          6. generate CA/server/client certificates with grcmd on the local
             node;
          7. distribute gr_om, pssh, gr_contrl.sh, the optional REST API
             files and py_pstree.py.

        Args:
            xmlFile: XML configuration file used to initialise the cluster info
            PkgPath: Path to the GR installation package (tar.gz)
            RestPkgPath: Path to the REST API package; a false value skips
                every REST related step
            jsonFile: Optional JSON config file to back up on every node

        Raises:
            Exception: if GAUSSHOME is empty for the current user, if a node
                has no listen_addr/listen_port, or if any mandatory command
                returns a non-zero status.
        """
        user = getpass.getuser()
        gaussHome = self.__getEnvironmentParameterValue("GAUSSHOME", user)
        if gaussHome == "":
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] %
                            ("installation path of designated user [%s]" % user))
        clusterInfo = dbClusterInfo()
        clusterInfo.initFromXml(xmlFile)
        # Initialise the logging module.
        self.initLogger("install")

        self.logger.log("Start install oGRecorder.")

        # Resolve the local host once; it selects local vs. remote execution below.
        local_host = GetHostIpOrName()

        def run_on_node(node, command):
            """Run a shell command on the given node through execute_on_node."""
            return self.execute_on_node(
                node.name, command, local_name=local_host, ssh_ip=node.sshIps[0])

        def copy_to_node(node, source, target, recursive=False):
            """Copy a local path to the node: cp on the local host, scp otherwise."""
            flag = "-r " if recursive else ""
            if node.name != local_host:
                command = f"scp {flag}{source} {node.sshIps[0]}:{target}"
            else:
                command = f"cp {flag}{source} {target}"
            return subprocess.getstatusoutput(command)

        # The cfg directory is shared between versions (it is not inside APP_<version>).
        cfg_dir = os.path.join(clusterInfo.grPath, "cfg")
        cfg_path = os.path.join(cfg_dir, "gr_inst.ini")
        cli_cfg_path = os.path.join(cfg_dir, "gr_cli_inst.ini")

        # Back up the JSON config file into the cfg directory (best effort).
        if jsonFile and os.path.exists(jsonFile):
            backup_file = os.path.join(cfg_dir, "cluster_config.json")
            self.logger.log(f"Backing up JSON config file to {backup_file}")
            for dbNode in clusterInfo.dbNodes:
                # Make sure the backup directory exists.
                status, output = run_on_node(dbNode, f"mkdir -p {cfg_dir}")
                if status != 0:
                    self.logger.warn(f"Failed to create backup directory on {dbNode.name}: {output}")
                    continue

                # Copy the JSON file into the backup directory.
                status, output = copy_to_node(dbNode, jsonFile, backup_file)
                if status != 0:
                    self.logger.warn(f"Failed to backup JSON config to {dbNode.name}: {output}")
                else:
                    self.logger.log(f"Successfully backed up JSON config to {dbNode.name}:{backup_file}")

        pkgName = os.path.basename(PkgPath)

        # Create the install directories.
        for dbNode in clusterInfo.dbNodes:
            cmd = "mkdir -p %s/script %s/data %s" % (clusterInfo.toolPath, clusterInfo.grPath, cfg_dir)
            status, output = run_on_node(dbNode, cmd)
            if status != 0:
                raise Exception(f"Failed to create install path, Error output:\n{output}")
        self.logger.log("Successfully create install path.")

        # Distribute the package.
        for dbNode in clusterInfo.dbNodes:
            status, output = copy_to_node(dbNode, PkgPath, clusterInfo.toolPath)
            if status != 0:
                raise Exception(f"failed to distribute install gr pkg, Error output:\n{output}")
        self.logger.log("Successfully distribute install gr pkg.")

        # Unpack the package.
        for dbNode in clusterInfo.dbNodes:
            cmd = 'tar -zxf %s/%s -C %s' % (clusterInfo.toolPath, pkgName, clusterInfo.toolPath)
            status, output = run_on_node(dbNode, cmd)
            if status != 0:
                raise Exception(f"failed to decompress gr pkg, Error output:\n{output}")
        self.logger.log("Successfully decompress gr pkg.")

        # Derive the version label: take x.y.z from the package name
        # (e.g. gr-1.0.0.tar.gz or gr_1.0.0.tar.gz), otherwise use a timestamp.
        version_match = re.search(r'[_-](\d+\.\d+\.\d+)', pkgName)
        if version_match:
            version_str = version_match.group(1)
        else:
            version_str = time.strftime("%Y%m%d_%H%M%S")

        # Versioned layout: APP -> APP_<version>
        install_base = clusterInfo.installPath
        app_link_name = "APP"
        app_version_dir = f"APP_{version_str}"
        app_link_path = os.path.join(install_base, app_link_name)
        app_version_path = os.path.join(install_base, app_version_dir)

        self.logger.log(f"Creating versioned directory structure: {app_link_name} -> {app_version_dir}")

        # Create the versioned directory and the symlink on every node.
        for dbNode in clusterInfo.dbNodes:
            # Create the versioned directory.
            status, output = run_on_node(dbNode, f"mkdir -p {app_version_path}")
            if status != 0:
                raise Exception(f"Failed to create versioned directory on {dbNode.name}: {output}")

            # bin and lib are mandatory and are isolated per version.
            cmd = f"cp -r {clusterInfo.toolPath}/bin {clusterInfo.toolPath}/lib {app_version_path}/"
            status, output = run_on_node(dbNode, cmd)
            if status != 0:
                raise Exception(f"Failed to copy bin/lib to versioned directory on {dbNode.name}: {output}")

            # share is optional: copied only when the unpacked package has it
            # (the check runs on the target node); failure is only a warning.
            cmd = f"if [ -d {clusterInfo.toolPath}/share ]; then cp -r {clusterInfo.toolPath}/share {app_version_path}/; fi"
            status, output = run_on_node(dbNode, cmd)
            if status != 0:
                self.logger.warn(f"Failed to copy share to versioned directory on {dbNode.name}: {output}")

            # Create or repoint the APP symlink (relative target) to the new version.
            status, output = run_on_node(dbNode, f"ln -sfn {app_version_dir} {app_link_path}")
            if status != 0:
                raise Exception(f"Failed to create/update APP symlink on {dbNode.name}: {output}")

        self.logger.log(f"Successfully created versioned directory: {app_version_dir} and symlink: {app_link_name}")
        clusterInfo.appSoftPath = app_link_path
        clusterInfo.appPath = app_version_path

        # Collect the first ssh IP of every node (plus loopback) for the white list.
        all_node_ips = []
        for node in clusterInfo.dbNodes:
            if node.sshIps and node.sshIps[0] not in all_node_ips:
                all_node_ips.append(node.sshIps[0])
        all_node_ips.append("127.0.0.1")
        ip_white_list = ",".join(all_node_ips)
        self.logger.log(f"IP white list for all nodes: {ip_white_list}")

        # Certificate directory below wormPath; defaults to "CA" when caPath is unset.
        ca_path = clusterInfo.caPath or "CA"
        ssl_dir = os.path.join(clusterInfo.wormPath, ca_path)

        # The client config is identical on every node, so build it once.
        cli_config_content = (
            f"CLI_SSL_CA = {os.path.join(ssl_dir, 'cacert.pem')}\n"
            f"CLI_SSL_KEY = {os.path.join(ssl_dir, 'client.key')}\n"
            f"CLI_SSL_CERT = {os.path.join(ssl_dir, 'client.crt')}\n"
            f"CLI_SSL_CRL = {os.path.join(ssl_dir, 'client.crl')}\n"
        )

        # Write the config files on every node.
        for i, dbNode in enumerate(clusterInfo.dbNodes):
            # listen_addr and listen_port are configured separately and both are required.
            if not dbNode.listen_addr:
                raise Exception(f"listen_addr is not set for instance {i}")
            if not dbNode.listen_port:
                raise Exception(f"listen_port is not set for instance {i}")

            config_content = (
                f"INST_ID = {i}\n"
                f"LOG_LEVEL = 255\n"
                f"GR_NODES_LIST = {clusterInfo.gr_nodes_list}\n"
                f"LISTEN_ADDR = {dbNode.listen_addr}\n"
                f"LISTEN_PORT = {dbNode.listen_port}\n"
                f"IP_WHITE_LIST = {ip_white_list}\n"
                f"DATA_FILE_PATH = {clusterInfo.wormPath}\n"
                f"SER_SSL_CA = {os.path.join(ssl_dir, 'cacert.pem')}\n"
                f"SER_SSL_KEY = {os.path.join(ssl_dir, 'server.key')}\n"
                f"SER_SSL_CERT = {os.path.join(ssl_dir, 'server.crt')}\n"
                f"SER_SSL_CRL = {os.path.join(ssl_dir, 'server.crl')}\n"
            )

            cmd = f"mkdir -p {cfg_dir} && " \
                  f"printf \"%s\" \"{config_content}\" > {cfg_path} && " \
                  f"printf \"%s\" \"{cli_config_content}\" > {cli_cfg_path}"
            status, output = run_on_node(dbNode, cmd)
            if status != 0:
                self.logger.error(f"Failed to create/write gr config file on {dbNode.name}, output:\n{output}")
                raise Exception(f"failed to create/write gr config file, Error output:\n{output}")
        self.logger.log("Successfully create and write gr config file.")

        # Create the flag file (CM).
        for dbNode in clusterInfo.dbNodes:
            cmd = 'touch %s/bin/cluster_manual_walrecord' % (clusterInfo.appPath)
            status, output = run_on_node(dbNode, cmd)
            if status != 0:
                raise Exception(f"failed to create cluster_manual_walrecord file, Error output:\n{output}")
        self.logger.log("Successfully create cluster_manual_walrecord file.")

        # Build the cm_ctl commands that register the "gr" resource and its instances.
        add_cmd = (
            'cm_ctl res --add --res_name="gr" '
            '--res_attr="resources_type=APP,'
            'script={app_path}/bin/gr_contrl.sh,'
            'check_interval=1,'
            'timeout=120,'
            'restart_times=5,'
            'restart_delay=1,'
            'restart_period=1"'
        ).format(app_path=clusterInfo.appPath)

        all_cmds = [add_cmd]
        for i, dbNode in enumerate(clusterInfo.dbNodes):
            all_cmds.append((
                'cm_ctl res --edit --res_name="gr" '
                '--add_inst="node_id={node_id},'
                'res_instance_id={instance_id},'
                'res_args={gr_path}"'
            ).format(
                node_id=dbNode.id,
                instance_id=i,
                gr_path=clusterInfo.grPath
            ))

        # Same for the optional "CM-RestAPI" resource.
        if RestPkgPath:
            all_cmds.append((
                'cm_ctl res --add --res_name="CM-RestAPI" '
                '--res_attr="resources_type=APP,'
                'script={app_path}/bin/rest_contrl.sh,'
                'check_interval=1,'
                'timeout=120,'
                'restart_times=10000,'
                'restart_delay=1,'
                'restart_period=1"'
            ).format(app_path=clusterInfo.appPath))
            for i, dbNode in enumerate(clusterInfo.dbNodes):
                all_cmds.append((
                    'cm_ctl res --edit --res_name="CM-RestAPI" '
                    '--add_inst="node_id={node_id},'
                    'res_instance_id={instance_id},'
                    'res_args={rest_path}"'
                ).format(node_id=dbNode.id, instance_id=i, rest_path=clusterInfo.appPath))

        for dbNode in clusterInfo.dbNodes:
            # Remove the old gr_res.sh and the contents of the node's CM data directory.
            cleanup_targets = '%s/gr_res.sh' % clusterInfo.toolPath
            cm_data_dir = dbNode.cmDataDir
            if cm_data_dir and str(cm_data_dir).strip().strip("/"):
                cleanup_targets += ' %s/*' % cm_data_dir
            else:
                # An empty (or root) directory would expand to "rm -rf /*"; never run that.
                self.logger.warn(
                    f"cmDataDir is not set for {dbNode.name}, skip cleaning CM data directory")
            cmd = 'rm -rf %s' % cleanup_targets
            status, output = run_on_node(dbNode, cmd)
            if status != 0:
                raise Exception(f"failed to clean gr_res.sh file, Error output:\n{output}")
            # Append the new content, one command per line.
            for res_cmd in all_cmds:
                cmd = f"echo '{res_cmd}' >> {clusterInfo.toolPath}/gr_res.sh"
                status, output = run_on_node(dbNode, cmd)
                if status != 0:
                    raise Exception(f"failed to create gr_res.sh file, Error output:\n{output}")
        self.logger.log("Successfully create gr_res.sh file.")

        # Generate the certificates with grcmd on the local node.
        envfile = self.__getEnvironmentParameterValue("MPPDB_ENV_SEPARATE_PATH", user)
        # Only source the environment file when one is configured; "source" with
        # no argument would make the whole command fail (gr_upgrade does the same).
        env_source = f"source {envfile} && " if envfile else ""
        grcmd_bin = os.path.join(
            clusterInfo.appSoftPath or clusterInfo.appPath,
            "bin",
            "grcmd",
        )

        cmd = f'{env_source}"{grcmd_bin}" gencert -t ca -d 1000 && "{grcmd_bin}" gencert -t server -d 1000 && "{grcmd_bin}" gencert -t client -d 1000'
        status, output = subprocess.getstatusoutput(cmd)
        if status != 0:
            raise Exception(f"Failed to create gr cert on local node, Error output:\n{output}, cmd is {cmd}")
        self.logger.log("Successfully create gr cert on local node.")

        self.logger.log("Successfully create gr cert.")

        # Distribute scripts and tools.
        bin_path = os.path.join(clusterInfo.appPath, "bin")
        script_path = os.path.join(clusterInfo.toolPath, "script")

        # gr_om
        self.distribute_file("gr_om", bin_path, clusterInfo.dbNodes)
        self.logger.log("Successfully distribute gr_om.")

        # pssh
        self.distribute_file("pssh/bin/*", bin_path, clusterInfo.dbNodes, set_executable=True, recursive=True)
        self.logger.log("Successfully distribute pssh.")

        # gr_contrl.sh script
        self.distribute_file("gr_contrl.sh", bin_path, clusterInfo.dbNodes, set_executable=True)
        self.logger.log("Successfully distribute gr_contrl.sh.")

        if RestPkgPath:
            rest_pkg_name = os.path.basename(RestPkgPath)

            # Distribute the REST API package and mark it executable on the
            # node that received it. Every node is checked, not just the last.
            for dbNode in clusterInfo.dbNodes:
                status, output = copy_to_node(dbNode, RestPkgPath, bin_path, recursive=True)
                if status == 0:
                    status, output = run_on_node(
                        dbNode, 'chmod +x %s/%s' % (bin_path, rest_pkg_name))
                if status != 0:
                    raise Exception(f"failed to distribute restapi package, Error output:\n{output}")
            self.logger.log("Successfully distribute restapi package.")

            # Distribute the rest_contrl.sh script and mark it executable.
            for dbNode in clusterInfo.dbNodes:
                status, output = copy_to_node(dbNode, "rest_contrl.sh", bin_path)
                if status == 0:
                    status, output = run_on_node(
                        dbNode, 'chmod +x %s/rest_contrl.sh' % bin_path)
                if status != 0:
                    raise Exception(f"failed to distribute rest_contrl.sh, Error output:\n{output}")
            self.logger.log("Successfully distribute rest_contrl.sh.")

            # Distribute the REST white list.
            for dbNode in clusterInfo.dbNodes:
                status, output = copy_to_node(dbNode, "restWhiteList", bin_path, recursive=True)
                if status != 0:
                    raise Exception(f"failed to distribute restWhiteList, Error output:\n{output}")
            self.logger.log("Successfully distribute restWhiteList.")

            # Distribute the SSL configuration file rest_ssl.properties.
            for dbNode in clusterInfo.dbNodes:
                status, output = copy_to_node(dbNode, "rest_ssl.properties", bin_path)
                if status != 0:
                    raise Exception(f"failed to distribute rest_ssl.properties, Error output:\n{output}")
            self.logger.log("Successfully distribute rest_ssl.properties.")

            # rest_nodes_list entries look like "<index>:<ip>[:...]"; the index is
            # matched against node id - 1 below.
            rest_ip_map = {}
            for item in clusterInfo.rest_nodes_list.split(','):
                parts = item.split(':')
                if len(parts) >= 2:
                    rest_ip_map[int(parts[0])] = parts[1]
            for dbNode in clusterInfo.dbNodes:
                rest_ip = rest_ip_map.get(dbNode.id - 1, "")
                cmd = 'echo "server.address=%s" >> %s/bin/rest_ssl.properties' % (rest_ip, clusterInfo.appPath)
                status, output = run_on_node(dbNode, cmd)
                if status != 0:
                    raise Exception(f"failed to set server.address in rest_ssl.properties, Error output:\n{output}")
            self.logger.log("Successfully set server.address in rest_ssl.properties.")

        # py_pstree.py script
        self.distribute_file("py_pstree.py", script_path, clusterInfo.dbNodes, set_executable=True)
        self.logger.log("Successfully distribute py_pstree.py.")

        self.logger.log("Successfully install oGRecorder.")

    def gr_upgrade(self, xmlFile, PkgPath, target_nodeids=None):
        """
        Upgrade GR application to new version with atomic symlink switching (rolling upgrade)
        
        滚动升级:逐个节点进行升级,确保服务持续可用
        
        Args:
            xmlFile: XML configuration file
            PkgPath: Path to new version installation package
            target_nodeids: List of node IDs to upgrade (starting from 1, None means all nodes)
        """
        user = getpass.getuser()
        gaussHome = self.__getEnvironmentParameterValue("GAUSSHOME", user)
        if (gaussHome == ""):
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % \
                            ("installation path of designated user [%s]" % user))
        
        clusterInfo = dbClusterInfo()
        clusterInfo.initFromXml(xmlFile)
        # 初始化日志模块
        self.initLogger("upgrade")

        self.logger.log("Start upgrade oGRecorder (rolling upgrade - one node at a time).")
        
        # 确定要升级的节点(根据节点ID,从1开始)
        nodes_to_upgrade = []
        if target_nodeids is None or len(target_nodeids) == 0:
            nodes_to_upgrade = clusterInfo.dbNodes
            self.logger.log("Upgrading all nodes in cluster (one by one)")
        else:
            # 只升级指定的节点(根据节点ID)
            for node_id in target_nodeids:
                # 节点ID从1开始,但数组索引从0开始
                node_index = node_id - 1
                if node_index >= 0 and node_index < len(clusterInfo.dbNodes):
                    nodes_to_upgrade.append(clusterInfo.dbNodes[node_index])
                    self.logger.log(f"Node ID {node_id} ({clusterInfo.dbNodes[node_index].name}) will be upgraded")
                else:
                    self.logger.warn(f"Node ID {node_id} is out of range (valid range: 1-{len(clusterInfo.dbNodes)}), skipping")
        
        if len(nodes_to_upgrade) == 0:
            raise Exception("No valid nodes to upgrade")
        
        # 获取 cm_ctl 路径
        gaussHome = self.__getEnvironmentParameterValue("GAUSSHOME", user)
        toolPath = self.__getEnvironmentParameterValue("GPHOME", user)
        cm_ctl_path = None
        if gaussHome:
            cm_ctl_path = os.path.join(gaussHome, "bin", "cm_ctl")
        elif toolPath:
            cm_ctl_path = os.path.join(toolPath, "bin", "cm_ctl")
        else:
            # 尝试在 PATH 中查找
            status, output = subprocess.getstatusoutput("which cm_ctl")
            if status == 0:
                cm_ctl_path = output.strip()
        
        if not cm_ctl_path:
            self.logger.warn("cm_ctl not found, will skip service stop/start operations")
        else:
            self.logger.log(f"Using cm_ctl: {cm_ctl_path}")
        
        # 使用时间戳作为版本号
        pkgName = os.path.basename(PkgPath)
        version_str = time.strftime("%Y%m%d_%H%M%S")
        
        self.logger.log(f"Upgrading to version: {version_str} (package: {pkgName})")
        
        # 获取当前版本信息
        install_base = clusterInfo.installPath
        app_link_name = "APP"
        app_link_path = os.path.join(install_base, app_link_name)
        app_version_dir = f"APP_{version_str}"
        app_version_path = os.path.join(install_base, app_version_dir)
        backup_dir = os.path.join(install_base, "backup")
        timestamp = time.strftime("%Y%m%d%H%M%S")
        
        # 获取环境变量文件路径
        envfile_path = self.__getEnvironmentParameterValue("MPPDB_ENV_SEPARATE_PATH", user)
        env_source = f"source {envfile_path} && " if envfile_path else ""
        
        # 记录每个节点的状态信息(用于回滚)
        node_rollback_info = {}  # {node_name: {'old_version_dir': '...', 'old_version_name': '...', 'services_stopped': True/False, 'symlink_switched': False, 'upgrade_completed': False}}
        
        # 预先记录所有节点的当前版本信息
        for dbNode in nodes_to_upgrade:
            cmd = f"readlink -f {app_link_path} 2>/dev/null || echo ''"
            status, output = self.execute_on_node(
                dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
            if status == 0 and output.strip():
                current_version_dir = output.strip()
                current_version_name = os.path.basename(current_version_dir)
                node_rollback_info[dbNode.name] = {
                    'old_version_dir': current_version_dir,
                    'old_version_name': current_version_name,
                    'services_stopped': False,
                    'symlink_switched': False,
                    'upgrade_completed': False
                }
                self.logger.log(f"Recorded rollback info for {dbNode.name}: {current_version_name}")
            else:
                node_rollback_info[dbNode.name] = {
                    'old_version_dir': None,
                    'old_version_name': None,
                    'services_stopped': False,
                    'symlink_switched': False,
                    'upgrade_completed': False
                }
                self.logger.warn(f"No current version found for {dbNode.name}, rollback may not be possible")
        
        # 分发安装包到所有远程节点(提前分发,避免升级过程中分发失败)
        self.logger.log("Pre-distributing package to remote nodes...")
        for dbNode in nodes_to_upgrade:
            if dbNode.name != GetHostIpOrName():
                # 始终分发新包到远端(覆盖旧版本)
                remote_pkg_path = os.path.join(clusterInfo.toolPath, pkgName)
                self.logger.log(f"Distributing package to {dbNode.name}: {remote_pkg_path}")
                cmd = f"scp {PkgPath} {dbNode.sshIps[0]}:{remote_pkg_path}"
                status, output = subprocess.getstatusoutput(cmd)
                if status != 0:
                    raise Exception(f"Failed to distribute package to {dbNode.name}: {output}")
                self.logger.log(f"Package distributed to {dbNode.name}")
        
        # 升级成功的节点列表
        upgraded_nodes = []
        
        # 滚动升级:逐个节点进行
        self.logger.log("=" * 60)
        self.logger.log(f"Starting rolling upgrade for {len(nodes_to_upgrade)} node(s)")
        self.logger.log("=" * 60)
        
        for node_index, dbNode in enumerate(nodes_to_upgrade):
            node_id = clusterInfo.dbNodes.index(dbNode) + 1
            
            self.logger.log("")
            self.logger.log("=" * 60)
            self.logger.log(f"UPGRADING NODE {node_index + 1}/{len(nodes_to_upgrade)}: {dbNode.name} (ID: {node_id})")
            self.logger.log("=" * 60)
            
            try:
                # ===== 步骤1: 升级前检查该节点 =====
                self.logger.log(f"[{dbNode.name}] Step 1: Pre-upgrade check")
                self._check_gr_status(user, [dbNode], f"pre-upgrade node {node_id}")
                
                # 检查当前软链接是否存在
                cmd = f"test -L {app_link_path} && readlink -f {app_link_path} || echo ''"
                status, output = self.execute_on_node(
                    dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                if status == 0 and output.strip():
                    current_version_dir = output.strip()
                    self.logger.log(f"[{dbNode.name}] Current version: {current_version_dir}")
                else:
                    self.logger.warn(f"[{dbNode.name}] No current version found, this appears to be a fresh installation")
                
                # ===== 步骤2: 停止该节点服务 =====
                if cm_ctl_path:
                    self.logger.log(f"[{dbNode.name}] Step 2: Stopping services on node {node_id}")
                    
                    if dbNode.name == GetHostIpOrName():
                        cmd = f"{env_source}{cm_ctl_path} stop -n {node_id}"
                        status, output = subprocess.getstatusoutput(cmd)
                    else:
                        cmd = f"ssh -q -o ConnectTimeout=5 {dbNode.sshIps[0]} '{env_source}{cm_ctl_path} stop -n {node_id}'"
                        status, output = subprocess.getstatusoutput(cmd)
                    
                    if status != 0:
                        self.logger.warn(f"[{dbNode.name}] Failed to stop services: {output}")
                        self.logger.warn(f"[{dbNode.name}] Service stop failed, but continuing with upgrade...")
                    else:
                        self.logger.log(f"[{dbNode.name}] Services stopped successfully")
                    
                    node_rollback_info[dbNode.name]['services_stopped'] = True
                    time.sleep(2)
                
                # ===== 步骤3: 备份当前版本 =====
                self.logger.log(f"[{dbNode.name}] Step 3: Backup current version")
                
                cmd = f"mkdir -p {backup_dir}"
                self.execute_on_node(dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                
                cmd = f"readlink -f {app_link_path} 2>/dev/null || echo ''"
                status, output = self.execute_on_node(
                    dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                
                if status == 0 and output.strip():
                    current_version_dir = output.strip()
                    current_version_name = os.path.basename(current_version_dir)
                    backup_file = os.path.join(backup_dir, f"{current_version_name}-backup-{timestamp}.tar.gz")
                    
                    cmd = f"tar -czf {backup_file} -C {install_base} {current_version_name} 2>/dev/null && echo 'OK' || echo 'FAILED'"
                    status, output = self.execute_on_node(
                        dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                    if status == 0 and 'OK' in output:
                        self.logger.log(f"[{dbNode.name}] Backed up to {backup_file}")
                    else:
                        self.logger.warn(f"[{dbNode.name}] Backup failed: {output}")
                
                # ===== 步骤4: 安装新版本 =====
                self.logger.log(f"[{dbNode.name}] Step 4: Install new version")
                
                # 如果版本目录已存在,先删除
                cmd = f"test -d {app_version_path} && rm -rf {app_version_path} || true"
                self.execute_on_node(dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                
                # 获取当前版本目录,如果存在则复制原有内容
                cmd = f"readlink -f {app_link_path} 2>/dev/null || echo ''"
                status, output = self.execute_on_node(
                    dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                
                if status == 0 and output.strip():
                    current_version_dir = output.strip()
                    self.logger.log(f"[{dbNode.name}] Copying existing directory from {current_version_dir}")
                    cmd = f"cp -a {current_version_dir} {app_version_path}"
                    status, output = self.execute_on_node(
                        dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                    if status != 0:
                        self.logger.warn(f"[{dbNode.name}] Failed to copy existing directory, creating new: {output}")
                        cmd = f"mkdir -p {app_version_path}"
                        status, output = self.execute_on_node(
                            dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                        if status != 0:
                            raise Exception(f"Failed to create version directory on {dbNode.name}: {output}")
                else:
                    self.logger.log(f"[{dbNode.name}] No existing version found, creating new directory")
                    cmd = f"mkdir -p {app_version_path}"
                    status, output = self.execute_on_node(
                        dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                    if status != 0:
                        raise Exception(f"Failed to create version directory on {dbNode.name}: {output}")
                
                # 解压安装包
                local_pkg_path = PkgPath if dbNode.name == GetHostIpOrName() else os.path.join(clusterInfo.toolPath, pkgName)
                temp_extract_dir = os.path.join(clusterInfo.toolPath, f"temp_extract_{version_str}")
                
                cmd = f"rm -rf {temp_extract_dir} && mkdir -p {temp_extract_dir} && tar -zxf {local_pkg_path} -C {temp_extract_dir} && echo 'OK' || echo 'FAILED'"
                status, output = self.execute_on_node(
                    dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                if status != 0 or 'FAILED' in output:
                    raise Exception(f"Failed to decompress package on {dbNode.name}: {output}")
                
                # 用新包内容覆盖
                self.logger.log(f"[{dbNode.name}] Replacing files with new package content")
                cmd = f"if [ -d {temp_extract_dir}/bin ]; then cp -r {temp_extract_dir}/bin/* {app_version_path}/bin/ 2>/dev/null || cp -r {temp_extract_dir}/bin {app_version_path}/; fi && " \
                      f"if [ -d {temp_extract_dir}/lib ]; then cp -r {temp_extract_dir}/lib/* {app_version_path}/lib/ 2>/dev/null || cp -r {temp_extract_dir}/lib {app_version_path}/; fi && " \
                      f"if [ -d {temp_extract_dir}/share ]; then cp -r {temp_extract_dir}/share/* {app_version_path}/share/ 2>/dev/null || cp -r {temp_extract_dir}/share {app_version_path}/; fi && " \
                      f"if [ -d {temp_extract_dir}/include ]; then cp -r {temp_extract_dir}/include/* {app_version_path}/include/ 2>/dev/null || cp -r {temp_extract_dir}/include {app_version_path}/; fi && " \
                      f"if [ -d {temp_extract_dir}/add-ons ]; then cp -r {temp_extract_dir}/add-ons/* {app_version_path}/add-ons/ 2>/dev/null || cp -r {temp_extract_dir}/add-ons {app_version_path}/; fi && " \
                      f"echo 'OK' || echo 'FAILED'"
                status, output = self.execute_on_node(
                    dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                if status != 0 or 'FAILED' in output:
                    self.logger.warn(f"[{dbNode.name}] Some files may not have been replaced: {output}")
                
                # 复制其他文件
                cmd = f"find {temp_extract_dir} -maxdepth 1 -type f -exec cp -f {{}} {app_version_path}/ \\; 2>/dev/null || true"
                self.execute_on_node(dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                
                # 清理临时目录
                cmd = f"rm -rf {temp_extract_dir}"
                self.execute_on_node(dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                
                # 设置权限
                cmd = f"find {app_version_path} -type d -exec chmod 755 {{}} \\;"
                self.execute_on_node(dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                cmd = f"find {app_version_path}/bin -type f -exec chmod 755 {{}} \\; 2>/dev/null || true"
                self.execute_on_node(dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                cmd = f"find {app_version_path}/lib -type f -exec chmod 644 {{}} \\; 2>/dev/null || true"
                self.execute_on_node(dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                
                # 修复SSL证书权限
                sslcert_path = os.path.join(app_version_path, "share", "sslcert")
                cmd = f"if [ -d {sslcert_path} ]; then " \
                      f"find {sslcert_path} -type f \\( -name '*.pem' -o -name '*.crt' -o -name '*.key' -o -name '*.csr' \\) -exec chmod 400 {{}} \\; && " \
                      f"find {sslcert_path} -type d -exec chmod 700 {{}} \\; && " \
                      f"echo 'SSL cert permissions fixed'; fi"
                self.execute_on_node(dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                
                cmd = f"find {app_version_path} -type f ! -path '*/bin/*' ! -path '*/lib/*' ! -path '*/share/sslcert/*' -exec chmod 644 {{}} \\; 2>/dev/null || true"
                self.execute_on_node(dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                
                self.logger.log(f"[{dbNode.name}] New version installed: {app_version_path}")
                
                # ===== 步骤5: 验证新版本 =====
                self.logger.log(f"[{dbNode.name}] Step 5: Verify new version")
                key_files = ["bin/grserver", "bin/grcmd"]
                for key_file in key_files:
                    cmd = f"test -f {app_version_path}/{key_file} && echo 'EXISTS' || echo 'MISSING'"
                    status, output = self.execute_on_node(
                        dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                    if status != 0 or 'MISSING' in output:
                        raise Exception(f"Required file missing on {dbNode.name}: {key_file}")
                
                # ===== 步骤6: 原子性切换软链接 =====
                self.logger.log(f"[{dbNode.name}] Step 6: Atomically switch symlink")
                link_name = os.path.basename(app_link_path)
                temp_link_name = f"{link_name}.new"
                temp_link_path = os.path.join(install_base, temp_link_name)
                
                # 删除临时链接
                cmd = f"rm -f {temp_link_path}"
                self.execute_on_node(dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                
                # 创建临时软链接
                cmd = f"cd {install_base} && ln -sfn {app_version_dir} {temp_link_name}"
                status, output = self.execute_on_node(
                    dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                if status != 0:
                    raise Exception(f"Failed to create temporary symlink on {dbNode.name}: {output}")
                
                # 标记软链接即将被修改
                node_rollback_info[dbNode.name]['symlink_switched'] = True
                
                # 删除旧链接并移动新链接
                cmd = f"cd {install_base} && rm -f {link_name} && mv {temp_link_name} {link_name} && echo 'OK' || echo 'FAILED'"
                status, output = self.execute_on_node(
                    dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                if status != 0 or 'FAILED' in output:
                    raise Exception(f"Failed to switch symlink on {dbNode.name}: {output}")
                
                # 验证软链接
                cmd = f"readlink {app_link_path}"
                status, output = self.execute_on_node(
                    dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                if status != 0 or output.strip() != app_version_dir:
                    # 强制重新创建
                    cmd = f"cd {install_base} && rm -f {link_name} && ln -sfn {app_version_dir} {link_name}"
                    status, output = self.execute_on_node(
                        dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                    if status != 0:
                        raise Exception(f"Failed to force recreate symlink on {dbNode.name}: {output}")
                
                self.logger.log(f"[{dbNode.name}] Symlink switched: {app_link_path} -> {app_version_dir}")
                
                # ===== 步骤7: 后检查 =====
                self.logger.log(f"[{dbNode.name}] Step 7: Post-upgrade check")
                cmd = f"test -f {app_link_path}/bin/grserver && test -f {app_link_path}/bin/grcmd && echo 'OK' || echo 'FAILED'"
                status, output = self.execute_on_node(
                    dbNode.name, cmd, local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                if status != 0 or 'FAILED' in output:
                    raise Exception(f"Post-upgrade verification failed on {dbNode.name}")
                
                # ===== 步骤8: 启动该节点服务 =====
                if cm_ctl_path:
                    self.logger.log(f"[{dbNode.name}] Step 8: Starting services on node {node_id}")
                    
                    if dbNode.name == GetHostIpOrName():
                        cmd = f"{env_source}{cm_ctl_path} start -n {node_id} -t 20"
                        status, output = subprocess.getstatusoutput(cmd)
                    else:
                        cmd = f"ssh -q -o ConnectTimeout=5 {dbNode.sshIps[0]} '{env_source}{cm_ctl_path} start -n {node_id} -t 20'"
                        status, output = subprocess.getstatusoutput(cmd)
                    
                    if status != 0:
                        # 服务启动失败,需要回滚
                        raise Exception(f"Failed to start services on node {node_id}: {output}")
                    
                    self.logger.log(f"[{dbNode.name}] Services started successfully")
                    node_rollback_info[dbNode.name]['services_stopped'] = False
                    
                    # 等待服务完全启动
                    time.sleep(5)
                
                # ===== 步骤9: 验证该节点服务状态(带重试) =====
                self.logger.log(f"[{dbNode.name}] Step 9: Verify service status")
                max_retries = 3
                retry_interval = 10  # 秒
                service_check_passed = False
                last_error = None
                
                for retry in range(max_retries):
                    try:
                        self._check_gr_status(user, [dbNode], f"post-upgrade node {node_id}")
                        service_check_passed = True
                        break
                    except Exception as check_error:
                        last_error = check_error
                        if retry < max_retries - 1:
                            self.logger.warn(f"[{dbNode.name}] Service status check failed (attempt {retry + 1}/{max_retries}): {check_error}")
                            self.logger.log(f"[{dbNode.name}] Waiting {retry_interval}s before retry...")
                            time.sleep(retry_interval)
                        else:
                            self.logger.warn(f"[{dbNode.name}] Service status check failed after {max_retries} attempts")
                
                if not service_check_passed:
                    # 服务状态检查最终失败,需要回滚
                    raise Exception(f"Service status check failed on {dbNode.name} after {max_retries} retries: {last_error}")
                
                self.logger.log(f"[{dbNode.name}] Service status verified successfully")
                
                # 标记该节点升级完成
                node_rollback_info[dbNode.name]['upgrade_completed'] = True
                upgraded_nodes.append(dbNode)
                
                self.logger.log(f"[{dbNode.name}] ✓ Node upgrade completed successfully!")
                self.logger.log("")
                
            except Exception as e:
                # 该节点升级失败,执行回滚
                self.logger.log("")
                self.logger.log("=" * 60)
                self.logger.log(f"NODE {dbNode.name} UPGRADE FAILED! Starting full rollback...")
                self.logger.log("=" * 60)
                self.logger.log(f"Error: {str(e)}")
                
                # 需要回滚的节点:所有已升级成功的节点 + 当前失败的节点
                nodes_to_rollback = upgraded_nodes + [dbNode]
                self.logger.log(f"Nodes to rollback: {[n.name for n in nodes_to_rollback]}")
                
                try:
                    # 回滚所有已升级的节点和当前失败的节点
                    self._rollback_upgrade(clusterInfo, nodes_to_rollback, node_rollback_info, 
                                          cm_ctl_path, user, version_str, install_base, app_link_path)
                    self.logger.log("Rollback completed successfully")
                except Exception as rollback_error:
                    self.logger.log("=" * 60)
                    self.logger.log("ROLLBACK FAILED! Manual intervention required!")
                    self.logger.log("=" * 60)
                    self.logger.log(f"Rollback error: {str(rollback_error)}")
                    self.logger.log(f"Please manually restore the following nodes:")
                    for n in nodes_to_rollback:
                        info = node_rollback_info.get(n.name, {})
                        self.logger.log(f"  {n.name}: old_version={info.get('old_version_dir', 'UNKNOWN')}")
                
                # 停止升级,不继续处理后续节点
                self.logger.log("")
                self.logger.log("=" * 60)
                self.logger.log("ROLLING UPGRADE ABORTED AND ROLLED BACK!")
                self.logger.log("=" * 60)
                self.logger.log(f"Rolled back nodes: {[n.name for n in nodes_to_rollback]}")
                self.logger.log(f"Failed node: {dbNode.name}")
                remaining_nodes = nodes_to_upgrade[node_index + 1:]
                if remaining_nodes:
                    self.logger.log(f"Remaining nodes (not upgraded): {[n.name for n in remaining_nodes]}")
                
                raise Exception(f"Rolling upgrade failed at node {dbNode.name}, all upgraded nodes rolled back: {str(e)}")
        
        # 所有节点升级成功
        self.logger.log("")
        self.logger.log("=" * 60)
        self.logger.log("ROLLING UPGRADE COMPLETED SUCCESSFULLY!")
        self.logger.log("=" * 60)
        self.logger.log(f"Upgraded {len(upgraded_nodes)} node(s): {[n.name for n in upgraded_nodes]}")
        self.logger.log(f"New version: {version_str}")
    
    def _rollback_upgrade(self, clusterInfo, nodes_to_upgrade, node_rollback_info,
                          cm_ctl_path, user, new_version_str, install_base, app_link_path):
        """
        Roll back an upgrade on the given nodes.

        For each node, in order: stop its services with cm_ctl, point the APP
        symlink back at the recorded previous version, remove the new version
        directory, and start the services again. Every step is best-effort:
        a failing shell command is logged as a warning and the rollback moves
        on, so this method does not raise because a step failed.

        Args:
            clusterInfo: cluster information; the 1-based position of a node
                in clusterInfo.dbNodes is used as its cm_ctl node id.
            nodes_to_upgrade: nodes to roll back.
            node_rollback_info: dict keyed by node name; the
                'old_version_name' and 'old_version_dir' entries are read.
            cm_ctl_path: path of cm_ctl; when falsy, services are neither
                stopped nor restarted.
            user: user name used to look up MPPDB_ENV_SEPARATE_PATH.
            new_version_str: new version string; the directory
                APP_<new_version_str> under install_base is removed.
            install_base: installation base directory.
            app_link_path: path of the APP symlink.
        """
        self.logger.log("=" * 60)
        self.logger.log("ROLLBACK: Restoring previous version")
        self.logger.log("=" * 60)

        app_version_dir = f"APP_{new_version_str}"
        app_version_path = os.path.join(install_base, app_version_dir)
        link_name = os.path.basename(app_link_path)

        # Optional environment file that has to be sourced before calling cm_ctl.
        envfile_path = self.__getEnvironmentParameterValue("MPPDB_ENV_SEPARATE_PATH", user)
        env_source = f"source {envfile_path} && " if envfile_path else ""

        # The local host name does not change during the rollback; resolve it once.
        local_name = GetHostIpOrName()

        def run_on_node(node, command):
            # Run a shell command on the node (locally or over ssh).
            return self.execute_on_node(
                node.name, command, local_name=local_name, ssh_ip=node.sshIps[0])

        def run_cm_ctl(node, arguments):
            # Run cm_ctl with the given arguments, wrapping it in ssh for remote nodes.
            command = f"{env_source}{cm_ctl_path} {arguments}"
            if node.name != local_name:
                command = f"ssh -q -o ConnectTimeout=5 {node.sshIps[0]} '{command}'"
            return subprocess.getstatusoutput(command)

        # Nodes on which restoring the symlink or restarting the services failed.
        nodes_needing_attention = []

        for dbNode in nodes_to_upgrade:
            rollback_info = node_rollback_info.get(dbNode.name, {})
            old_version_name = rollback_info.get('old_version_name')
            old_version_dir = rollback_info.get('old_version_dir')
            node_ok = True

            self.logger.log(f"Rolling back node {dbNode.name}...")
            node_id = clusterInfo.dbNodes.index(dbNode) + 1

            # 1. Stop the services first, whatever their current state, so the
            #    symlink is only switched while nothing is running.
            if cm_ctl_path:
                self.logger.log(f"Stopping services on node {node_id} ({dbNode.name}) before rollback")
                status, output = run_cm_ctl(dbNode, f"stop -n {node_id}")
                if status != 0:
                    self.logger.warn(f"Failed to stop services on node {node_id} ({dbNode.name}): {output}")
                    self.logger.warn("Services may already be stopped, continuing with rollback...")
                else:
                    self.logger.log(f"Services stopped on node {node_id} ({dbNode.name})")

                time.sleep(2)

            # 2. Restore the symlink whenever an old version is known. This is
            #    attempted even if the switch was never recorded, because the
            #    link may already have been deleted.
            if old_version_name:
                self.logger.log(f"Restoring symlink on {dbNode.name} to {old_version_name}")

                cmd = f"readlink {app_link_path} 2>/dev/null || echo 'NOT_EXISTS'"
                status, output = run_on_node(dbNode, cmd)
                current_target = output.strip() if status == 0 else 'NOT_EXISTS'

                if current_target == 'NOT_EXISTS' or current_target != old_version_name:
                    cmd = f"cd {install_base} && rm -f {link_name} && ln -sfn {old_version_name} {link_name} && echo 'OK' || echo 'FAILED'"
                    status, output = run_on_node(dbNode, cmd)
                    if status == 0 and 'OK' in output:
                        self.logger.log(f"Symlink restored on {dbNode.name}: {link_name} -> {old_version_name}")
                    else:
                        self.logger.warn(f"Failed to restore symlink on {dbNode.name}: {output}")
                        node_ok = False
                else:
                    self.logger.log(f"Symlink on {dbNode.name} already points to old version: {old_version_name}")

            # 3. Remove the new version directory, unless it is the very
            #    directory being restored (same version installed again):
            #    deleting it would wipe the version the symlink points to.
            new_dir_is_old_version = (
                (bool(old_version_name)
                 and os.path.basename(str(old_version_name).rstrip("/")) == app_version_dir)
                or (bool(old_version_dir)
                    and os.path.normpath(str(old_version_dir)) == os.path.normpath(app_version_path))
            )
            if new_dir_is_old_version:
                self.logger.warn(
                    f"Skip removing {app_version_path} on {dbNode.name}: "
                    f"it is the previous version directory")
            else:
                self.logger.log(f"Removing new version directory on {dbNode.name}: {app_version_path}")
                cmd = f"rm -rf {app_version_path}"
                status, output = run_on_node(dbNode, cmd)
                if status == 0:
                    self.logger.log(f"New version directory removed on {dbNode.name}")
                else:
                    self.logger.warn(f"Failed to remove new version directory on {dbNode.name}: {output}")

            # 4. Restart the services on the old version.
            if cm_ctl_path and old_version_name:
                self.logger.log(f"Restarting services on node {node_id} ({dbNode.name}) with old version")
                status, output = run_cm_ctl(dbNode, f"start -n {node_id} -t 20")
                if status != 0:
                    self.logger.warn(f"Failed to restart services on node {node_id} ({dbNode.name}): {output}")
                    self.logger.warn(f"Please manually restart: cm_ctl start -n {node_id} -t 20")
                    node_ok = False
                else:
                    self.logger.log(f"Services restarted successfully on node {node_id} ({dbNode.name})")

                time.sleep(3)

            if not node_ok:
                nodes_needing_attention.append(dbNode.name)

        self.logger.log("=" * 60)
        self.logger.log("ROLLBACK COMPLETED")
        self.logger.log("=" * 60)
        if nodes_needing_attention:
            # Do not claim success for nodes whose symlink or services could not be restored.
            self.logger.warn(f"Rollback steps failed on node(s): {nodes_needing_attention}")
        else:
            self.logger.log("All nodes have been rolled back to previous version")
        self.logger.log("Please verify the system status manually")

    def cm_cluster_upgrade(self, xmlFile, cmPkgPath):
        """
        Upgrade CM on the whole cluster (mode without a node id).

        Flow:
          - stop the whole cluster (cm_ctl stop)
          - back up bin/lib of the current APP directory on every node
          - overlay bin/lib from the CM package onto the APP directory
          - start the whole cluster (cm_ctl start) and check the CM status
        Only binaries and libraries are overwritten; directories and
        configuration are not re-created. On any failure the backed-up
        bin/lib are restored through _rollback_cm_upgrade and the original
        exception is re-raised.

        Args:
            xmlFile: cluster XML configuration file.
            cmPkgPath: path of the CM package (tar.gz) on the local host.

        Raises:
            Exception: when the installation path, the package, cm_ctl or the
                APP symlink cannot be resolved, or when any upgrade step fails.
        """
        user = getpass.getuser()
        gaussHome = self.__getEnvironmentParameterValue("GAUSSHOME", user)
        if gaussHome == "":
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] %
                            ("installation path of designated user [%s]" % user))

        clusterInfo = dbClusterInfo()
        clusterInfo.initFromXml(xmlFile)
        self.initLogger("cm_cluster_upgrade")

        self.logger.log("Start CM cluster upgrade (full cluster stop required).")

        nodes_to_upgrade = clusterInfo.dbNodes
        if not nodes_to_upgrade:
            raise Exception("No nodes found in cluster for CM upgrade")

        # Validate the package before the cluster is stopped: a missing file
        # would otherwise only be noticed after the outage has started.
        if not cmPkgPath or not os.path.isfile(cmPkgPath):
            raise Exception(f"CM package not found: {cmPkgPath}")

        install_base = clusterInfo.installPath
        app_link_path = os.path.join(install_base, "APP")

        # Locate cm_ctl: GAUSSHOME first, then GPHOME, then PATH.
        toolPath = self.__getEnvironmentParameterValue("GPHOME", user)
        cm_ctl_path = None
        if gaussHome:
            cm_ctl_path = os.path.join(gaussHome, "bin", "cm_ctl")
        elif toolPath:
            cm_ctl_path = os.path.join(toolPath, "bin", "cm_ctl")
        else:
            status, output = subprocess.getstatusoutput("which cm_ctl")
            if status == 0:
                cm_ctl_path = output.strip()

        if not cm_ctl_path:
            raise Exception("cm_ctl not found, cannot perform CM cluster upgrade")

        self.logger.log(f"Using cm_ctl: {cm_ctl_path}")

        local_name = GetHostIpOrName()

        def run_on_node(node, command):
            # Run a shell command on the node (locally or over ssh).
            return self.execute_on_node(
                node.name, command, local_name=local_name, ssh_ip=node.sshIps[0])

        # Rollback bookkeeping per node: resolved APP path, backup location,
        # whether services were stopped and whether bin/lib were touched.
        node_rollback_info = {}
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        backup_dir = os.path.join(clusterInfo.toolPath, f"cm_backup_{timestamp}")

        for dbNode in nodes_to_upgrade:
            cmd = f"readlink -f {app_link_path} 2>/dev/null || echo ''"
            status, output = run_on_node(dbNode, cmd)
            app_real_path = output.strip() if status == 0 else ""
            if not app_real_path:
                raise Exception(f"Failed to resolve APP symlink on {dbNode.name} for rollback preparation")
            node_rollback_info[dbNode.name] = {
                'app_real_path': app_real_path,
                'backup_dir': backup_dir,
                'services_stopped': False,
                'bin_lib_overlaid': False
            }
            self.logger.log(f"Recorded rollback info for {dbNode.name}: APP={app_real_path}")

        envfile_path = self.__getEnvironmentParameterValue("MPPDB_ENV_SEPARATE_PATH", user)
        env_source = f"source {envfile_path} && " if envfile_path else ""

        # Set once "cm_ctl start" has been issued, so a later failure knows the
        # cluster may be running on the new binaries.
        start_issued = False
        try:
            # Pre-upgrade check: the CM cluster has to be healthy.
            self._check_cm_status(user, "pre-upgrade")

            # Step CM-0: stop the whole cluster.
            self.logger.log("Step CM-0: Stopping CM services on all nodes")
            self.logger.log("Stopping services for CM upgrade")

            # Mark the nodes before issuing the stop: even a partially failed
            # stop may leave nodes down, and the rollback restarts only nodes
            # flagged as stopped.
            for info in node_rollback_info.values():
                info['services_stopped'] = True

            cmd = f"{env_source}{cm_ctl_path} stop"
            status, output = subprocess.getstatusoutput(cmd)
            if status != 0:
                # Never overwrite binaries of a cluster that may still be running.
                raise Exception(f"Failed to stop CM services: {output}")
            time.sleep(2)

            # Step CM-0.5: back up bin/lib on every node. The whole directories
            # are copied because they may also hold non-CM files.
            self.logger.log("Step CM-0.5: Backup CM bin/lib on all nodes")
            for dbNode in nodes_to_upgrade:
                rollback_info = node_rollback_info[dbNode.name]
                app_real_path = rollback_info['app_real_path']
                node_backup_dir = rollback_info['backup_dir']

                cmd = f"mkdir -p {node_backup_dir}"
                status, output = run_on_node(dbNode, cmd)
                if status != 0:
                    raise Exception(f"Failed to create backup directory on {dbNode.name}: {output}")

                for component in ("bin", "lib"):
                    source_dir = f"{app_real_path}/{component}"
                    backup_target = os.path.join(node_backup_dir, component)
                    cmd = (
                        f"if [ -d {source_dir} ]; then "
                        f"cp -a {source_dir} {backup_target} 2>/dev/null && echo 'OK' || echo 'FAILED'; "
                        f"else echo 'NO_SOURCE'; fi"
                    )
                    status, output = run_on_node(dbNode, cmd)
                    if status == 0 and 'NO_SOURCE' in output:
                        self.logger.warn(f"No {component} directory to back up on {dbNode.name}")
                    elif status == 0 and 'OK' in output:
                        self.logger.log(f"Backed up {component} directory on {dbNode.name}")
                    else:
                        # Without a backup the overlay could not be undone, so abort here.
                        raise Exception(f"Backup {component} directory failed on {dbNode.name}: {output}")

            cm_pkg_name = os.path.basename(cmPkgPath)
            temp_dir = os.path.join(clusterInfo.toolPath, f"cm_upgrade_{timestamp}")

            # Step CM-1: overlay the CM components onto APP/bin and APP/lib.
            self.logger.log("Step CM-1: Overlay CM bin/lib from cmpkg into APP on all nodes")
            for dbNode in nodes_to_upgrade:
                rollback_info = node_rollback_info[dbNode.name]
                app_real_path = rollback_info['app_real_path']

                # Remote nodes get their own copy of the package first.
                local_cm_pkg = cmPkgPath
                if dbNode.name != local_name:
                    target_pkg_path = os.path.join(clusterInfo.toolPath, cm_pkg_name)
                    self.logger.log(f"  Distributing CM package to {dbNode.name}: {target_pkg_path}")
                    cmd = f"scp {cmPkgPath} {dbNode.sshIps[0]}:{target_pkg_path}"
                    status, output = subprocess.getstatusoutput(cmd)
                    if status != 0:
                        raise Exception(f"Failed to distribute CM package to {dbNode.name}: {output}")
                    local_cm_pkg = target_pkg_path

                # Unpack the package into a temporary directory on the node.
                cmd = f"rm -rf {temp_dir} && mkdir -p {temp_dir} && tar -zxf {local_cm_pkg} -C {temp_dir} && echo 'OK' || echo 'FAILED'"
                status, output = run_on_node(dbNode, cmd)
                if status != 0 or 'FAILED' in output:
                    raise Exception(f"Failed to decompress CM package on {dbNode.name}: {output}")

                # Only bin and lib are overlaid; configuration, certificates
                # and everything else stay untouched.
                self.logger.log(f"Node {dbNode.name}: overlay CM bin/lib into {app_real_path}")

                # Flag the node before copying: an overlay that fails half-way
                # has already modified bin/lib and must be restored as well.
                rollback_info['bin_lib_overlaid'] = True

                # The parts are chained with "&&" so that a failing copy
                # reaches the final "|| echo 'FAILED'".
                overlay_cmd = (
                    f"if [ -d {temp_dir}/bin ]; then "
                    f"mkdir -p {app_real_path}/bin && "
                    f"cp -rf {temp_dir}/bin/* {app_real_path}/bin/ 2>/dev/null || cp -rf {temp_dir}/bin {app_real_path}/; "
                    f"fi && "
                    f"if [ -d {temp_dir}/lib ]; then "
                    f"mkdir -p {app_real_path}/lib && "
                    f"cp -rf {temp_dir}/lib/* {app_real_path}/lib/ 2>/dev/null || cp -rf {temp_dir}/lib {app_real_path}/; "
                    f"fi && "
                    f"echo 'OK' || echo 'FAILED'"
                )
                status, output = run_on_node(dbNode, overlay_cmd)
                if status != 0 or 'FAILED' in output:
                    raise Exception(f"Failed to overlay CM bin/lib on {dbNode.name}: {output}")

                # Basic permission fix-up: bin 755, lib 644 (best effort).
                perm_cmds = [
                    f"find {app_real_path}/bin -type f -exec chmod 755 {{}} \\; 2>/dev/null || true",
                    f"find {app_real_path}/lib -type f -exec chmod 644 {{}} \\; 2>/dev/null || true",
                ]
                for pcmd in perm_cmds:
                    run_on_node(dbNode, pcmd)

                # Remove the temporary extraction directory.
                run_on_node(dbNode, f"rm -rf {temp_dir}")

            # Step CM-2: start the whole cluster.
            self.logger.log("Step CM-2: Starting CM services on all nodes after CM upgrade")
            start_issued = True
            cmd = f"{env_source}{cm_ctl_path} start"
            status, output = subprocess.getstatusoutput(cmd)
            if status != 0:
                self.logger.warn(f"Failed to start CM services: {output}")
                self.logger.warn("Please manually start CM services using: cm_ctl start")
            else:
                self.logger.log("CM services started successfully")

            # Give the CM services time to come up completely.
            self.logger.log("Waiting for CM services to fully start...")
            time.sleep(5)

            # Post-upgrade check: the CM cluster has to be healthy again.
            self._check_cm_status(user, "post-upgrade")

            self.logger.log("=" * 60)
            self.logger.log("CM CLUSTER UPGRADE COMPLETED SUCCESSFULLY!")
            self.logger.log("=" * 60)

        except Exception as e:
            # The upgrade failed: roll back, then re-raise the original error.
            self.logger.log("=" * 60)
            self.logger.log("CM UPGRADE FAILED! Starting rollback process...")
            self.logger.log("=" * 60)
            self.logger.log(f"Error: {str(e)}")

            try:
                if start_issued:
                    # The cluster may be running on the new binaries. Stop it
                    # so the files are restored under stopped services and the
                    # per-node restart of the rollback brings up the old ones.
                    status, output = subprocess.getstatusoutput(f"{env_source}{cm_ctl_path} stop")
                    if status != 0:
                        self.logger.warn(f"Failed to stop CM services before rollback: {output}")
                self._rollback_cm_upgrade(clusterInfo, nodes_to_upgrade, node_rollback_info,
                                          cm_ctl_path, user, env_source)
            except Exception as rollback_error:
                self.logger.log("=" * 60)
                self.logger.log("CM ROLLBACK FAILED! Manual intervention required!")
                self.logger.log("=" * 60)
                self.logger.log(f"Rollback error: {str(rollback_error)}")
                self.logger.log("Please manually restore the system:")
                for dbNode in nodes_to_upgrade:
                    rollback_info = node_rollback_info.get(dbNode.name, {})
                    app_real_path = rollback_info.get('app_real_path', 'UNKNOWN')
                    node_backup_dir = rollback_info.get('backup_dir', 'UNKNOWN')
                    self.logger.log(f"  Node {dbNode.name}: Restore bin/lib from {node_backup_dir} to {app_real_path}")
                    if rollback_info.get('services_stopped', False):
                        node_id = clusterInfo.dbNodes.index(dbNode) + 1
                        self.logger.log(f"  Node {dbNode.name}: Restart services: cm_ctl start -n {node_id} -t 20")

            # Re-raise the original exception.
            raise
    
    def _rollback_cm_upgrade(self, clusterInfo, nodes_to_upgrade, node_rollback_info,
                             cm_ctl_path, user, env_source):
        """
        Roll back a CM upgrade: restore the backed-up bin/lib and restart services.

        Failures of individual steps are logged as warnings; the loop then
        continues with the next step or node.

        Args:
            clusterInfo: cluster information; the 1-based position of a node
                in clusterInfo.dbNodes is used as its cm_ctl node id.
            nodes_to_upgrade: nodes to roll back.
            node_rollback_info: dict keyed by node name holding
                'app_real_path', 'backup_dir', 'services_stopped' and
                'bin_lib_overlaid'.
            cm_ctl_path: path of cm_ctl.
            user: user name (not used by this method).
            env_source: "source <envfile> && " prefix for cm_ctl, or "".
        """
        banner = "=" * 60
        self.logger.log(banner)
        self.logger.log("CM ROLLBACK: Restoring previous CM bin/lib")
        self.logger.log(banner)

        for dbNode in nodes_to_upgrade:
            node_info = node_rollback_info.get(dbNode.name, {})
            app_real_path = node_info.get('app_real_path')
            backup_dir = node_info.get('backup_dir')
            services_stopped = node_info.get('services_stopped', False)
            bin_lib_overlaid = node_info.get('bin_lib_overlaid', False)

            self.logger.log(f"Rolling back CM upgrade on node {dbNode.name}...")

            # 1. Put the backup back in place when bin/lib were overlaid.
            if bin_lib_overlaid and backup_dir and app_real_path:
                self.logger.log(f"Restoring bin/lib from backup on {dbNode.name}")

                # bin first, then lib; both are handled the same way.
                for component, label in (("bin", "Bin"), ("lib", "Lib")):
                    backup_path = os.path.join(backup_dir, component)
                    restore_cmd = "".join((
                        f"if [ -d {backup_path} ]; then ",
                        f"  rm -rf {app_real_path}/{component} && cp -a {backup_path} {app_real_path}/{component} && echo 'OK' || echo 'FAILED'; ",
                        "else echo 'NO_BACKUP'; fi",
                    ))
                    status, output = self.execute_on_node(
                        dbNode.name, restore_cmd,
                        local_name=GetHostIpOrName(), ssh_ip=dbNode.sshIps[0])
                    if status == 0 and 'OK' in output:
                        self.logger.log(f"{label} directory restored on {dbNode.name}")
                    elif 'NO_BACKUP' in output:
                        self.logger.warn(
                            f"No {component} backup found on {dbNode.name}, skipping {component} restore")
                    else:
                        self.logger.warn(
                            f"Failed to restore {component} directory on {dbNode.name}: {output}")

            # 2. Restart the services only where they were stopped.
            if not (services_stopped and cm_ctl_path):
                continue

            node_id = clusterInfo.dbNodes.index(dbNode) + 1
            self.logger.log(f"Restarting CM services on node {node_id} ({dbNode.name})")

            # The same cm_ctl call is run directly on the local host and
            # wrapped in ssh for every other node.
            start_cmd = f"{env_source}{cm_ctl_path} start -n {node_id}"
            if dbNode.name != GetHostIpOrName():
                start_cmd = f"ssh -q -o ConnectTimeout=5 {dbNode.sshIps[0]} '{start_cmd}'"
            status, output = subprocess.getstatusoutput(start_cmd)

            if status == 0:
                self.logger.log(f"CM services restarted successfully on node {node_id} ({dbNode.name})")
            else:
                self.logger.warn(f"Failed to restart CM services on node {node_id} ({dbNode.name}): {output}")
                self.logger.warn(f"Please manually restart: cm_ctl start -n {node_id}")

            time.sleep(1)

        self.logger.log(banner)
        self.logger.log("CM ROLLBACK COMPLETED")
        self.logger.log(banner)
        self.logger.log("All nodes have been rolled back to previous CM version")
        self.logger.log("Please verify the system status manually")

    def gr_cm_combined_upgrade(self, xmlFile, grPkgPath, cmPkgPath):
        """
        Upgrade GR and CM together: one cluster stop, one cluster start, and
        a rollback of every node if any step fails.

        Flow:
          0. Pre-upgrade CM/GR status check. A failure here aborts without
             rollback because nothing has been changed yet.
          1. Copy both packages to every remote node with scp.
          2. Stop the whole cluster once with ``cm_ctl stop``.
          3. On every node: back up the current bin/lib, clone the current
             version directory to ``APP_<timestamp>``, overlay the GR and CM
             packages onto the clone, fix permissions and repoint the APP
             symlink at the clone.
          4. Start the whole cluster once with ``cm_ctl start``.
          5. Post-upgrade CM/GR status check.

        :param xmlFile: cluster XML configuration file, loaded through
            ``dbClusterInfo.initFromXml``.
        :param grPkgPath: path of the GR package on the local node; it is
            unpacked with ``tar -zxf``.
        :param cmPkgPath: path of the CM package on the local node; it is
            unpacked with ``tar -zxf``.
        :raises Exception: if the GAUSSHOME lookup returns an empty string,
            the cluster has no nodes, cm_ctl cannot be located, the APP
            symlink cannot be resolved on a node, the pre-upgrade check
            fails, or any of steps 1-5 fails (in that last case a rollback
            is attempted first and the original exception is re-raised).
        """
        user = getpass.getuser()
        gaussHome = self.__getEnvironmentParameterValue("GAUSSHOME", user)
        if gaussHome == "":
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] %
                            ("installation path of designated user [%s]" % user))

        clusterInfo = dbClusterInfo()
        clusterInfo.initFromXml(xmlFile)
        self.initLogger("gr_cm_combined_upgrade")

        banner = "=" * 60
        self.logger.log(banner)
        self.logger.log("GR + CM COMBINED UPGRADE (single stop/start)")
        self.logger.log(banner)

        nodes_to_upgrade = clusterInfo.dbNodes
        if not nodes_to_upgrade:
            raise Exception("No nodes found in cluster for upgrade")

        install_base = clusterInfo.installPath
        app_link_path = os.path.join(install_base, "APP")

        # Locate cm_ctl. An empty GAUSSHOME was rejected above, so the
        # fallbacks only apply if the lookup yields another falsy value
        # (e.g. None) -- kept as a safety net.
        toolPath = self.__getEnvironmentParameterValue("GPHOME", user)
        cm_ctl_path = None
        if gaussHome:
            cm_ctl_path = os.path.join(gaussHome, "bin", "cm_ctl")
        elif toolPath:
            cm_ctl_path = os.path.join(toolPath, "bin", "cm_ctl")
        else:
            status, output = subprocess.getstatusoutput("which cm_ctl")
            if status == 0:
                cm_ctl_path = output.strip()

        if not cm_ctl_path:
            raise Exception("cm_ctl not found, cannot perform combined upgrade")

        self.logger.log(f"Using cm_ctl: {cm_ctl_path}")

        envfile_path = self.__getEnvironmentParameterValue("MPPDB_ENV_SEPARATE_PATH", user)
        env_source = f"source {envfile_path} && " if envfile_path else ""

        # One timestamp names both the new version directory and the backup.
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        app_version_dir = f"APP_{timestamp}"
        app_version_path = os.path.join(install_base, app_version_dir)
        backup_dir = os.path.join(install_base, "backup", f"combined_backup_{timestamp}")

        gr_pkg_name = os.path.basename(grPkgPath)
        cm_pkg_name = os.path.basename(cmPkgPath)

        # Resolved once and reused for every local/remote decision below.
        local_name = GetHostIpOrName()

        def run_on(node, command):
            """Run a shell command on one node and return (status, output)."""
            return self.execute_on_node(
                node.name, command, local_name=local_name, ssh_ip=node.sshIps[0])

        def overlay_package(node, label, package, temp_dir, subdirs):
            """Unpack a package on a node and copy the given sub-directories
            over the new version directory; raise if any step fails."""
            cmd = (
                f"rm -rf {temp_dir} && mkdir -p {temp_dir} && "
                f"tar -zxf {package} -C {temp_dir} && echo 'OK' || echo 'FAILED'"
            )
            _, output = run_on(node, cmd)
            if 'FAILED' in output:
                raise Exception(f"Failed to extract {label} package on {node.name}: {output}")

            # The copy steps are chained with '&&' so that a failed copy is
            # reported as FAILED instead of being hidden behind the result of
            # the final 'rm -rf'. The temp directory is removed on both paths.
            copy_steps = " && ".join(
                f"if [ -d {temp_dir}/{sub} ]; then "
                f"cp -rf {temp_dir}/{sub}/* {app_version_path}/{sub}/ 2>/dev/null || "
                f"cp -rf {temp_dir}/{sub} {app_version_path}/; fi"
                for sub in subdirs
            )
            cmd = (
                f"{copy_steps} && rm -rf {temp_dir} && echo 'OK' || "
                f"(rm -rf {temp_dir}; echo 'FAILED')"
            )
            _, output = run_on(node, cmd)
            if 'FAILED' in output:
                raise Exception(f"Failed to overlay {label} files on {node.name}: {output}")

        # Record, per node, what the rollback needs to undo the upgrade.
        node_rollback_info = {}
        self.logger.log("Collecting current version info from all nodes...")
        for dbNode in nodes_to_upgrade:
            cmd = f"readlink -f {app_link_path} 2>/dev/null || echo ''"
            status, output = run_on(dbNode, cmd)
            current_version_dir = output.strip() if status == 0 else ""
            if not current_version_dir:
                raise Exception(f"Failed to resolve APP symlink on {dbNode.name}")
            current_version_name = os.path.basename(current_version_dir)
            node_rollback_info[dbNode.name] = {
                'old_version_dir': current_version_dir,
                'old_version_name': current_version_name,
                'backup_dir': backup_dir,
                'services_stopped': False,
                'gr_upgraded': False,
                'cm_upgraded': False,
                'symlink_switched': False
            }
            self.logger.log(f"  {dbNode.name}: current version = {current_version_name}")

        # ===== Step 0: pre-upgrade check (failure exits without rollback) =====
        self.logger.log("Step 0: Pre-upgrade status check")
        try:
            self._check_cm_status(user, "pre-upgrade")
            self._check_gr_status(user, nodes_to_upgrade, "pre-upgrade")
        except Exception as e:
            self.logger.log(banner)
            self.logger.log("PRE-UPGRADE CHECK FAILED!")
            self.logger.log(banner)
            self.logger.log(f"Error: {str(e)}")
            self.logger.log("")
            self.logger.log("The cluster is not in a healthy state for upgrade.")
            self.logger.log("Please fix the above issues and retry the upgrade.")
            self.logger.log("No changes have been made to the cluster.")
            self.logger.log(banner)
            raise Exception(f"Pre-upgrade check failed: {str(e)}") from e

        try:
            # Remote nodes read the packages from the tool path they were
            # copied to; the local node uses the paths given by the caller.
            remote_gr_pkg = os.path.join(clusterInfo.toolPath, gr_pkg_name)
            remote_cm_pkg = os.path.join(clusterInfo.toolPath, cm_pkg_name)

            # ===== Step 1: distribute the packages to all remote nodes =====
            self.logger.log("Step 1: Distributing packages to all nodes...")
            packages = (
                ("GR", grPkgPath, remote_gr_pkg),
                ("CM", cmPkgPath, remote_cm_pkg),
            )
            for dbNode in nodes_to_upgrade:
                if dbNode.name == local_name:
                    continue
                # Always overwrite so the node is guaranteed to hold the new package.
                for label, source_path, target_path in packages:
                    self.logger.log(f"  Distributing {label} package to {dbNode.name}")
                    cmd = f"scp {source_path} {dbNode.sshIps[0]}:{target_path}"
                    status, output = subprocess.getstatusoutput(cmd)
                    if status != 0:
                        raise Exception(
                            f"Failed to distribute {label} package to {dbNode.name}: {output}")
            self.logger.log("  Package distribution completed")

            # ===== Step 2: stop the whole cluster =====
            self.logger.log("Step 2: Stopping entire cluster (single stop)...")
            cmd = f"{env_source}{cm_ctl_path} stop"
            status, output = subprocess.getstatusoutput(cmd)
            if status != 0:
                self.logger.warn(f"  cm_ctl stop returned non-zero: {output}")
            else:
                self.logger.log("  Cluster stopped successfully")

            # Mark every node as stopped, whatever cm_ctl reported.
            for dbNode in nodes_to_upgrade:
                node_rollback_info[dbNode.name]['services_stopped'] = True

            time.sleep(3)

            # ===== Step 3: back up and upgrade every node =====
            self.logger.log("Step 3: Backup and upgrade on all nodes...")

            for dbNode in nodes_to_upgrade:
                rollback_info = node_rollback_info[dbNode.name]
                old_version_dir = rollback_info['old_version_dir']
                is_local = dbNode.name == local_name

                self.logger.log(f"  [{dbNode.name}] Starting upgrade...")

                # 3.1 Create the backup directory.
                status, output = run_on(dbNode, f"mkdir -p {backup_dir}")
                if status != 0:
                    self.logger.warn(
                        f"  [{dbNode.name}] Failed to create backup directory {backup_dir}: {output}")

                # 3.2 Back up the current bin/lib (best effort: warn only).
                self.logger.log(f"  [{dbNode.name}] Backing up current bin/lib...")
                for sub in ("bin", "lib"):
                    cmd = (
                        f"cp -a {old_version_dir}/{sub} {backup_dir}/{sub} 2>/dev/null "
                        f"&& echo 'OK' || echo 'FAILED'"
                    )
                    _, output = run_on(dbNode, cmd)
                    if 'FAILED' in output:
                        self.logger.warn(f"  [{dbNode.name}] Backup {sub} failed: {output}")

                # 3.3 Create the new version directory as a copy of the old one.
                self.logger.log(f"  [{dbNode.name}] Creating new version directory...")
                run_on(dbNode, f"test -d {app_version_path} && rm -rf {app_version_path} || true")
                status, output = run_on(dbNode, f"cp -a {old_version_dir} {app_version_path}")
                if status != 0:
                    raise Exception(
                        f"Failed to create new version directory on {dbNode.name}: {output}")

                # 3.4 Upgrade GR (black box).
                self.logger.log(f"  [{dbNode.name}] Upgrading GR (blackbox)...")
                overlay_package(
                    dbNode, "GR",
                    grPkgPath if is_local else remote_gr_pkg,
                    os.path.join(clusterInfo.toolPath, f"temp_gr_{timestamp}"),
                    ("bin", "lib", "share"))
                rollback_info['gr_upgraded'] = True

                # 3.5 Upgrade CM.
                self.logger.log(f"  [{dbNode.name}] Upgrading CM...")
                overlay_package(
                    dbNode, "CM",
                    cmPkgPath if is_local else remote_cm_pkg,
                    os.path.join(clusterInfo.toolPath, f"temp_cm_{timestamp}"),
                    ("bin", "lib"))
                rollback_info['cm_upgraded'] = True

                # 3.6 Fix permissions (best effort).
                for sub, mode in (("bin", "755"), ("lib", "644")):
                    run_on(
                        dbNode,
                        f"find {app_version_path}/{sub} -type f -exec chmod {mode} {{}} \\; 2>/dev/null || true")

                # 3.7 Switch the symlink.
                self.logger.log(f"  [{dbNode.name}] Switching symlink to new version...")
                # Flag the switch before attempting it: the command removes
                # the old link first, so a failure halfway through must still
                # make the rollback recreate the link to the old version.
                rollback_info['symlink_switched'] = True
                cmd = f"rm -f {app_link_path} && ln -s {app_version_path} {app_link_path} && echo 'OK' || echo 'FAILED'"
                _, output = run_on(dbNode, cmd)
                if 'FAILED' in output:
                    raise Exception(f"Failed to switch symlink on {dbNode.name}: {output}")

                self.logger.log(f"  [{dbNode.name}] Upgrade completed")

            # ===== Step 4: start the whole cluster =====
            self.logger.log("Step 4: Starting entire cluster (single start)...")
            cmd = f"{env_source}{cm_ctl_path} start"
            status, output = subprocess.getstatusoutput(cmd)
            if status != 0:
                self.logger.warn(f"  cm_ctl start returned non-zero: {output}")
            else:
                self.logger.log("  Cluster started successfully")

            # Give the services time to come up before checking them.
            self.logger.log("  Waiting for services to fully start...")
            time.sleep(10)

            # ===== Step 5: post-upgrade check =====
            self.logger.log("Step 5: Post-upgrade status check...")
            self._check_cm_status(user, "post-upgrade")
            self._check_gr_status(user, nodes_to_upgrade, "post-upgrade")

            self.logger.log(banner)
            self.logger.log("GR + CM COMBINED UPGRADE COMPLETED SUCCESSFULLY!")
            self.logger.log(f"New version: {app_version_dir}")
            self.logger.log(banner)

        except Exception as e:
            # The upgrade failed: roll every node back, then re-raise.
            self.logger.log(banner)
            self.logger.log("COMBINED UPGRADE FAILED! Starting full rollback...")
            self.logger.log(banner)
            self.logger.log(f"Error: {str(e)}")

            try:
                self._rollback_combined_upgrade(clusterInfo, nodes_to_upgrade, node_rollback_info,
                                               app_link_path, cm_ctl_path, env_source)
            except Exception as rollback_error:
                self.logger.log(banner)
                self.logger.log("ROLLBACK FAILED! Manual intervention required!")
                self.logger.log(banner)
                self.logger.log(f"Rollback error: {str(rollback_error)}")
                self.logger.log("Please manually restore the system using backups in:")
                for dbNode in nodes_to_upgrade:
                    rollback_info = node_rollback_info.get(dbNode.name, {})
                    self.logger.log(f"  {dbNode.name}: {rollback_info.get('backup_dir', 'UNKNOWN')}")

            raise

    def _rollback_combined_upgrade(self, clusterInfo, nodes_to_upgrade, node_rollback_info,
                                   app_link_path, cm_ctl_path, env_source):
        """
        Roll back a GR + CM combined upgrade: restore the APP symlink and
        bin/lib on every node, then stop and start the cluster.

        :param clusterInfo: cluster description (not read by this method).
        :param nodes_to_upgrade: nodes to roll back; each provides ``name``
            and ``sshIps``.
        :param node_rollback_info: dict keyed by node name holding
            'old_version_dir', 'backup_dir', 'symlink_switched',
            'gr_upgraded' and 'cm_upgraded'; missing entries are treated as
            "nothing to undo".
        :param app_link_path: path of the APP symlink to repoint.
        :param cm_ctl_path: cm_ctl executable; the restart is skipped when empty.
        :param env_source: shell prefix placed in front of the cm_ctl commands.

        Failed restore steps are logged as warnings and do not raise.
        """
        separator = "=" * 60
        self.logger.log(separator)
        self.logger.log("COMBINED ROLLBACK: Restoring all nodes to previous version")
        self.logger.log(separator)

        for node in nodes_to_upgrade:
            info = node_rollback_info.get(node.name, {})
            previous_dir = info.get('old_version_dir')
            saved_dir = info.get('backup_dir')
            link_moved = info.get('symlink_switched', False)
            binaries_touched = info.get('gr_upgraded', False) or info.get('cm_upgraded', False)

            self.logger.log(f"Rolling back node {node.name}...")

            # 1. Point the symlink back at the old version.
            if link_moved and previous_dir:
                self.logger.log(f"  [{node.name}] Restoring symlink to {previous_dir}")
                relink = f"rm -f {app_link_path} && ln -s {previous_dir} {app_link_path} && echo 'OK' || echo 'FAILED'"
                _, result = self.execute_on_node(
                    node.name, relink, local_name=GetHostIpOrName(), ssh_ip=node.sshIps[0])
                if 'OK' not in result:
                    self.logger.warn(f"  [{node.name}] Failed to restore symlink: {result}")
                else:
                    self.logger.log(f"  [{node.name}] Symlink restored")

            # 2. Restore bin/lib from the backup only when an overlay happened
            #    and both the backup and the old version directory are known.
            if not (binaries_touched and saved_dir and previous_dir):
                continue

            self.logger.log(f"  [{node.name}] Restoring bin/lib from backup")
            for part in ("bin", "lib"):
                saved_part = os.path.join(saved_dir, part)
                restore = (
                    f"if [ -d {saved_part} ]; then "
                    f"rm -rf {previous_dir}/{part} && cp -a {saved_part} {previous_dir}/{part} "
                    f"&& echo 'OK' || echo 'FAILED'; else echo 'NO_BACKUP'; fi"
                )
                _, result = self.execute_on_node(
                    node.name, restore, local_name=GetHostIpOrName(), ssh_ip=node.sshIps[0])
                if 'OK' in result:
                    self.logger.log(f"  [{node.name}] {part.capitalize()} restored")
                elif 'NO_BACKUP' not in result:
                    # A missing backup is skipped silently; anything else is a failure.
                    self.logger.warn(f"  [{node.name}] Failed to restore {part}: {result}")

        # 3. Restart the whole cluster.
        self.logger.log("Restarting cluster after rollback...")
        if cm_ctl_path:
            stop_rc, stop_out = subprocess.getstatusoutput(f"{env_source}{cm_ctl_path} stop")
            if stop_rc == 0:
                self.logger.log("Cluster stopped successfully")
            else:
                self.logger.warn(f"Failed to stop cluster: {stop_out}")

            start_rc, start_out = subprocess.getstatusoutput(f"{env_source}{cm_ctl_path} start")
            if start_rc == 0:
                self.logger.log("Cluster restarted successfully")
            else:
                self.logger.warn(f"Failed to restart cluster: {start_out}")
                self.logger.warn("Please manually restart: cm_ctl start")

        self.logger.log(separator)
        self.logger.log("COMBINED ROLLBACK COMPLETED")
        self.logger.log(separator)
        self.logger.log("All nodes have been rolled back to previous version")
        self.logger.log("Please verify the system status manually")

    def preinstall(self, config_file, user, envfile):
        """
        Preinstall oGRecorder cluster using JSON configuration file
        """
        # Read JSON configuration file
        if not os.path.exists(config_file):
            print(f"Configuration file {config_file} does not exist!")
            sys.exit(1)
        
        try:
            with open(config_file, 'r', encoding='utf-8') as f:
                config = json.load(f)
        except Exception as e:
            print(f"Failed to read JSON configuration file: {e}")
            sys.exit(1)
        
        # Extract cluster information from JSON config
        if 'cluster' not in config:
            print("Missing cluster configuration in JSON file!")
            sys.exit(1)
        
        cluster_config = config['cluster']
        nodes = cluster_config.get('nodes', [])
        
        if not nodes:
            print("No nodes found in cluster configuration!")
            sys.exit(1)
        
        # Initialize logger
        clusterInfo = dbClusterInfo()
        clusterInfo.initLogger("preinstall")
        logger = clusterInfo.logger
        
        # Set paths based on JSON configuration
        install_path = cluster_config.get('install_path', '')
        worm_path = cluster_config.get('worm_path', '')
        ca_path = cluster_config.get('ca_path', '')
        clusterInfo.installPath = install_path
        clusterInfo.toolPath = os.path.join(install_path, "tool")
        clusterInfo.grPath = os.path.join(install_path, "gr")
        clusterInfo.appPath = os.path.join(install_path, "APP")
        clusterInfo.appSoftPath = clusterInfo.appPath
        clusterInfo.logPath = os.path.join(install_path, "log")
        clusterInfo.tmpPath = os.path.join(install_path, "tmp")
        clusterInfo.wormPath = worm_path
        clusterInfo.caPath = ca_path
        datadnPath = os.path.join(install_path, "data/dn") # cm need "dn" to check disk, maybe need to change
        logger.log("Start preinstall oGRecorder.")
        
        # Create directories
        for node in nodes:
            node_name = node.get('ip', node.get('name', ''))
            # Create basic directories
            cmd = (
                f"mkdir -p {clusterInfo.installPath} "
                f"{clusterInfo.toolPath} "
                f"{clusterInfo.grPath} "
                f"{clusterInfo.logPath} "
                f"{clusterInfo.tmpPath} "
                f"{datadnPath} -m 755"
            )
            status, output = self.execute_on_node(
                node_name, cmd, local_name=GetHostIpOrName(), ssh_ip=node_name)
            if status != 0:
                raise Exception(f"failed to create preinstall directory on {node_name}, Error output:\n{output}")
        logger.log("Successfully create preinstall directory.")

        # Change permissions
        for node in nodes:
            node_name = node.get('ip', node.get('name', ''))
            # 先设置wormPath权限(root用户执行,可以修改权限)
            cmd = f'chown {user}:{user} {clusterInfo.wormPath} && chmod 777 {clusterInfo.wormPath}'
            status, output = self.execute_on_node(
                node_name, cmd, local_name=GetHostIpOrName(), ssh_ip=node_name)
            if status != 0:
                logger.warn(f"Failed to set wormPath permissions on {node_name}: {output}")
                logger.warn(f"wormPath may be read-only or NFS mounted, continuing...")
            else:
                logger.log(f"Successfully set wormPath permissions on {node_name}")
            
            # 创建磁盘文件并设置权限(root用户执行)
            sharedisk_path = os.path.join(clusterInfo.wormPath, "sharedisk")
            votingdisk_path = os.path.join(clusterInfo.wormPath, "votingdisk")
            cmd = f"truncate -s 1G {sharedisk_path} && truncate -s 1G {votingdisk_path} && chmod 777 {sharedisk_path} && chmod 777 {votingdisk_path}"
            status, output = self.execute_on_node(
                node_name, cmd, local_name=GetHostIpOrName(), ssh_ip=node_name)
            if status != 0:
                logger.warn(f"Failed to create and set permissions for disk files on {node_name}: {output}")
                logger.warn(f"Continuing installation...")
            else:
                logger.log(f"Successfully created and set permissions for disk files on {node_name}")
            
            # 设置其他目录权限
            cmd = (
                f'chown -R {user}:{user} '
                f'{clusterInfo.installPath} '
                f'{clusterInfo.toolPath} '
                f'{clusterInfo.grPath} '
                f'{clusterInfo.logPath} '
                f'{clusterInfo.tmpPath} '
                f'{datadnPath}'
            )
            status, output = self.execute_on_node(
                node_name, cmd, local_name=GetHostIpOrName(), ssh_ip=node_name)
            if status != 0:
                raise Exception(f"failed to change permissions on {node_name}, Error output:\n{output}")
        logger.log("Successfully change permissions.")

        # Create environment variable file
        parent_dir = os.path.dirname(envfile)
        for node in nodes:
            node_name = node.get('ip', node.get('name', ''))
            if (node_name != GetHostIpOrName()):
                cmd = f"ssh -q -o ConnectTimeout=5 {node_name} 'mkdir -p {parent_dir} && touch {envfile}'"
            else:
                cmd = f'mkdir -p {parent_dir} && touch {envfile}'
            (status, output) = subprocess.getstatusoutput(cmd)
            if (status != 0):
                raise Exception(f"failed to create environment variable file on {node_name}, Error output:\n{output}")
        
        # Configure environment variables
        # GR_HOME 指向共享目录,但 PATH 和 LD_LIBRARY_PATH 指向 APP 软链接(版本化目录)
        app_link_path = os.path.join(clusterInfo.installPath, "APP")
        for node in nodes:
            node_name = node.get('ip', node.get('name', ''))
            if (node_name != GetHostIpOrName()):
                cmd = f'''ssh -q -o ConnectTimeout=5 {node_name} "cat > {envfile} <<'EOF'
export MPPDB_ENV_SEPARATE_PATH={envfile}
export GPHOME={clusterInfo.toolPath}
export GAUSSHOME={app_link_path}
export GAUSSLOG={clusterInfo.logPath}
export GR_HOME={clusterInfo.grPath}
export PGHOST={clusterInfo.tmpPath}
export PATH={app_link_path}/bin/:\\$PATH
export LD_LIBRARY_PATH={app_link_path}/lib/:\\$LD_LIBRARY_PATH
EOF"
'''
            else:
                cmd = f'''cat > {envfile} <<'EOF'
export MPPDB_ENV_SEPARATE_PATH={envfile}
export GPHOME={clusterInfo.toolPath}
export GAUSSHOME={app_link_path}
export GAUSSLOG={clusterInfo.logPath}
export GR_HOME={clusterInfo.grPath}
export PGHOST={clusterInfo.tmpPath}
export PATH={app_link_path}/bin/:\\$PATH
export LD_LIBRARY_PATH={app_link_path}/lib/:\\$LD_LIBRARY_PATH
EOF
'''
            (status, output) = subprocess.getstatusoutput(cmd)
            if (status != 0):
                raise Exception(f"failed to add environment variable on {node_name}, Error output:\n{output}")
        logger.log("Successfully add environment variable.")

        # Change permissions for environment file
        for node in nodes:
            node_name = node.get('ip', node.get('name', ''))
            cmd = f'chown -R {user}:{user} {parent_dir}'
            status, output = self.execute_on_node(
                node_name, cmd, local_name=GetHostIpOrName(), ssh_ip=node_name)
            if status != 0:
                raise Exception(f"failed to change permissions on {node_name}, Error output:\n{output}")
        logger.log("Successfully change permissions.")

        # Add crontab permission
        for node in nodes:
            node_name = node.get('ip', node.get('name', ''))
            cmd = f"grep -q '^{user}$' /etc/cron.allow || echo '{user}' >> /etc/cron.allow"
            status, output = self.execute_on_node(
                node_name, cmd, local_name=GetHostIpOrName(), ssh_ip=node_name)
            if status != 0:
                raise Exception(f"Failed to add user {user} to /etc/cron.allow on node {node_name}")
        logger.log("Successfully add crontab permission.")

        # Set /etc/security/limits.conf resource limits
        limits_conf_lines = [
            f"{user}       soft    as  unlimited",
            f"{user}       hard    as  unlimited",
            f"{user}       soft    nproc  unlimited",
            f"{user}       hard    nproc  unlimited",
            f"{user}       soft    nofile 1000000",
            f"{user}       hard    nofile 1000000",
        ]
        limits_conf_cmd = ""
        for line in limits_conf_lines:
            limits_conf_cmd += f"grep -q \"^{line}\" /etc/security/limits.conf || echo \"{line}\" >> /etc/security/limits.conf; "
        for node in nodes:
            node_name = node.get('ip', node.get('name', ''))
            status, output = self.execute_on_node(
                node_name, limits_conf_cmd, local_name=GetHostIpOrName(), ssh_ip=node_name)
            if status != 0:
                raise Exception(f"Failed to set limits.conf for user {user} on node {node_name}, Error output:\n{output}")
        logger.log("Successfully set limits.conf.")
        logger.log("Successfully preinstall oGRecorder.")

    def uninstall(self, config_file):
        """
        Uninstall oGRecorder from JSON configuration file.

        The steps run in this order:
          1. load and validate the JSON configuration;
          2. run ``cm_ctl stop`` when ``$GAUSSHOME/bin/cm_ctl`` exists;
          3. delete ``sharedisk`` and ``votingdisk`` under ``worm_path``;
          4. kill the cmrestapi process on every node (best effort, only
             when the cmrestapi jar exists under ``$GAUSSHOME/bin``);
          5. remove the content of ``install_path`` and the environment
             file on every node.

        :param config_file: Path of the JSON configuration file. It must
                            contain ``cluster.nodes`` (every node needs the
                            keys ``name`` and ``ip``) and
                            ``cluster.worm_path``.
        :type config_file:  str

        :raises SystemExit: Exit code 1 when the configuration file is
                            missing, unreadable or invalid, including an
                            empty or root ``install_path`` / ``worm_path``.
        :raises Exception:  When the environment file of the current user
                            cannot be determined, or when stopping the
                            cluster, deleting the cm files or cleaning a
                            node fails.
        """

        RED = '\033[0;31m'
        GREEN = '\033[0;32m'
        NC = '\033[0m'

        # Read JSON configuration file
        if not os.path.exists(config_file):
            print(f"{RED}Configuration file {config_file} does not exist!{NC}")
            sys.exit(1)

        try:
            with open(config_file, 'r', encoding='utf-8') as f:
                config = json.load(f)
        except Exception as e:
            print(f"{RED}Failed to read JSON configuration file: {e}{NC}")
            sys.exit(1)

        # Validate configuration file format. A syntactically valid JSON file
        # may still hold a non-object value (null, list, ...), so check the
        # types instead of only testing key membership.
        cluster = config.get('cluster') if isinstance(config, dict) else None
        if not isinstance(cluster, dict) or \
                not isinstance(cluster.get('nodes'), list):
            print(f"{RED}Missing cluster.nodes configuration in JSON file!{NC}")
            sys.exit(1)

        # Get cluster information
        cluster_name = cluster.get('name', 'cluster')
        install_path = cluster.get('install_path', '/home/czk/install/')
        nodes = cluster['nodes']

        if 'worm_path' not in cluster:
            print(f"{RED}Missing worm_path configuration in cluster section!{NC}")
            sys.exit(1)
        worm_path = cluster['worm_path']

        # Both paths are interpolated into "rm -rf <path>/..." below. An empty
        # value or the file system root would make those commands wipe "/",
        # so refuse to continue before anything destructive happens.
        for key, value in (('install_path', install_path),
                           ('worm_path', worm_path)):
            if not isinstance(value, str) or not value.strip() or \
                    not os.path.normpath(value.strip()).strip(os.sep):
                print(f"{RED}Invalid {key} {value!r} in cluster section: "
                      f"refusing to delete from an empty or root path!{NC}")
                sys.exit(1)

        if not nodes:
            print(f"{RED}Node list is empty!{NC}")
            sys.exit(1)

        print(f"{GREEN}Starting to uninstall oGRecorder cluster: {cluster_name}{NC}")
        print(f"{GREEN}Installation path: {install_path}{NC}")
        print(f"{GREEN}Number of nodes: {len(nodes)}{NC}")

        # Initialize logging module
        self.initLogger("uninstall")
        self.logger.log("Start uninstall oGrecorder.")

        # Get current user
        user = getpass.getuser()
        envfile = self.__getEnvironmentParameterValue("MPPDB_ENV_SEPARATE_PATH", user)
        if (envfile == ""):
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % \
                            ("installation path of designated user [%s]" % user))

        # stop cluster
        gausshome = self.__getEnvironmentParameterValue("GAUSSHOME", user)
        if os.path.exists(os.path.join(gausshome, "bin", "cm_ctl")):
            stop_cmd = "cm_ctl stop"
            status, output = subprocess.getstatusoutput(stop_cmd)
            if status != 0:
                error_msg = f"failed to stop oGRecorder cluster. Error output:\n{output}"
                print(f"{RED}{error_msg}{NC}")
                raise Exception(error_msg)
            print(f"{GREEN}Successfully stop cluster{NC}")

        # rm sharedisk and votingdisk
        cmd = f"rm -rf {worm_path}/sharedisk {worm_path}/votingdisk"
        status, output = subprocess.getstatusoutput(cmd)
        if status != 0:
            error_msg = f"failed to delete cm file. Error output:\n{output}"
            print(f"{RED}{error_msg}{NC}")
            raise Exception(error_msg)
        print(f"{GREEN}Successfully delete cm file{NC}")

        # Resolve the local address once; every node is compared against it
        # to decide between running a command locally and running it via ssh.
        local_ip = socket.gethostbyname(socket.gethostname())

        # kill cmrestapi
        cmrestapi_path = os.path.join(gausshome, "bin", "cmrestapi-7.0.0-RC2-RELEASE.jar")
        if os.path.exists(cmrestapi_path):
            # Build command - $2 in single quotes won't be interpreted by shell
            kill_cmd = "ps aux | grep 'cmrestapi-7.0.0-RC2-RELEASE.jar' | grep -v grep | awk '{print $2}' | xargs kill -9 2>/dev/null"
            # Remote variant - escape single quotes for SSH
            escaped_cmd = kill_cmd.replace("'", "'\\''")
            for node in nodes:
                node_name = node['name']
                node_ip = node['ip']

                print(f"{GREEN}[{node_name}] Starting to stop restapi...{NC}")

                # Best effort: the exit status is ignored on purpose, the
                # pipeline also fails when no cmrestapi process is running.
                if node_ip == local_ip:
                    subprocess.getstatusoutput(kill_cmd)
                else:
                    ssh_cmd = f"ssh -q -o ConnectTimeout=5 {user}@{node_ip} '{escaped_cmd}'"
                    subprocess.getstatusoutput(ssh_cmd)

        # Clean directories on each node
        cleanup_cmd = f'rm -rf {install_path}/* {install_path}/log/* {envfile}'
        for node in nodes:
            node_name = node['name']
            node_ip = node['ip']

            print(f"{GREEN}[{node_name}] Starting to clean node...{NC}")

            # Execute cleanup command
            if node_ip == local_ip:
                # Local node
                status, output = subprocess.getstatusoutput(cleanup_cmd)
            else:
                # Remote node
                ssh_cmd = f"ssh -q -o ConnectTimeout=5 {user}@{node_ip} '{cleanup_cmd}'"
                status, output = subprocess.getstatusoutput(ssh_cmd)

            if status != 0:
                error_msg = f"failed to clean oGRecorder directory on {node_name}. Error output:\n{output}"
                print(f"{RED}{error_msg}{NC}")
                raise Exception(error_msg)

            print(f"{GREEN}[{node_name}] Node cleanup completed{NC}")

        self.logger.log("Successfully uninstall oGRecorder.")
        print(f"{GREEN}oGRecorder uninstallation completed!{NC}")

    def distribute_gr_certs(self, config_file, envFile):
        """
        Generate GR certificates and distribute to all nodes, then reload certificates.

        The certificates are generated locally with ``grcmd gencert`` (ca,
        server and client), copied with ``scp`` to every node whose address
        differs from ``GetHostIpOrName()``, restricted to mode 400 (top-level
        files of the CA directory only) and finally reloaded on every node
        with ``grcmd reload_certs``.

        :param config_file: Path of the JSON configuration file; it must
                            contain a ``cluster`` object with a non-empty
                            ``nodes`` list. Each node is addressed by its
                            ``ip`` value, falling back to ``name``.
        :param envFile:     Environment file that is sourced before every
                            ``grcmd`` invocation.

        :type config_file:  str
        :type envFile:      str

        :raises SystemExit: Exit code 1 when the configuration file is
                            missing, unreadable or has no usable
                            ``cluster`` / ``nodes`` section.

        Command failures are reported through ``logger.logExit``.
        """
        # Read JSON configuration file
        if not os.path.exists(config_file):
            print(f"Configuration file {config_file} does not exist!")
            sys.exit(1)

        try:
            with open(config_file, 'r', encoding='utf-8') as f:
                config = json.load(f)
        except Exception as e:
            print(f"Failed to read JSON configuration file: {e}")
            sys.exit(1)

        # Extract cluster information from JSON config. The file may be valid
        # JSON and still hold null or a non-object value, so check the type
        # instead of only testing key membership.
        cluster_config = config.get('cluster') if isinstance(config, dict) else None
        if not isinstance(cluster_config, dict):
            print("Missing cluster configuration in JSON file!")
            sys.exit(1)

        nodes = cluster_config.get('nodes', [])
        if not nodes or not isinstance(nodes, list):
            print("No nodes found in cluster configuration!")
            sys.exit(1)

        # Address of every node, resolved once before any command runs so a
        # malformed node entry fails before certificates are generated.
        node_names = [node.get('ip', node.get('name', '')) for node in nodes]

        # Initialize logger
        clusterInfo = dbClusterInfo()
        clusterInfo.initLogger("gr_certs")
        logger = clusterInfo.logger

        # Set paths based on JSON configuration
        install_path = cluster_config.get('install_path', '/home/czk/install/')
        worm_path = cluster_config.get('worm_path', '/home/czk/data/')
        clusterInfo.installPath = install_path
        clusterInfo.grPath = os.path.join(install_path, "gr")
        clusterInfo.wormPath = worm_path  # keep worm_path on clusterInfo

        # Certificate directory, avoid duplicate CA concatenation
        ca_path = clusterInfo.grPath
        if not ca_path.endswith("CA"):
            ca_path = os.path.join(ca_path, "CA")

        # Create certificate directory if it doesn't exist
        if not os.path.exists(ca_path):
            logger.log(f"Creating certificate directory: {ca_path}")
            os.makedirs(ca_path, exist_ok=True)

        # Generate certificates
        logger.log("Start generate gr certs.")
        logger.log(f"Certificate path: {ca_path}")
        gen_cert_cmd = f"source {envFile} && grcmd gencert -t ca -d 1000 && grcmd gencert -t server -d 1000 && grcmd gencert -t client -d 1000"
        status, output = subprocess.getstatusoutput(gen_cert_cmd)
        if status != 0:
            logger.logExit(f"failed to create gr cert, Error output:\n{output}")
        logger.log("Successfully generate gr certs.")

        # Verify certificates were created
        if not os.path.exists(ca_path) or not os.listdir(ca_path):
            logger.logExit(f"Certificate directory {ca_path} is empty or does not exist after generation")
        logger.log(f"Certificates generated successfully in: {ca_path}")

        # Fix local permissions to ensure scp works (best effort)
        chmod_cmd = f"chmod -R 755 {ca_path}"
        subprocess.getstatusoutput(chmod_cmd)

        # Look up the local host identity once; every loop below compares the
        # node address against this single value.
        local_host = GetHostIpOrName()

        # Fix remote permissions in advance to prevent scp failure (best effort)
        for node_name in node_names:
            if node_name != local_host:
                fix_perm_cmd = f"ssh -q -o ConnectTimeout=5 {node_name} 'mkdir -p {ca_path}; chmod -R 755 {ca_path}'"
                subprocess.getstatusoutput(fix_perm_cmd)

        # Distribute certificates
        for node_name in node_names:
            if node_name != local_host:
                logger.log(f"Distributing certificates to node: {node_name}")
                cmd = f"scp -r {ca_path}/* {node_name}:{ca_path}/"
                status, output = subprocess.getstatusoutput(cmd)
                if status != 0:
                    logger.logExit(f"failed to distribute gr cert to {node_name}, Error output:\n{output}")
                logger.log(f"Successfully distributed certificates to {node_name}")
            else:
                logger.log(f"Skipping local node: {node_name}")
        logger.log("Successfully distribute gr certs.")

        # Restore permissions for top-level files under CA path (skip demoCA directory):
        # only chmod files directly under CA dir, do not touch demoCA subtree.
        # The exit status is ignored (best effort).
        perm_cmd = f"find {ca_path} -maxdepth 1 -type f -exec chmod 400 {{}} +"
        for node_name in node_names:
            if node_name != local_host:
                cmd = f"ssh -q -o ConnectTimeout=5 {node_name} '{perm_cmd}'"
            else:
                cmd = perm_cmd
            subprocess.getstatusoutput(cmd)

        # Reload certificates
        reload_cmd = f"source {envFile} && grcmd reload_certs"
        for node_name in node_names:
            if node_name != local_host:
                cmd = f"ssh -q -o ConnectTimeout=5 {node_name} '{reload_cmd}'"
            else:
                cmd = reload_cmd
            status, output = subprocess.getstatusoutput(cmd)
            if status != 0:
                logger.logExit(f"failed to reload gr cert on {node_name}, Error output:\n{output}")
        logger.log("Successfully reload gr certs on all nodes.")

    def setup_ssh_trust(self, config_file):
        """
        Configure SSH mutual trust between the nodes listed in a JSON file.

        For every host (the ``ip`` of each ``cluster.nodes`` entry, plus the
        local host name when it is not in that list) the method:
          1. generates an ed25519 key pair when none exists;
          2. collects the public keys of all hosts;
          3. appends all public keys to ``~/.ssh/authorized_keys`` and the
             ``ssh-keyscan`` output to ``~/.ssh/known_hosts`` on every host;
          4. verifies that every host can ssh to every other host and prints
             the result per pair.

        :param config_file: Path of the JSON configuration file containing
                            ``cluster.nodes`` (each node needs an ``ip``).
        :type config_file:  str

        :raises SystemExit: Exit code 1 when the configuration file is
                            missing, unreadable, has no ``cluster.nodes``
                            list or the host list is empty.
        """
        RED = '\033[0;31m'
        GREEN = '\033[0;32m'
        YELLOW = '\033[0;33m'
        NC = '\033[0m'

        # Read host list from JSON config
        if not os.path.exists(config_file):
            print(f"{RED}Configuration file {config_file} does not exist!{NC}")
            sys.exit(1)

        try:
            with open(config_file, 'r', encoding='utf-8') as f:
                config = json.load(f)
        except Exception as e:
            print(f"{RED}Failed to read JSON configuration file: {e}{NC}")
            sys.exit(1)

        # Extract hosts from JSON config. Valid JSON may still hold null or a
        # non-object value, so check the types instead of key membership only.
        cluster = config.get('cluster') if isinstance(config, dict) else None
        if not isinstance(cluster, dict) or \
                not isinstance(cluster.get('nodes'), list):
            print(f"{RED}Missing cluster.nodes configuration in JSON file!{NC}")
            sys.exit(1)

        hosts = sorted({node['ip'] for node in cluster['nodes']})
        if not hosts:
            print(f"{RED}Host list is empty!{NC}")
            sys.exit(1)
        admin_node = socket.gethostname()
        if admin_node not in hosts:
            print(f"{YELLOW}Host list does not include current node({admin_node}), adding automatically...{NC}")
            hosts = [admin_node] + hosts

        key_type = "ed25519"
        ssh_port = 22
        user = getpass.getuser()
        tmp_dir = tempfile.mkdtemp(prefix="ssh_trust_")

        # The scratch directory holds the collected public keys; remove it
        # even when one of the steps below raises.
        try:
            # 1. Generate SSH keys
            for host in hosts:
                print(f"{GREEN}[{host}] Generating SSH keys...{NC}")
                if host == admin_node:
                    key_path = os.path.expanduser(f"~/.ssh/id_{key_type}")
                    if not os.path.exists(key_path):
                        os.system(f'ssh-keygen -t {key_type} -f {key_path} -N "" -q')
                else:
                    os.system(f'ssh -p {ssh_port} {user}@{host} "if [ ! -f ~/.ssh/id_{key_type} ]; then ssh-keygen -t {key_type} -f ~/.ssh/id_{key_type} -N \'\' -q; fi"')

            # 2. Collect public keys
            pub_keys = []
            for host in hosts:
                if host == admin_node:
                    pub_path = os.path.expanduser(f"~/.ssh/id_{key_type}.pub")
                    with open(pub_path, "r") as f:
                        pub_keys.append(f.read().strip())
                else:
                    tmp_pub = os.path.join(tmp_dir, f"{host}.pub")
                    os.system(f"scp -P {ssh_port} {user}@{host}:~/.ssh/id_{key_type}.pub {tmp_pub} 2>/dev/null")
                    # A host whose key could not be fetched is skipped
                    if os.path.exists(tmp_pub):
                        with open(tmp_pub, "r") as f:
                            pub_keys.append(f.read().strip())
            all_keys = "\n".join(pub_keys)

            # 3. Distribute authorized_keys and known_hosts
            # Generate known_hosts. Create the file first so it can be read
            # back even when every scan fails, and close it before the shell
            # appends to it.
            known_hosts_path = os.path.join(tmp_dir, "known_hosts")
            with open(known_hosts_path, "w"):
                pass
            for host in hosts:
                os.system(f"ssh-keyscan -p {ssh_port} -H {host} >> {known_hosts_path} 2>/dev/null")
            with open(known_hosts_path, "r") as f:
                known_hosts_content = f.read()

            # Key bundle copied to the remote hosts; identical for all of them
            tmp_auth = os.path.join(tmp_dir, "all_keys")
            with open(tmp_auth, "w") as f:
                f.write(all_keys)

            for host in hosts:
                print(f"{GREEN}[{host}] Configuring authorized_keys and known_hosts...{NC}")
                if host == admin_node:
                    auth_path = os.path.expanduser("~/.ssh/authorized_keys")
                    with open(auth_path, "a") as f:
                        f.write("\n" + all_keys + "\n")
                    with open(os.path.expanduser("~/.ssh/known_hosts"), "a") as f:
                        f.write(known_hosts_content)
                    os.system("chmod 600 ~/.ssh/authorized_keys")
                else:
                    os.system(f"scp -P {ssh_port} {tmp_auth} {known_hosts_path} {user}@{host}:~/")
                    os.system(f'''ssh -p {ssh_port} {user}@{host} "mkdir -p ~/.ssh; cat ~/all_keys >> ~/.ssh/authorized_keys; cat ~/known_hosts >> ~/.ssh/known_hosts; rm -f ~/all_keys ~/known_hosts; chmod 700 ~/.ssh; chmod 600 ~/.ssh/authorized_keys"''')

            # 4. Verify mutual trust
            # os.system runs the command through /bin/sh, so use the POSIX
            # form ">/dev/null 2>&1". The bash-only "&>/dev/null" is parsed
            # by other shells as "run in background", which makes the exit
            # status 0 regardless of the ssh result.
            print(f"{GREEN}Verifying mutual trust between nodes...{NC}")
            for src in hosts:
                for dst in hosts:
                    if src == dst:
                        continue
                    if src == admin_node:
                        ret = os.system(f"ssh -p {ssh_port} {dst} 'echo -n' >/dev/null 2>&1")
                    else:
                        ret = os.system(f"ssh -p {ssh_port} {src} \"ssh -p {ssh_port} {dst} 'echo -n'\" >/dev/null 2>&1")
                    if ret == 0:
                        print(f"[{src} → {dst}] {GREEN}Success{NC}")
                    else:
                        print(f"[{src} → {dst}] {RED}Failed{NC}")
        finally:
            # ignore_errors: a cleanup problem must not hide the real error
            shutil.rmtree(tmp_dir, ignore_errors=True)

        print(f"{GREEN}SSH mutual trust configuration completed for all cluster nodes!{NC}")

class ErrorCode():
    """
    Class to define output about the error message.

    Every ``GAUSS_5xx`` attribute is a dictionary that maps an error key
    (for example ``'GAUSS_50201'``) to its message template. The templates
    use ``%s`` placeholders that the caller fills in with the ``%`` operator.
    """

    def __init__(self):
        pass

    @staticmethod
    def getErrorCodeAsInt(ex, default_error_code):
        """
        Resolve the exit code from the exception instance or error message.

        In linux, the range of return values is between 0 and 255.
        So we can only use each type of error code as exit code.Such as:
            ErrorCode.GAUSS_500 : 10
            ErrorCode.GAUSS_501 : 11

        :param ex:                  Exception instance or error message
        :param default_error_code:  If the exception instance does not contain
        the exit code, use this parameter.

        :type ex:                   Exception | str
        :type default_error_code:   int

        :return:    Return the error code.
            9 represents undefined exit code.
            other number between 0 and 255 represent the specific gauss error.
        :type:      int
        """
        error_message = str(ex)
        # The leading group is greedy, so the LAST "[GAUSS-n]" tag of the
        # message is the one captured; at least one character must follow it.
        pattern = r"^[\S\s]*\[GAUSS-(\d+)\][\S\s]+$"
        match = re.match(pattern, error_message)

        if match is not None:
            error_code = int(match.group(1))
        else:
            error_code = default_error_code

        # The lower bound is inclusive: GAUSS-50000 belongs to the GAUSS_500
        # family and therefore maps to 10, like every other 500xx code.
        if 50000 <= error_code < 60000:
            return error_code // 100 - 500 + 10
        else:
            return 9

    # Parameter errors.
    GAUSS_500 = {
        'GAUSS_50000': "[GAUSS-50000] : Unrecognized parameter: %s.",
        'GAUSS_50001': "[GAUSS-50001] : Incorrect parameter. Parameter '-%s' is required",
        'GAUSS_50011': "[GAUSS-50011] : The parameter[%s] value[%s] is invalid.",
        'GAUSS_50024': "[GAUSS-50024] : The parameter [%s] value is invalid.",
    }

    # Permission errors.
    GAUSS_501 = {
        'GAUSS_50100': "[GAUSS-50100] : The %s is not readable for %s.",
        'GAUSS_50104': "[GAUSS-50104] : Only a user with the root permission can run this script.",
        'GAUSS_50105': "[GAUSS-50105] : Cannot run this script as a user with the root permission.",
    }

    # File and directory errors.
    GAUSS_502 = {
        'GAUSS_50200': "[GAUSS-50200] : The %s already exists.",
        'GAUSS_50201': "[GAUSS-50201] : The %s does not exist.",
        'GAUSS_50203': "[GAUSS-50203] : The %s cannot be empty.",
        'GAUSS_50204': "[GAUSS-50204] : Failed to read %s.",
        'GAUSS_50205': "[GAUSS-50205] : Failed to write %s.",
        'GAUSS_50206': "[GAUSS-50206] : The %s is a symbolic link.",
        'GAUSS_50208': "[GAUSS-50208] : Failed to create %s.",
        'GAUSS_50209': "[GAUSS-50209] : Failed to access %s.",
        'GAUSS_50210': "[GAUSS-50210] : The %s is not a regular file.",
        'GAUSS_50211': "[GAUSS-50211] : The %s is not a directory.",
        'GAUSS_50213': "[GAUSS-50213] : Failed to parse %s.",
        'GAUSS_50216': "[GAUSS-50216] : Failed to distribute %s.",
        'GAUSS_50219': "[GAUSS-50219] : The %s is invalid.",
        'GAUSS_50230': "[GAUSS-50230] : Failed to read/write %s.",
    }

    # User errors.
    GAUSS_503 = {
        'GAUSS_50300': "[GAUSS-50300] : User %s does not exist.",
    }

    # Network errors.
    GAUSS_506 = {
        'GAUSS_50602': "[GAUSS-50602] : Failed to bind network adapters.",
        'GAUSS_50603': "[GAUSS-50603] : The IP address is invalid.",
    }

    # Task errors.
    GAUSS_508 = {
        'GAUSS_50801': "[GAUSS-50801] : Failed to set up tasks.",
    }

    # SSH trust errors.
    GAUSS_511 = {
        'GAUSS_51100': "[GAUSS-51100] : Failed to verify SSH trust on these nodes: %s.",
    }

    # XML configuration errors.
    GAUSS_512 = {
        'GAUSS_51200': "[GAUSS-51200] : The parameter [%s] in the XML file does not exist.",
        'GAUSS_51230': "[GAUSS-51230] : The number of %s must %s.",
    }

    # Command execution errors.
    GAUSS_514 = {
        'GAUSS_51400': "[GAUSS-51400] : Failed to execute command: %s",
    }

    # Cluster status and generic exception errors.
    GAUSS_516 = {
        'GAUSS_51600': "[GAUSS-51600] : Failed to check cluster status.",
        'GAUSS_51637': "[GAUSS-51637] : Data directory[%s] is conflicting.",
        'GAUSS_51638': "[GAUSS-51638] : Data directory[%s] is conflicting.",
        'GAUSS_51649': "[GAUSS-51649] : Capture exceptions '%s' : %s.",
        'GAUSS_51650': "[GAUSS-51650] : Unclassified exceptions: %s.",
    }

    # Environment variable errors.
    GAUSS_518 = {
        'GAUSS_51800': "[GAUSS-51800] : The environmental variable %s is empty. or variable has exceeded maximum length",
        'GAUSS_51802': "[GAUSS-51802] : The environmental variable %s is empty or invalid.",
    }

    # No messages are defined for this family.
    GAUSS_532 = {}

class OmError(BaseException):
    """
    Used to record OM exception information and support ErrorCode
    keywords as message information.

    The class derives from BaseException, so ``except Exception`` does not
    catch it.
    """

    # Matches a message that embeds an error tag such as "[GAUSS-50201]".
    # The single capture group is the tag content ("GAUSS-50201"); at least
    # one character must follow the closing bracket.
    _ERROR_CODE_PATTERN = re.compile(r"^[\S\s]*\[(GAUSS-\d+)\][\S\s]+$")

    def __init__(self, _message, *args, **kwargs):
        """
        Initialize the OmError instance.

        :param _message:    The input error message, it can be the error
                            message string, or the ErrorCode keywords,
                            or the Exception instance.
        :param args:        The additional unnamed parameters that use
                            to format the error message.
        :param kwargs:      The additional named parameters that use to format
                            the error message or extend to other
                            functions.

        :type _message:     str | BaseException
        :type args:         str | int
        :type kwargs:       str | int
        """
        # Store the error code.
        self._errorCode = ""

        if isinstance(_message, Exception) and \
                not self._ERROR_CODE_PATTERN.match(str(_message)):
            # We caught an unhandled exception whose text carries no GAUSS
            # code: report it as a captured exception, keeping its type name
            # and repr. (__getErrorMessage always assigns an error code, so
            # this case has to be detected before calling it.)
            self._errorCode = "GAUSS_51649"
            self._message = ErrorCode.GAUSS_516[self._errorCode] % (
                type(_message).__name__, repr(_message))
        elif isinstance(_message, Exception):
            # The exception text already contains a GAUSS code.
            self._message = self.__getErrorMessage(str(_message), args, kwargs)
        else:
            # Plain message string or ErrorCode template.
            self._message = self.__getErrorMessage(_message, args, kwargs)

        # Store the stack information.
        self._stackInfo = sys.exc_info()[2]

    @property
    def message(self):
        """
        Getter, get the error message.

        :return:    Return the error message.
        :rtype:     str
        """
        return self._message

    @property
    def errorCode(self):
        """
        Getter, get the error code.

        :return:    Return the error code.
        :rtype:     str
        """
        return self._errorCode

    def __getErrorMessage(self, _errorCode, args, kwargs):
        """
        Get error information through error code.

        Also stores the resolved code in ``self._errorCode``: the tag found
        in the text (for example ``"GAUSS-50201"``) or ``"GAUSS_51650"``
        when the text carries no tag.

        :param _errorCode:  Error code.
        :param args:        Additional parameters.
        :param kwargs:      Additional parameters.

        :type _errorCode:   str
        :type args:         tuple
        :type kwargs:       dict | None

        :return:    Return the error message.
        :rtype:     str
        """
        # Work on the text form so the string operations below are valid
        # for any input object.
        text = str(_errorCode)

        # Get base error information through error code.
        match = self._ERROR_CODE_PATTERN.match(text)
        if match:
            self._errorCode = match.group(1)
            message = text
        else:
            self._errorCode = "GAUSS_51650"
            message = ErrorCode.GAUSS_516[self._errorCode] % text

        # Format parameter which type is "%(param)s".
        if kwargs:
            for key, value in kwargs.items():
                replacement = str(value) if value is not None else "'None'"
                message = message.replace("%(" + key + ")s", replacement)

        # Format standard type parameters.
        if args:
            message %= tuple(
                "'None'" if arg is None else str(arg) for arg in args)

        return message

    def __str__(self):
        """
        Show this instance as a string.

        :return:    Return this instance as a string.
        :rtype:     str
        """
        return self.message

    def __repr__(self):
        """
        Show this instance as a string.

        :return:    Return this instance as a string.
        :rtype:     str
        """
        return self.__str__()


class InstallImpl:
    def __init__(self, install):
        self.cmpkg = install.cmpkg
        self.context = install
        self.envFile = install.envFile
        self.xmlFile = install.xmlFile
        self.cmDirs = install.cmDirs
        self.hostnames = install.hostnames
        self.gaussHome = install.gaussHome
        self.gaussLog = install.gaussLog
        self.toolPath = install.toolPath
        self.tmpPath = install.tmpPath
        self.localhostName = install.localhostName
        self.logger = install.logger
        self.clusterStopped = install.clusterStopped
        self.primaryTermAbnormal = install.primaryTermAbnormal
        self.restpkg = install.restpkg
        # 初始化 clusterInfo 用于主机名到 IP 的映射
        self.clusterInfo = None
        try:
            self.clusterInfo = dbClusterInfo()
            self.clusterInfo.initFromXml(self.xmlFile)
        except Exception:
            pass  # 如果初始化失败,继续使用原始主机名

    def executeCmdOnHost(self, host, cmd, isLocal = False):
        """
        Run a command for the given host via the module-level
        executeCmdOnHost helper.

        The local host is always run with the local flag set to True. For any
        other host, the first SSH IP of the node in self.clusterInfo whose
        name equals the host replaces the host name when one is available;
        lookup errors are ignored and the original name is used.

        :param host:    Host name the command is meant for.
        :param cmd:     Command text.
        :param isLocal: Passed through unchanged for non-local hosts.
        :return:        Whatever the module-level executeCmdOnHost returns.
        """
        if host == self.localhostName:
            return executeCmdOnHost(host, cmd, True)

        target = host
        try:
            if hasattr(self, 'clusterInfo') and self.clusterInfo:
                target = next(
                    (node.sshIps[0] for node in self.clusterInfo.dbNodes
                     if node.name == host and hasattr(node, 'sshIps') and node.sshIps),
                    host)
        except Exception:
            # Best-effort lookup: fall back to the original host name.
            pass
        return executeCmdOnHost(target, cmd, isLocal)

    def prepareCMPath(self):
        """
        create path: cmdir、cmdir/cm_server、cmdir/cm_agent
        """
        self.logger.log("Preparing CM path.")
        for (cmdir, host) in zip(self.cmDirs, self.hostnames):
            cmd = "mkdir -p {cmdir}/cm_server {cmdir}/cm_agent".format(cmdir=cmdir)
            status, output = self.executeCmdOnHost(host, cmd)
            if status != 0:
                self.logger.debug("Command: " + cmd)
                errorDetail = "\nStatus: %s\nOutput: %s" % (status, output)
                self.logger.logExit("Failed to create CM path." + errorDetail)

    def decompressCMPkg(self):
        self.logger.log("Decompressing CM pacakage.")
        if self.cmpkg == "":
            return
        # decompress cm pkg on localhost
        decompressCmd = "tar -zxf %s -C %s" % (self.cmpkg, self.gaussHome)
        status, output = subprocess.getstatusoutput(decompressCmd)
        if status != 0:
            self.logger.debug("Command: " + decompressCmd)
            errorDetail = "\nStatus: %s\nOutput: %s" % (status, output)
            self.logger.logExit("Failed to decompress cm pacakage to on localhost." + errorDetail)

        # If the version of CM pacakage is inconsistent with that of gaussdb,
        # then exit. So no need to send CM pacakage to other nodes.
        # self.checkCMPkgVersion()

        # decompress cmpkg on other hosts
        cmpkgName = os.path.basename(self.cmpkg)
        for host in self.hostnames:
            if host == self.localhostName:
                continue
            # copy cm pacakage to other hosts
            # 尝试从 clusterInfo 获取对应的 IP 地址
            ssh_ip = host
            try:
                if hasattr(self, 'clusterInfo') and self.clusterInfo:
                    for dbNode in self.clusterInfo.dbNodes:
                        if dbNode.name == host and hasattr(dbNode, 'sshIps') and dbNode.sshIps:
                            ssh_ip = dbNode.sshIps[0]
                            break
            except Exception:
                pass  # 如果获取失败,继续使用原始主机名
            
            if ":" in ssh_ip:
                ssh_ip = "[" + ssh_ip + "]"
            scpCmd = "scp %s %s:%s" % (self.cmpkg, ssh_ip, self.toolPath)
            status, output = subprocess.getstatusoutput(scpCmd)
            if status != 0:
                self.logger.debug("Command: " + scpCmd)
                errorDetail = "\nStatus: %s\nOutput: %s" % (status, output)
                self.logger.logExit(("Failed to send cm pacakage to %s." % ssh_ip) + errorDetail)
            pkgPath = os.path.join(self.toolPath, cmpkgName)
            decompressCmd = "tar -zxf %s -C %s" % (pkgPath, self.gaussHome)
            status, output = self.executeCmdOnHost(host, decompressCmd)
            if status != 0:
                self.logger.debug("Command: " + decompressCmd)
                errorDetail = "\nStatus: %s\nOutput: %s" % (status, output)
                self.logger.logExit(("Failed to decompress cm pacakage to on host %s." % host) + errorDetail)

    def checkCMPkgVersion(self):
        """
        Read the CM version reported by ``cm_ctl -V``.

        The command runs after sourcing self.envFile. A non-zero status, or
        output without a ``CM <version> build`` fragment, is reported through
        self.logger.logExit.
        """
        versionCmd = "source %s; cm_ctl -V" % self.envFile
        status, output = subprocess.getstatusoutput(versionCmd)
        if status != 0:
            self.logger.logExit("Failed to get CM pacakage version.")
        found = re.findall(r'.*CM (\d.*\d) build', output)
        if not found:
            self.logger.logExit("Failed to get CM pacakage version.")
        # NOTE: the comparison against the gaussdb version is currently
        # disabled, so the parsed value is not used any further.
        cmVersion = found[0]

    def createManualStartFile(self):
        self.logger.log("Creating cluster_manual_start file.")
        cmd = """
            if [ ! -f {gaussHome}/bin/cluster_manual_start ]; then
                touch {gaussHome}/bin/cluster_manual_start
            fi
            """.format(gaussHome=self.gaussHome)
        for host in self.hostnames:
            status, output = self.executeCmdOnHost(host, cmd)
            if status != 0:
                self.logger.debug("Command: " + cmd)
                errorDetail = "\nStatus: %s\nOutput: %s" % (status, output)
                self.logger.logExit("Failed to create cluster_manual_start file." + errorDetail)

    def initCMServer(self):
        self.logger.log("Initializing cm_server.")
        for (cmdir, host) in zip(self.cmDirs, self.hostnames):
            # 构建磁盘文件路径
            sharedisk_path = os.path.join(self.clusterInfo.wormPath, "sharedisk")
            votingdisk_path = os.path.join(self.clusterInfo.wormPath, "votingdisk")
            
            cmd = """
                cp {gaussHome}/share/config/cm_server.conf.sample {cmdir}/cm_server/cm_server.conf
                sed 's#log_dir = .*#log_dir = {gaussLog}/cm/cm_server#' {cmdir}/cm_server/cm_server.conf -i
                sed 's#ddb_type = .*#ddb_type = 2#' {cmdir}/cm_server/cm_server.conf -i
                sed 's#dn_arbitrate_mode = .*#dn_arbitrate_mode = share_disk#' {cmdir}/cm_server/cm_server.conf -i
                sed 's#share_disk_path = .*#share_disk_path = \'{sharedisk_path}\'#' {cmdir}/cm_server/cm_server.conf -i
                sed 's#voting_disk_path = .*#voting_disk_path = \'{votingdisk_path}\'#' {cmdir}/cm_server/cm_server.conf -i
                sed 's#disk_timeout = .*#disk_timeout = 6#' {cmdir}/cm_server/cm_server.conf -i
                """.format(gaussHome=self.gaussHome, gaussLog=self.gaussLog, cmdir=cmdir, 
                          sharedisk_path=sharedisk_path, votingdisk_path=votingdisk_path)
            status, output = self.executeCmdOnHost(host, cmd)
            if status != 0:
                self.logger.debug("Command: " + cmd)
                errorDetail = "\nStatus: %s\nOutput: %s" % (status, output)
                self.logger.logExit("Failed to initialize cm_server." + errorDetail)

    def initCMAgent(self):
        self.logger.log("Initializing cm_agent.")
        for (cmdir, host) in zip(self.cmDirs, self.hostnames):
            # 构建投票磁盘文件路径
            votingdisk_path = os.path.join(self.clusterInfo.wormPath, "votingdisk")
            
            cmd = """
                cp {gaussHome}/share/config/cm_agent.conf.sample {cmdir}/cm_agent/cm_agent.conf && 
                sed 's#log_dir = .*#log_dir = {gaussLog}/cm/cm_agent#' {cmdir}/cm_agent/cm_agent.conf -i && 
                sed 's#unix_socket_directory = .*#unix_socket_directory = {gaussHome}#' {cmdir}/cm_agent/cm_agent.conf -i &&
                sed 's#voting_disk_path = .*#voting_disk_path = \'{votingdisk_path}\'#' {cmdir}/cm_agent/cm_agent.conf -i &&
                sed 's#disk_timeout = .*#disk_timeout = 6#' {cmdir}/cm_agent/cm_agent.conf -i
                """.format(gaussHome=self.gaussHome, gaussLog=self.gaussLog, cmdir=cmdir, 
                          votingdisk_path=votingdisk_path)
            status, output = self.executeCmdOnHost(host, cmd)
            if status != 0:
                self.logger.debug("Command: " + cmd)
                errorDetail = "\nStatus: %s\nOutput: %s" % (status, output)
                self.logger.logExit("Failed to initialize cm_agent." + errorDetail)

    def AddGrResource(self):
        """
        Add gr Resource
        """
        self.logger.log("Add gr resource.")
        cmd = "source %s ; sh %s/gr_res.sh" % (self.envFile, self.toolPath)
        for host in self.hostnames:
            status, output = self.executeCmdOnHost(host, cmd)
            if status != 0:
                self.logger.debug("Command: " + cmd)
                errorDetail = "\nStatus: %s\nOutput: %s" % (status, output)
                self.logger.logExit(("Failed to add gr resource to %s." % host) + errorDetail)

    def setMonitorCrontab(self):
        """
        Install the om_monitor crontab entry on every host and start
        om_monitor.

        Steps, in order:
          1. Dump the current local crontab into a temporary file under
             self.tmpPath (a "no crontab" answer with status 1 is accepted).
          2. Delete any existing om_monitor line from that file.
          3. Append a once-per-minute entry that sources the profile files
             (and self.envFile unless it is ~/.bashrc) and launches
             om_monitor with its log directory under <gaussLog>/cm.
          4. For every other host: copy the file with scp, load it with
             crontab, remove the remote copy, then kill any om_monitor of
             the current user and start a new one.
          5. Do the same load/kill/start locally and remove the local
             temporary file.

        Failures are reported through self.logger.logExit.
        """
        self.logger.log("Setting om_monitor crontab.")
        # save old crontab content to cronContentTmpFile
        cronContentTmpFile = os.path.join(self.tmpPath, "cronContentTmpFile_" + str(os.getpid()))
        listCronCmd = "crontab -l > %s" % cronContentTmpFile
        status, output = self.executeCmdOnHost(self.localhostName, listCronCmd)
        # An output containing "no crontab" together with status 1 is not
        # treated as an error.
        is_no_crontab = ("no crontab" in output.lower() and status == 1)
        if status != 0 and not is_no_crontab:
            self.logger.debug("Command: " + listCronCmd)
            errorDetail = "\nStatus: %s\nOutput: %s" % (status, output)
            self.logger.logExit(ErrorCode.GAUSS_508["GAUSS_50804"] + errorDetail)
        # if old crontab content contains om_monitor, clear it
        clearMonitorCmd = "sed '/.*om_monitor.*/d' %s -i" % cronContentTmpFile
        status, output = subprocess.getstatusoutput(clearMonitorCmd)
        if status != 0:
            os.remove(cronContentTmpFile)
            self.logger.debug("Command: " + clearMonitorCmd)
            errorDetail = "\nStatus: %s\nOutput: %s" % (status, output)
            self.logger.logExit("Failed to clear old om_monitor crontab." + errorDetail)

        # generate om_monitor crontab command and append it to cronContentTmpFile
        startMonitorCmd = "source /etc/profile;(if [ -f ~/.profile ];" \
                      "then source ~/.profile;fi);source ~/.bashrc;"
        if self.envFile != "~/.bashrc":
            startMonitorCmd += "source %s; " % (self.envFile)
        monitorLogPath = os.path.join(self.gaussLog, "cm")
        if not os.path.exists(monitorLogPath):
            os.makedirs(monitorLogPath)
        startMonitorCmd += "nohup om_monitor -L %s/om_monitor >>/dev/null 2>&1 &" % monitorLogPath
        # "*/1 * * * *" schedules the entry every minute.
        monitorCron = "*/1 * * * * " + startMonitorCmd + os.linesep
        with open(cronContentTmpFile, 'a+', encoding='utf-8') as fp:
            fp.writelines(monitorCron)
            fp.flush()

        # set crontab on other hosts
        setCronCmd = "crontab %s" % cronContentTmpFile
        cleanTmpFileCmd = "rm %s -f" % cronContentTmpFile
        username = getpass.getuser()
        killMonitorCmd = "pkill om_monitor -u %s; " % username
        for host in self.hostnames:
            if host == self.localhostName:
                continue
            # copy cronContentTmpFile to other host
            # Try to get the matching IP address from clusterInfo.
            ssh_ip = host
            try:
                if hasattr(self, 'clusterInfo') and self.clusterInfo:
                    for dbNode in self.clusterInfo.dbNodes:
                        if dbNode.name == host and hasattr(dbNode, 'sshIps') and dbNode.sshIps:
                            ssh_ip = dbNode.sshIps[0]
                            break
            except Exception:
                pass  # If the lookup fails, keep using the original host name.

            # An address containing ":" is wrapped in brackets for scp.
            if ":" in ssh_ip:
                ssh_ip = "[" + ssh_ip + "]"
            scpCmd = "scp %s %s:%s" % (cronContentTmpFile, ssh_ip, self.tmpPath)
            status, output = subprocess.getstatusoutput(scpCmd)
            if status != 0:
                self.logger.debug("Command: " + scpCmd)
                errorDetail = "\nStatus: %s\nOutput: %s" % (status, output)
                self.logger.logExit(("Failed to copy cronContentTmpFile to %s." % ssh_ip) + errorDetail)
            # set om_monitor crontab
            status, output = self.executeCmdOnHost(host, setCronCmd)
            # cleanup cronContentTmpFile
            # (done before the status check so the remote copy is removed
            # even when loading the crontab failed)
            self.executeCmdOnHost(host, cleanTmpFileCmd)
            if status != 0:
                self.logger.debug("Command: " + setCronCmd)
                errorDetail = "\nStatus: %s\nOutput: %s" % (status, output)
                self.logger.logExit(ErrorCode.GAUSS_508["GAUSS_50801"] + errorDetail)

            # start om_monitor
            # Firstly, kill residual om_monitor, otherwise cm_agent won't be started if there are residual om_monitor process.
            status, output = self.executeCmdOnHost(host, killMonitorCmd + startMonitorCmd)
            if status != 0:
                self.logger.debug("Command: " + startMonitorCmd)
                errorDetail = "\nStatus: %s\nOutput: %s" % (status, output)
                self.logger.logExit((ErrorCode.GAUSS_516["GAUSS_51607"] % "om_monitor") + errorDetail)

        # set crontab on localhost
        status, output = subprocess.getstatusoutput(setCronCmd)
        # The local temporary file is removed before the status is checked.
        os.remove(cronContentTmpFile)
        if status != 0:
            self.logger.debug("Command: " + setCronCmd)
            errorDetail = "\nStatus: %s\nOutput: %s" % (status, output)
            self.logger.logExit(ErrorCode.GAUSS_508["GAUSS_50801"] + errorDetail)

        status, output = subprocess.getstatusoutput(killMonitorCmd + startMonitorCmd)
        if status != 0:
            self.logger.debug("Command: " + startMonitorCmd)
            errorDetail = "\nStatus: %s\nOutput: %s" % (status, output)
            self.logger.logExit((ErrorCode.GAUSS_516["GAUSS_51607"] % "om_monitor") + errorDetail)

    def startCluster(self):
        """
        Start the cluster with cm_ctl and log its status.

        Runs ``cm_ctl start`` and then ``cm_ctl query -Cv``, each after
        sourcing self.envFile. A non-zero status is reported through
        self.logger.logExit. The query output is logged, followed by a
        success line and, when self.primaryTermAbnormal is set, a warning
        advising a database restart.
        """
        self.logger.log("Starting cluster.")
        steps = (
            ("source %s; cm_ctl start" % self.envFile, "Failed to start cluster."),
            ("source %s; cm_ctl query -Cv" % self.envFile, "Failed to query cluster status."),
        )
        # NOTE: refreshing the dynamic file between the two steps is
        # currently disabled.
        for command, failure in steps:
            status, output = subprocess.getstatusoutput(command)
            if status != 0:
                self.logger.debug("Command: " + command)
                detail = "\nStatus: %s\nOutput: %s" % (status, output)
                self.logger.logExit(failure + detail)

        # At this point "output" holds the result of the query command.
        self.logger.log(output)
        self.logger.log("Install CM tool success.")
        if not self.primaryTermAbnormal:
            return
        self.logger.warn("Term of primary is invalid or not maximal.\n"
            "Hint: To avoid CM arbitration anomalies in this situation, "
            "please restart the database.\n"
            "Command : cm_ctl stop && cm_ctl start")

    @staticmethod
    def refreshStaticFile(envFile, xmlFile):
        """
        Regenerate and distribute the configuration from the XML file.

        Runs ``gr_om -t generateconf -X <xmlFile> --distribute`` after
        sourcing envFile.

        :return: (status, errorDetail); errorDetail is empty on success and
                 otherwise lists the command, status and output.
        """
        template = """
            source {envFile};
            gr_om -t generateconf -X {xmlFile} --distribute
            """
        cmd = template.format(envFile=envFile, xmlFile=xmlFile)
        status, output = subprocess.getstatusoutput(cmd)
        if status == 0:
            return status, ""
        return status, "\nCommand: %s\nStatus: %s\nOutput: %s" % (cmd, status, output)

    @staticmethod
    def refreshDynamicFile(envFile):
        """
        Run ``gr_om -t refreshconf`` after sourcing envFile.

        :return: (status, errorDetail); errorDetail is empty on success and
                 otherwise lists the command, status and output.
        """
        command = "source %s; gr_om -t refreshconf" % envFile
        status, output = subprocess.getstatusoutput(command)
        if status == 0:
            return status, ""
        return status, "\nCommand: %s\nStatus: %s\nOutput: %s" % (command, status, output)

    @staticmethod
    def checkPassword(passwordCA):
        minPasswordLen = 8
        maxPasswordLen = 15
        kinds = [0, 0, 0, 0]
        specLetters = "~!@#$%^&*()-_=+\\|[{}];:,<.>/?"
        if len(passwordCA) < minPasswordLen:
            print("Invalid password, it must contain at least eight characters.")
            return False
        if len(passwordCA) > maxPasswordLen:
            print("Invalid password, it must contain at most fifteen characters.")
            return False
        for c in passwordCA:
            if isdigit(c):
                kinds[0] += 1
            elif isupper(c):
                kinds[1] += 1
            elif islower(c):
                kinds[2] += 1
            elif c in specLetters:
                kinds[3] += 1
            else:
                print("The password contains illegal character: %s." % c)
                return False
        kindsNum = 0
        for k in kinds:
            if k > 0:
                kindsNum += 1
        if kindsNum < 3:
            print("The password must contain at least three kinds of characters.")
            return False
        return True

    def _getPassword(self):
        """
        Prompt for the CA certificate password, with confirmation.

        The user gets three attempts; an attempt fails when the two entries
        differ or when InstallImpl.checkPassword rejects the password. After
        the third failure self.logger.logExit is called.

        :return: The last password entered at the first prompt.
        """
        passwordCA = ""
        failures = 0
        while failures < 3:
            passwordCA = getpass.getpass("Please input the password for ca cert:")
            confirmation = getpass.getpass("Please input the password for ca cert again:")
            if passwordCA == confirmation:
                if InstallImpl.checkPassword(passwordCA):
                    break
            else:
                self.logger.printMessage("The password enterd twice do not match.")
            failures += 1
        if failures == 3:
            self.logger.logExit("Maximum number of attempts has been reached.")
        return passwordCA

    def _createCMSslConf(self, certPath):
        """
        Generate config file.
        """
        self.logger.debug("OPENSSL: Create config file.")
        v3CaL = [
            "[ v3_ca ]",
            "subjectKeyIdentifier=hash",
            "authorityKeyIdentifier=keyid:always,issuer:always",
            "basicConstraints = CA:true",
            "keyUsage = keyCertSign,cRLSign",
        ]
        v3Ca = os.linesep.join(v3CaL)

        # Create config file.
        with open(os.path.join(certPath, "openssl.cnf"), "w") as fp:
            # Write config item of Signature
            fp.write(v3Ca)
        self.logger.debug("OPENSSL: Successfully create config file.")

    def _cleanUselessFile(self):
        """
        Clean useless files
        :return: NA
        """
        certPath = os.path.join(self.gaussHome, "share/sslcert/cm")
        keyFiles = ["cacert.pem", "server.crt", "server.key", "client.crt", "client.key",
            "server.key.cipher", "server.key.rand", "client.key.cipher", "client.key.rand"]
        for fileName in os.listdir(certPath):
            filePath = os.path.join(certPath, fileName)
            if fileName not in keyFiles:
                os.remove(filePath)

    def _createCMCALocal(self):
        """
        Generate the CM CA, server and client certificates on the local host.

        Steps, in order:
          1. Point OPENSSL_CONF at /etc/pki/tls/openssl.cnf and recreate
             <gaussHome>/share/sslcert/cm.
          2. Write the local openssl.cnf and ask the user for the password.
          3. Create cakey.pem and cacert.pem.
          4. For "server" and "client": create the key, the CSR and the
             signed certificate, then remove the CSR.
          5. For "server" and "client": run ``cm_ctl encrypt`` through expect
             and check that the .key.rand and .key.cipher files exist.
          6. Make the files read-only and remove the helper files.

        The password is handed to openssl and expect on standard input via
        a shell-quoted ``printf``, never by interpolating it into shell or
        Tcl double-quoted text, so characters such as ``$``, ``\\`` or ``[``
        (all accepted by checkPassword) reach the tools unchanged.

        Failures are reported through self.logger.logExit.
        """
        # Function-scope import keeps this method self-contained.
        import shlex

        self.logger.debug("Creating Cm ca files locally.")

        os.environ['OPENSSL_CONF'] = '/etc/pki/tls/openssl.cnf'
        certPath = os.path.join(self.gaussHome, "share/sslcert/cm")
        mkdirCmd = f"rm -rf {certPath}; mkdir -p {certPath}"
        status, output = subprocess.getstatusoutput(mkdirCmd)
        if status != 0:
            self.logger.debug(f"Command: {mkdirCmd}\nStatus: {status}\nOutput: {output}")
            self.logger.logExit("Failed to create cert path.")
        self._createCMSslConf(certPath)
        passwd = self._getPassword()

        activePeriod = "10950"
        opensslConf = os.path.join(certPath, "openssl.cnf")
        if not os.path.isfile(opensslConf):
            self.logger.logExit("CM ssl conf does not exist.")

        # Shell prefix that writes the password plus a newline to the stdin
        # of the following command. printf '%s\n' does not interpret the
        # password, and shlex.quote protects it from the shell.
        feedPasswd = "printf '%s\\n' " + shlex.quote(passwd) + " | "

        def runOrExit(cmd, failure):
            """Run a shell command; report failure plus output via logExit."""
            cmdStatus, cmdOutput = subprocess.getstatusoutput(cmd)
            if cmdStatus != 0:
                self.logger.logExit(failure + cmdOutput)
            return cmdOutput

        # Generate cakey.pem.
        runOrExit(
            f'{feedPasswd}openssl genrsa -aes256 -f4 -passout stdin -out {certPath}/cakey.pem 2048',
            "Failed to generate cakey.pem.\n")

        # Generate cacert.pem.
        runOrExit(
            f'{feedPasswd}openssl req -new -x509 -passin stdin -days {activePeriod} '
            f'-key {certPath}/cakey.pem -out {certPath}/cacert.pem '
            f'-subj "/C=CN/ST=NULL/L=NULL/O=NULL/OU=NULL/CN=CA"',
            "Failed to generate cacert.pem.\n")

        for role in ("server", "client"):
            # Generate the private key.
            runOrExit(
                f'{feedPasswd}openssl genrsa -aes256 -passout stdin -out {certPath}/{role}.key 2048',
                f"Failed to generate {role}.key.\n")
            # Generate the certificate signing request.
            runOrExit(
                f'{feedPasswd}openssl req -new -key {certPath}/{role}.key -passin stdin '
                f'-out {certPath}/{role}.csr -subj "/C=CN/ST=NULL/L=NULL/O=NULL/OU=NULL/CN={role}"',
                f"Failed to generate {role}.csr.\n")
            # Sign the certificate with the CA.
            runOrExit(
                f'{feedPasswd}openssl x509 -req -days {activePeriod} -in {certPath}/{role}.csr '
                f'-CA {certPath}/cacert.pem -CAkey {certPath}/cakey.pem -passin stdin '
                f'-CAcreateserial -out {certPath}/{role}.crt -extfile {certPath}/openssl.cnf',
                f"Failed to generate {role}.crt.\n")
            # The CSR is no longer needed.
            subprocess.getstatusoutput(f'rm -f {certPath}/{role}.csr')

        for role in ("server", "client"):
            # Generate the cipher and rand files. expect reads the password
            # from its own stdin into a Tcl variable; "send --" keeps a
            # password starting with "-" from being parsed as a flag.
            encryptCmd = (
                f'{feedPasswd}expect -c \'gets stdin passwd; '
                f'spawn cm_ctl encrypt -M {role} -D {certPath}; '
                f'expect "*password*" {{ send -- "$passwd\\r"; exp_continue }}\''
            )
            output = runOrExit(encryptCmd, f"Failed to encrypt {role} key.\n")

            # Verify that the generated files exist.
            for suffix in ("rand", "cipher"):
                if not os.path.exists(os.path.join(certPath, f"{role}.key.{suffix}")):
                    self.logger.logExit(f"Failed to generate {role}.key.{suffix}.\n" + output)

        # Drop the local references to the password.
        del passwd, feedPasswd

        # Make the certificate files read-only.
        runOrExit(f'chmod 400 {certPath}/*', "Failed to set readonly for cert files.\n")

        self._cleanUselessFile()

    # Create the REST API certificates.
    def _createRestCALocal(self):
        """
        Build the REST API certificate stores from the CM certificates.

        Does nothing when self.restpkg is empty. Otherwise:
          1. Recreate <gaussHome>/share/sslcert/restapi and ask the user for
             the password (it must be able to decrypt the copied CM keys).
          2. Copy the *.key, *.crt and *.pem files from the CM cert path.
          3. For "server" and "client": decrypt the key, convert it to
             PKCS8, export <role>.p12 (alias rest-<role>, empty export
             password) and remove the temporary key files.
          4. Import cacert.pem into truststore.jks with keytool.
          5. Write the truststore password into bin/rest_ssl.properties on
             every host.
          6. Set the permissions of the resulting files.

        The password is passed to openssl on standard input and is
        shell-quoted wherever it has to appear in a command, so shell
        metacharacters accepted by checkPassword cannot alter the commands.
        The temporary unencrypted keys are removed even when a step fails.

        Failures are reported through self.logger.logExit.
        """
        # Function-scope import keeps this method self-contained.
        import shlex

        if not self.restpkg:
            return
        self.logger.debug("Creating rest ca files locally.")

        if 'OPENSSL_CONF' in os.environ:
            del os.environ['OPENSSL_CONF']
        cmCertPath = os.path.join(self.gaussHome, "share/sslcert/cm")
        certPath = os.path.join(self.gaussHome, "share/sslcert/restapi")
        mkdirCmd = f"rm -rf {certPath}; mkdir -p {certPath}"
        status, output = subprocess.getstatusoutput(mkdirCmd)
        if status != 0:
            self.logger.debug(f"Command: {mkdirCmd}\nStatus: {status}\nOutput: {output}")
            self.logger.logExit("Failed to create cert path.")
        passwd = self._getPassword()

        # Shell prefix that writes the password plus a newline to the stdin
        # of the following command without letting the shell interpret it.
        feedPasswd = "printf '%s\\n' " + shlex.quote(passwd) + " | "

        # Copy the CM keys and certificates into the REST cert path.
        cmd = f"cp {cmCertPath}/*.key {certPath}; cp {cmCertPath}/*.crt {certPath}; cp {cmCertPath}/*.pem {certPath};"
        status, output = subprocess.getstatusoutput(cmd)
        if status != 0:
            self.logger.debug(f"Command: {cmd}\nStatus: {status}\nOutput: {output}")
            self.logger.logExit("Failed to copy rest cert.")

        # Convert the server and client certificates to PKCS12 format.
        for role in ("server", "client"):
            decryptedKey = f"{certPath}/{role}-decrypted.key"
            pkcs8Key = f"{certPath}/{role}-pkcs8.key"
            try:
                # Decrypt the private key.
                decrypt_cmd = f'{feedPasswd}openssl rsa -in {certPath}/{role}.key -passin stdin -out {decryptedKey}'
                status, output = subprocess.getstatusoutput(decrypt_cmd)
                if status != 0:
                    self.logger.logExit(f"Failed to decrypt {role}.key.\n" + output)

                # Convert it to PKCS8 format.
                pkcs8_cmd = f'openssl pkcs8 -topk8 -in {decryptedKey} -out {pkcs8Key} -nocrypt'
                status, output = subprocess.getstatusoutput(pkcs8_cmd)
                if status != 0:
                    self.logger.logExit(f"Failed to convert {role} key to PKCS8 format.\n" + output)

                # Create the PKCS12 file (alias rest-<role>).
                pkcs12_cmd = (
                    f'openssl pkcs12 -export -in {certPath}/{role}.crt -inkey {pkcs8Key} '
                    f'-certfile {certPath}/cacert.pem -name "rest-{role}" '
                    f'-out {certPath}/{role}.p12 -passout pass:'
                )
                status, output = subprocess.getstatusoutput(pkcs12_cmd)
                if status != 0:
                    self.logger.logExit(f"Failed to create {role}.p12.\n" + output)
            finally:
                # Never leave the unencrypted key material behind, not even
                # when one of the steps above failed.
                subprocess.getstatusoutput(f'rm -f {decryptedKey} {pkcs8Key}')

        # Create the JKS trust store.
        truststore_cmd = (
            f'keytool -import -trustcacerts -alias ca -file {certPath}/cacert.pem '
            f'-keystore {certPath}/truststore.jks -storepass {shlex.quote(passwd)} -noprompt'
        )
        status, output = subprocess.getstatusoutput(truststore_cmd)
        if status != 0:
            self.logger.logExit("Failed to create truststore.jks.\n" + output)

        rest_ssl_conf = os.path.join(self.gaussHome, "bin/rest_ssl.properties")
        # Escape the characters that are special in a sed replacement that
        # uses "#" as delimiter.
        escaped_passwd = passwd.replace('\\', '\\\\').replace('#', '\\#').replace('&', '\\&')
        sed_script = f"s#^server\\.ssl\\.trust-store-password=.*#server.ssl.trust-store-password={escaped_passwd}#"
        sed_cmd = f"sed -i {shlex.quote(sed_script)} {rest_ssl_conf}"

        for host in self.hostnames:
            if host == self.localhostName:
                status, output = subprocess.getstatusoutput(sed_cmd)
            else:
                status, output = self.executeCmdOnHost(host, sed_cmd)
            if status != 0:
                self.logger.logExit(f"Failed to update rest_ssl.properties on {host}.\n" + output)
            self.logger.log(f"Successfully updated {rest_ssl_conf} with truststore password on {host}.")

        # Drop the local references to the password.
        del passwd, feedPasswd, escaped_passwd, sed_script, sed_cmd, truststore_cmd

        # Restrict the stores to the owner; certificates stay world-readable.
        chmod_cmd = f'chmod 600 {certPath}/server.p12 {certPath}/client.p12 {certPath}/truststore.jks && chmod 644 {certPath}/server.crt {certPath}/client.crt {certPath}/cacert.pem'
        status, output = subprocess.getstatusoutput(chmod_cmd)
        if status != 0:
            self.logger.logExit("Failed to change cert files permission.\n" + output)

    def _distributeCA(self):
        """
        Copy the CM CA files to every host in self.hostnames except the
        local one.

        For each remote host the directory $GAUSSHOME/share/sslcert/cm is
        removed and recreated with mode 700 through self.executeCmdOnHost,
        then the local files in that directory are copied over with scp.
        Any failing command is reported through self.logger.logExit.
        """
        self.logger.debug("Distributing CM ca files to other hosts.")
        certPath = os.path.join(self.gaussHome, "share/sslcert/cm")
        createCertPathCmd = "rm {certPath} -rf; mkdir -p {certPath}; chmod 700 {certPath}".format(
            certPath=certPath)
        for host in self.hostnames:
            if host == self.localhostName:
                continue
            status, output = self.executeCmdOnHost(host, createCertPathCmd)
            if status != 0:
                errorDetail = "\nCommand: %s\nStatus: %s\nOutput: %s" % (createCertPathCmd, status, output)
                self.logger.debug(errorDetail)
                self.logger.logExit("Failed to create path of CA for CM on host %s." % host)

            # Prefer the first ssh IP recorded for this node in clusterInfo;
            # the lookup is best effort and falls back to the host name.
            ssh_ip = host
            try:
                clusterInfo = getattr(self, 'clusterInfo', None)
                if clusterInfo:
                    for dbNode in clusterInfo.dbNodes:
                        if dbNode.name == host and getattr(dbNode, 'sshIps', None):
                            ssh_ip = dbNode.sshIps[0]
                            break
            except Exception as e:
                # Keep going with the host name, but leave a trace of why
                # the lookup did not work instead of hiding it.
                self.logger.debug("Failed to get ssh ip of host %s, use host name instead. "
                                  "Error: %s" % (host, str(e)))

            # A ':' marks an IPv6 literal, which scp needs in square brackets.
            formatted_host = "[{}]".format(ssh_ip) if ":" in ssh_ip else ssh_ip

            scpCmd = "scp {certPath}/* {host}:{certPath}".format(certPath=certPath, host=formatted_host)
            status, output = subprocess.getstatusoutput(scpCmd)
            if status != 0:
                errorDetail = "\nCommand: %s\nStatus: %s\nOutput: %s" % (scpCmd, status, output)
                self.logger.debug(errorDetail)
                self.logger.logExit("Failed to create CA for CM.")

    def _distributeRestCA(self):
        """
        Copy the RestAPI CA files, and the rest package when it exists, to
        the cluster hosts.

        Step 1: on every remote host, recreate
        $GAUSSHOME/share/sslcert/restapi (mode 700) through
        self.executeCmdOnHost and scp the local files of that directory.
        Step 2: when self.restpkg exists on the local file system, place it
        in $GAUSSHOME/bin on every host (cp locally, scp remotely).
        Any failing command is reported through self.logger.logExit.
        """
        self.logger.debug("Distributing Rest ca files to other hosts.")
        certPath = os.path.join(self.gaussHome, "share/sslcert/restapi")
        createCertPathCmd = "rm {certPath} -rf; mkdir -p {certPath}; chmod 700 {certPath}".format(
            certPath=certPath)
        for host in self.hostnames:
            if host == self.localhostName:
                continue
            status, output = self.executeCmdOnHost(host, createCertPathCmd)
            if status != 0:
                errorDetail = "\nCommand: %s\nStatus: %s\nOutput: %s" % (createCertPathCmd, status, output)
                self.logger.debug(errorDetail)
                self.logger.logExit("Failed to create path of CA for RestAPI on host %s." % host)

            scpCmd = "scp {certPath}/* {host}:{certPath}".format(
                certPath=certPath, host=self._resolveRestScpHost(host))
            status, output = subprocess.getstatusoutput(scpCmd)
            if status != 0:
                errorDetail = "\nCommand: %s\nStatus: %s\nOutput: %s" % (scpCmd, status, output)
                self.logger.debug(errorDetail)
                self.logger.logExit("Failed to create CA for RestAPI.")

        if not os.path.exists(self.restpkg):
            return

        target_path = os.path.join(self.gaussHome, "bin")
        for host in self.hostnames:
            if host == self.localhostName:
                # The local node only needs a plain copy.
                cmd = "cp {restpkg} {target_path}/".format(
                    restpkg=self.restpkg, target_path=target_path)
                status, output = subprocess.getstatusoutput(cmd)
                if status != 0:
                    errorDetail = "\nCommand: %s\nStatus: %s\nOutput: %s" % (cmd, status, output)
                    self.logger.debug(errorDetail)
                    self.logger.logExit("Failed to distribute restpkg.")
                continue

            # Ship the package to the bin directory of the remote node.
            scpCmd = "scp {restpkg} {host}:{target_path}/".format(
                restpkg=self.restpkg, host=self._resolveRestScpHost(host),
                target_path=target_path)
            status, output = subprocess.getstatusoutput(scpCmd)
            if status != 0:
                errorDetail = "\nCommand: %s\nStatus: %s\nOutput: %s" % (scpCmd, status, output)
                self.logger.debug(errorDetail)
                self.logger.logExit("Failed to distribute restpkg to %s." % host)

        self.logger.log("Successfully distribute restpkg to all nodes.")

    def _resolveRestScpHost(self, host):
        """
        Return the scp destination to use for ``host``.

        The first ssh IP recorded for the node in self.clusterInfo is
        preferred when that attribute is present; otherwise the host name is
        used unchanged. A value containing ':' is treated as an IPv6 literal
        and wrapped in square brackets.
        """
        ssh_ip = host
        try:
            clusterInfo = getattr(self, 'clusterInfo', None)
            if clusterInfo:
                for dbNode in clusterInfo.dbNodes:
                    if dbNode.name == host and getattr(dbNode, 'sshIps', None):
                        ssh_ip = dbNode.sshIps[0]
                        break
        except Exception as e:
            # Best-effort lookup: fall back to the host name, but do not
            # swallow interrupts and do not hide the reason.
            self.logger.debug("Failed to get ssh ip of host %s, use host name instead. "
                              "Error: %s" % (host, str(e)))
        if ":" in ssh_ip:
            return "[{}]".format(ssh_ip)
        return ssh_ip

    def createCMCA(self):
        """Create the CM CA files locally, then distribute them to the other hosts."""
        self.logger.log("Creating CM ca files.")
        for step in (self._createCMCALocal, self._distributeCA):
            step()

    def createRestCA(self):
        """Create and distribute the RestAPI CA files; do nothing when self.restpkg is empty."""
        if self.restpkg:
            self._createRestCALocal()
            self._distributeRestCA()

    def run(self):
        """Log the start of the CM installation and execute every step in a fixed order."""
        self.logger.log("Start to install cm tool.")
        steps = (
            self.prepareCMPath,
            self.decompressCMPkg,
            self.createManualStartFile,
            self.initCMServer,
            self.initCMAgent,
            self.createCMCA,
            self.createRestCA,
            self.setMonitorCrontab,
            self.AddGrResource,
            self.startCluster,
        )
        for step in steps:
            step()
class Install:
    """
    Command-line driver for deploying the CM tool onto a cluster.

    The instance gathers the command-line options, environment settings and
    per-node information, validates them, and finally hands itself to
    ``InstallImpl`` which performs the installation steps.
    """

    def __init__(self):
        self.envFile = ""
        self.xmlFile = ""
        self.gaussHome = ""
        self.gaussLog = ""
        self.toolPath = ""
        self.tmpPath = ""
        self.cmDirs = []
        self.hostnames = []
        self.localhostName = ""
        self.cmpkg = ""
        # node name -> {"dataPath": ..., "port": ...}
        self.nodesInfo = dict()
        self.clusterStopped = False
        self.maxTerm = 0
        self.primaryTermAbnormal = False
        self.primary = ""
        self.restpkg = ""

    def getLocalhostName(self):
        """Store the local host name in self.localhostName."""
        self.localhostName = socket.gethostname()

    def getEnvParams(self):
        """Read GAUSSHOME, GAUSSLOG, GPHOME and PGHOST from the env file."""
        self.gaussHome = getEnvParam(self.envFile, "GAUSSHOME")
        self.gaussLog = getEnvParam(self.envFile, "GAUSSLOG")
        self.toolPath = getEnvParam(self.envFile, "GPHOME")
        self.tmpPath = getEnvParam(self.envFile, "PGHOST")

    def checkExeUser(self):
        """Refuse to run when the effective caller is root (uid 0)."""
        if os.getuid() == 0:
            CMLog.exitWithError(ErrorCode.GAUSS_501["GAUSS_50105"])

    def usage(self):
        """
cm_install is a utility to deploy CM tool to openGauss database cluster.

Usage:
    cm_install -? | --help
    cm_install -X XMLFILE [-e envFile] --cmpkg=cmpkgPath
General options:
    -X                                 Path of the XML configuration file.
    -e                                 Path of env file.
                                       Default value "~/.bashrc".
    --cmpkg                            Path of CM pacakage.
    -?, --help                         Show help information for this
                                       utility, and exit the command line mode.
        """
        # The docstring above doubles as the help text shown to the user.
        print(self.usage.__doc__)

    def parseCommandLine(self):
        """
        Parse sys.argv into self.xmlFile, self.envFile and self.cmpkg.

        Prints the usage and exits with status 1 when no argument is given,
        and with status 0 when help is requested.
        """
        if len(sys.argv) == 1:
            self.usage()
            sys.exit(1)

        try:
            opts, _ = getopt.getopt(sys.argv[1:], "?X:e:", ["help", "cmpkg="])
        except getopt.GetoptError as e:
            CMLog.exitWithError(ErrorCode.GAUSS_500["GAUSS_50000"] % str(e))

        for opt, value in opts:
            if opt in ("-?", "--help"):
                self.usage()
                sys.exit(0)
            # Compare with ==: `opt in ("-X")` would be a substring test on
            # a plain string, not a tuple membership test.
            elif opt == "-X":
                self.xmlFile = value
            elif opt == "-e":
                self.envFile = value
            elif opt == "--cmpkg":
                self.cmpkg = value

    def checkParam(self):
        """
        Validate the parsed options.

        The XML file and the CM package are mandatory and must be existing
        regular files. The env file defaults to ~/.bashrc and is replaced by
        the value of MPPDB_ENV_SEPARATE_PATH when the env file defines it.
        """
        if self.xmlFile == "":
            CMLog.exitWithError(ErrorCode.GAUSS_500["GAUSS_50001"] % 'X' + ".")
        checkXMLFile(self.xmlFile)

        if self.cmpkg == "":
            CMLog.exitWithError(ErrorCode.GAUSS_500["GAUSS_50001"] % '-cmpkg' + ".")
        if not os.path.exists(self.cmpkg):
            CMLog.exitWithError(ErrorCode.GAUSS_502["GAUSS_50201"] % self.cmpkg)
        if not os.path.isfile(self.cmpkg):
            CMLog.exitWithError(ErrorCode.GAUSS_502["GAUSS_50210"] % ("cmpkg " + self.cmpkg))

        if self.envFile == "":
            self.envFile = os.path.join(os.environ['HOME'], ".bashrc")
        if not os.path.exists(self.envFile):
            CMLog.exitWithError(ErrorCode.GAUSS_502["GAUSS_50201"] % ("envFile " + self.envFile))
        if not os.path.isfile(self.envFile):
            CMLog.exitWithError(ErrorCode.GAUSS_502["GAUSS_50210"] % ("envFile " + self.envFile))
        mppdbEnv = getEnvParam(self.envFile, "MPPDB_ENV_SEPARATE_PATH")
        if mppdbEnv != "":
            self.envFile = mppdbEnv
        if self.envFile == "" or not os.path.exists(self.envFile) or not os.path.isfile(self.envFile):
            CMLog.exitWithError(ErrorCode.GAUSS_518["GAUSS_51802"] % 'MPPDB_ENV_SEPARATE_PATH' + ".")

    def checkOm(self):
        """
        check whether there is om tool
        """
        cmd = "source %s; gr_om --version" % self.envFile
        status, output = subprocess.getstatusoutput(cmd)
        if status != 0:
            errorDetail = "\nCommand: %s\nStatus: %s\nOutput: %s\n" % (
                cmd, status, output)
            self.logger.logExit("OM tool is required." + errorDetail)

    def checkXMLFileSecurity(self):
        """
        function : check XML contain DTDs
        input : String
        output : NA

        Raises an Exception when a line of the XML file contains
        "<!DOCTYPE" or "<!ENTITY"; errors while reading the file propagate
        to the caller.
        """
        with open(self.xmlFile, "r", encoding='utf-8') as fb:
            for line in fb:
                if "<!DOCTYPE" in line or "<!ENTITY" in line:
                    raise Exception("File have security risks.")

    def initParserXMLFile(self):
        """
        function : Init parser xml file
        input : String
        output : Object

        Returns the root element of self.xmlFile. Any failure of the
        security check or of the parser is re-raised as an Exception that
        carries the GAUSS_51236 message.
        """
        try:
            # check xml for security requirements
            self.checkXMLFileSecurity()
            dom_tree = ETree.parse(self.xmlFile)
            rootNode = dom_tree.getroot()
        except Exception as e:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51236"] + " Error: \n%s." % str(e))
        return rootNode

    def generateXml(self):
        """
        function : generate xml file
        input : String
        output : Object

        Runs "./gr_om generate-xml" on self.xmlFile after sourcing the env
        file; the command is resolved relative to the current directory.
        """
        cmd = "source %s; ./gr_om generate-xml %s" % (self.envFile, self.xmlFile)
        status, output = subprocess.getstatusoutput(cmd)
        if status != 0:
            self.logger.logExit((ErrorCode.GAUSS_514["GAUSS_51400"] % cmd) + f"\nStatus:{status}\nOutput:{output}")

    def getInfoListOfAllNodes(self):
        """
        get hostname and cmDir list of all nodes
        check other CM infos in xml
        TODO: check the consistence of xml and installed cluster.

        Fills self.hostnames and self.nodesInfo from the output of
        "./gr_om view", fills self.cmDirs from the XML file, and verifies
        that both sources list the same hosts and that the CM parameters in
        the XML file are present and valid.
        """
        self.localhostName = getLocalhostName()

        # get hostnames and port from static file
        cmd = "source %s; ./gr_om view %s" % (self.envFile, self.xmlFile)
        status, output = subprocess.getstatusoutput(cmd)
        if status != 0:
            self.logger.logExit((ErrorCode.GAUSS_514["GAUSS_51400"] % cmd) + \
                f"\nStatus:{status}\nOutput:{output}")
        # re.split always yields at least one element, so a single element
        # means that no "azName" separator was found in the output.
        nodesStaticInfoStr = re.split("azName.*:.*", output)
        if len(nodesStaticInfoStr) < 2:
            self.logger.logExit("CM is not supported in single instance.")
        nodesStaticInfo = nodesStaticInfoStr[1:]
        for nodeInfo in nodesStaticInfo:
            if nodeInfo == "":
                continue

            # Parse node name with error checking
            nodename_match = re.findall("nodeName:(.*)", nodeInfo)
            if not nodename_match:
                self.logger.logExit(f"Failed to parse nodeName from nodeInfo: {nodeInfo}")
            nodename = nodename_match[0]
            self.hostnames.append(nodename)

            # Parse data path with error checking
            dataPath_match = re.findall("datanodeLocalDataPath.*:(.*)", nodeInfo)
            if not dataPath_match:
                self.logger.logExit(f"Failed to parse datanodeLocalDataPath from nodeInfo: {nodeInfo}")
            dataPath = dataPath_match[0]

            # Parse port with error checking
            port_match = re.findall("datanodePort.*:(.*)", nodeInfo)
            if not port_match:
                self.logger.logExit(f"Failed to parse datanodePort from nodeInfo: {nodeInfo}")
            port = port_match[0]

            self.nodesInfo[nodename] = {"dataPath": dataPath, "port": port}

        # get node info from XML
        hostnamesInXML = []
        rootNode = self.initParserXMLFile()
        elementName = 'DEVICELIST'
        deviceLists = rootNode.findall('DEVICELIST')
        if not deviceLists:
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51200"] % elementName)
        deviceNodes = deviceLists[0].findall('DEVICE')
        cmDict = {"cmsNum": "", "cmServerPortBase": "", "cmServerPortStandby": "",
             "cmServerlevel": "", "cmServerListenIp1": "", "cmServerRelation": ""}
        for dev in deviceNodes:
            for param in dev.findall('PARAM'):
                paraName = param.attrib['name']
                paraValue = param.attrib['value']
                if paraName == 'name':
                    hostnamesInXML.append(paraValue)
                elif paraName == 'cmDir':
                    self.cmDirs.append(paraValue)
                elif paraName == 'cmServerLevel':
                    # The XML spelling "cmServerLevel" is stored under the
                    # lower-case key used by cmDict.
                    cmDict['cmServerlevel'] = paraValue
                elif paraName in cmDict:
                    cmDict[paraName] = paraValue
        # check whether XML contains all nodes info
        if self.hostnames != hostnamesInXML:
            self.logger.logExit("XML info is not consistent with static file.")
        # check params in xml; cmServerPortStandby is the only optional one
        for item in cmDict:
            if item == 'cmServerPortStandby':
                continue
            if cmDict[item] == "":
                self.logger.logExit(ErrorCode.GAUSS_512["GAUSS_51200"] % item)
        if cmDict['cmsNum'] != '1':
            self.logger.logExit(ErrorCode.GAUSS_500["GAUSS_50024"] % 'cmsNum')
        if cmDict['cmServerlevel'] != '1':
            self.logger.logExit(ErrorCode.GAUSS_500["GAUSS_50024"] % 'cmServerlevel')
        if not cmDict['cmServerPortBase'].isdigit():
            self.logger.logExit(ErrorCode.GAUSS_500["GAUSS_50024"] % 'cmServerPortBase')
        if cmDict['cmServerPortStandby'] != "" and not cmDict['cmServerPortStandby'].isdigit():
            self.logger.logExit(ErrorCode.GAUSS_500["GAUSS_50024"] % 'cmServerPortStandby')
        if len(self.hostnames) != len(self.cmDirs):
            self.logger.logExit("\"cmDir\" of all nodes must be provided.")

    def checkHostTrust(self):
        """Check SSH trust from the local host to every host in self.hostnames."""
        checkHostsTrust(self.hostnames)

    def initLogger(self):
        """
        Create <GAUSSLOG>/cm/cm_tool when needed and attach a CMLog to
        self.logger. self.gaussLog must already be set (see getEnvParams).
        """
        logPath = os.path.join(self.gaussLog, "cm", "cm_tool")
        os.makedirs(logPath, exist_ok=True)
        self.logger = CMLog(logPath, "cm_install", "cm_install")

    def checkCM(self):
        """
        Check whether there is CM in current cluster.
        """
        checkCMExistCmd = "source %s; cm_ctl query -Cv | " \
            "grep 'CMServer State' > /dev/null" % self.envFile
        status, output = subprocess.getstatusoutput(checkCMExistCmd)
        if status == 0:
            self.logger.logExit("CM exists in current cluster.")

    def checkCluster(self):
        """
        check the status of the current cluster

        Sets self.clusterStopped when every node is manually stopped.
        Otherwise the cluster must be Normal with exactly one primary;
        self.maxTerm and self.primaryTermAbnormal are updated from the term
        reported by every node.
        """
        cmd = "source %s; gr_om -t status --detail" % self.envFile
        status, output = subprocess.getstatusoutput(cmd)
        if status != 0:
            errorDetail = "Detail:\nCommand:\n" + cmd + "\noutput:" + output
            self.logger.logExit(ErrorCode.GAUSS_516["GAUSS_51600"] + errorDetail)
        if "cluster_state   : Unavailable" in output:
            # Deploying the CM tool is permitted when the cluster is stopped,
            # but not when the cluster is unavailable.
            if output.count("Manually stopped") == len(self.hostnames):
                self.clusterStopped = True
                return
            self.logger.logExit("The cluster is unavailable currently.")
        if "cluster_state   : Normal" not in output:
            self.logger.logExit("Cluster is running but its status is abnormal.")
        # check whether term of primary is invalid and biggest.
        primaryCount = 0
        primaryTerm = 0
        sqlCmd = "select term from pg_last_xlog_replay_location();"
        for host in self.hostnames:
            isLocal = host == self.localhostName
            findPrimaryCmd = "source %s; gs_ctl query -D %s | grep -i 'local_role.*Primary' > /dev/null" % \
                (self.envFile, self.nodesInfo[host]["dataPath"])
            # A zero status is taken to mean that the instance is a primary.
            primaryStatus, _ = executeCmdOnHost(host, findPrimaryCmd, isLocal)
            isPrimary = primaryStatus == 0
            if isPrimary:
                primaryCount += 1
            getTermLsnCmd = "source %s; gsql -d postgres -p %s -tA -c '%s'" % \
                (self.envFile, self.nodesInfo[host]["port"], sqlCmd)
            status, term = executeCmdOnHost(host, getTermLsnCmd, isLocal)
            if status != 0:
                self.logger.logExit("Failed to get term of host %s." % host)
            try:
                termValue = int(term)
            except ValueError:
                # Unexpected text in the query output: report it like any
                # other failure instead of dying with a traceback.
                self.logger.debug("Unexpected term output of host %s: %s" % (host, term))
                self.logger.logExit("Failed to get term of host %s." % host)
            if isPrimary:
                primaryTerm = termValue
            if self.maxTerm < termValue:
                self.maxTerm = termValue

        if primaryCount != 1:
            self.logger.logExit("The number of primary is invalid.")
        if primaryTerm == 0 or primaryTerm < self.maxTerm:
            self.primaryTermAbnormal = True
            self.logger.warn("Term of primary is invalid or not maximal.\n"
                "Hint: it seems that the cluster is newly installed, so it's "
                "recommended to deploy CM tool while installing the cluster.")

    def run(self):
        """
        Entry point: validate the invocation, collect cluster information
        and run the installation through InstallImpl.
        """
        self.checkExeUser()
        self.parseCommandLine()
        self.checkParam()
        # The environment must be read before the logger is created:
        # initLogger builds the log directory from self.gaussLog, which is
        # still empty until getEnvParams has run.
        self.getEnvParams()
        self.initLogger()
        self.generateXml()
        self.checkCM()
        self.getInfoListOfAllNodes()
        self.getLocalhostName()
        # self.checkHostTrust()

        # Register the host name -> ssh IP mapping used by executeCmdOnHost.
        # This is best effort: on failure the plain host names are used.
        try:
            clusterInfo = dbClusterInfo()
            clusterInfo.initFromXml(self.xmlFile)
            hostname_ip_map = {}
            for dbNode in clusterInfo.dbNodes:
                if hasattr(dbNode, 'sshIps') and dbNode.sshIps:
                    hostname_ip_map[dbNode.name] = dbNode.sshIps[0]
            set_hostname_ip_map(hostname_ip_map)
        except Exception as e:
            self.logger.debug("Failed to set hostname to ip map, use host names instead. "
                              "Error: %s" % str(e))

        installImpl = InstallImpl(self)
        installImpl.run()

####################################################################
##read cluster functions
####################################################################

def checkPathVaild(obtainpath):
    """
    function: reject a path that contains a shell-sensitive character
    input : obtainpath, the path to inspect
    output: NA (a blank path is accepted; an Exception is raised otherwise
            when a forbidden character is found)
    """
    if obtainpath.strip() == "":
        return
    forbidden_chars = (" ", "|", ";", "&", "$", "<", ">", "`", "\\", "'", "\"",
                       "{", "}", "(", ")", "[", "]", "~", "*", "?", "!", "\n")
    for forbidden in forbidden_chars:
        if forbidden in obtainpath:
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50219"] % obtainpath +
                            " There are illegal characters in the path.")

def getEnvParam(envFile, param):
    """
    Get environment parameter value from environment file.

    The env file is sourced in a shell and the variable is echoed, so the
    returned text is the complete output of that shell command.

    Args:
        envFile (str): Path to the environment file
        param (str): Parameter name to retrieve

    Returns:
        str: Parameter value (empty when the variable is not set)
    """
    cmd = "source {envFile}; echo ${param}".format(envFile=envFile, param=param)
    status, output = subprocess.getstatusoutput(cmd)
    if status != 0:
        # Include the command, status and output in the report; they were
        # previously assembled but never shown.
        errorDetail = "\nCommand: %s\nStatus: %s\nOutput: %s\n" % (
            cmd, status, output)
        CMLog.exitWithError(ErrorCode.GAUSS_518["GAUSS_51802"] % param + errorDetail)
    return output

def getLocalhostName():
    """
    Return the name of the local host as reported by ``socket.gethostname``.

    Returns:
        str: Local hostname
    """
    localName = socket.gethostname()
    return localName

# Module-level dictionary mapping host names to IP addresses.
# It is consulted by executeCmdOnHost for commands that are not run locally.
_hostname_to_ip_map = {}

def set_hostname_ip_map(hostname_ip_map):
    """
    Set global hostname to IP mapping.

    The module-level mapping is rebound to the given dictionary itself (no
    copy is made), so later changes made by the caller to that dictionary
    remain visible through the module-level name.

    Args:
        hostname_ip_map (dict): Dictionary mapping hostnames to IP addresses
    """
    global _hostname_to_ip_map
    _hostname_to_ip_map = hostname_ip_map

def executeCmdOnHost(host, cmd, isLocal=False):
    """
    Run a shell command locally or on another host through ssh.

    For a remote run the host name is replaced by its entry in the global
    hostname -> IP mapping when one exists, and the command is passed to
    ssh wrapped in double quotes.

    Args:
        host (str): Target hostname or IP address
        cmd (str): Command to execute
        isLocal (bool): Run the command on this machine without ssh
            (default: False)

    Returns:
        tuple: (status, output) - command exit status and output
    """
    if isLocal:
        return subprocess.getstatusoutput(cmd)
    target = _hostname_to_ip_map.get(host, host)
    remoteCmd = 'ssh -q -o ConnectTimeout=5 %s \"%s\"' % (target, cmd)
    return subprocess.getstatusoutput(remoteCmd)

def checkXMLFile(xmlFile):
    """
    Validate that the XML file can be used.

    A leading "~/" is replaced by the home directory of the current user,
    then three checks are made in order:
    1. the path exists
    2. the path is a regular file
    3. the current user may read it

    Each failed check is reported through CMLog.exitWithError.

    Args:
        xmlFile (str): Path to the XML file
    """
    resolvedPath = xmlFile
    if resolvedPath.startswith('~/'):
        resolvedPath = os.path.expanduser('~') + resolvedPath[1:]

    pathExists = os.path.exists(resolvedPath)
    if not pathExists:
        CMLog.exitWithError(ErrorCode.GAUSS_502["GAUSS_50201"] % "xmlFile")
    isRegularFile = os.path.isfile(resolvedPath)
    if not isRegularFile:
        CMLog.exitWithError(ErrorCode.GAUSS_502["GAUSS_50210"] % "xmlFile")
    isReadable = os.access(resolvedPath, os.R_OK)
    if not isReadable:
        CMLog.exitWithError(ErrorCode.GAUSS_501["GAUSS_50100"] % (resolvedPath, "current user"))

def checkHostsTrust(hosts):
    """
    Verify that the current host can ssh to every given host without a
    password.

    Each host is probed with a non-interactive ssh command; the hosts whose
    probe returns a non-zero status are collected and reported together
    through CMLog.exitWithError.

    Args:
        hosts (list): List of hostnames to check trust with
    """
    sshPrefix = (
        "ssh -o ConnectTimeout=3 -o ConnectionAttempts=5 "
        "-o PasswordAuthentication=no -o StrictHostKeyChecking=no "
    )
    untrusted = [
        host for host in hosts
        if subprocess.getstatusoutput(f"{sshPrefix}{host} 'pwd > /dev/null'")[0] != 0
    ]
    if len(untrusted) > 0:
        CMLog.exitWithError(ErrorCode.GAUSS_511["GAUSS_51100"] % ','.join(untrusted))

def GetHostIpOrName():
    """
    function: Obtain the IP address of the local host, or its host name
              when the HOST_IP environment variable is not defined
    input: NA
    output: string
    """
    env_dist = os.environ
    if "HOST_IP" not in env_dist:
        return getHostName()

    host_ip = env_dist.get("HOST_IP")
    if host_ip is not None and isIpValid(host_ip):
        return host_ip

    # HOST_IP is set but is not a valid address: resolve the local host
    # name instead. Every entry overwrites the previous one, so the address
    # of the last entry returned by getaddrinfo is the result.
    for info in socket.getaddrinfo(socket.gethostname(), None):
        # Entry index 4 is the socket address tuple; its first item is the
        # IPv4 or IPv6 address.
        host_ip = info[ADDRESS_FAMILY_INDEX][IP_ADDRESS_INDEX]
    return host_ip

def getHostName():
    """
    function : Run the "hostname" command and return its output
    input : NA
    output: string
    """
    hostnameCmd = findCmdInPath("hostname")
    exitCode, hostName = subprocess.getstatusoutput(hostnameCmd)
    if exitCode == 0:
        return hostName
    # the command failed, report it together with the command line used
    raise Exception(ErrorCode.GAUSS_502["GAUSS_50219"] % "host name"
                    + "The cmd is %s" % hostnameCmd)


def isIpValid(ip_address):
    """
    function : tell whether the input is a valid IPv4 or IPv6 address
    input : String
    output : bool
    """
    try:
        ipaddress.ip_address(ip_address)
    except ValueError:
        return False
    return True

def get_ip_version(ip_address):
    """
    function : classify an address as IPv4 or IPv6
    input : String
    output : NET_IPV4 or NET_IPV6 for a valid address, "" when the input
             is a host name or an invalid address
    """
    try:
        parsed = ipaddress.ip_address(ip_address)
    except ValueError:
        # the input may be a host name or an invalid ip
        return ""
    if parsed.version == 4:
        return NET_IPV4
    if parsed.version == 6:
        return NET_IPV6
    return None

def createFileInSafeMode(file_path, mode=stat.S_IWUSR | stat.S_IRUSR):
    """
    Create an empty file with restrictive permissions if it does not exist.

    Missing parent directories are created first. The file is created with
    O_CREAT | O_EXCL and the requested mode (owner read/write by default).
    A file that already exists is left untouched; its permissions are not
    changed.

    Args:
        file_path (str): Path of the file to create
        mode (int): Permission bits passed to os.open

    Raises:
        Exception: If the path cannot be created
    """
    parent_dir = os.path.dirname(file_path)
    try:
        # A bare file name has no directory part, and os.makedirs("") would
        # fail, so only create the parent when there is one.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # O_EXCL makes creation atomic: it fails when the file is present.
        fd = os.open(file_path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, mode)
        os.close(fd)
    except FileExistsError:
        pass  # already present, nothing to do
    except PermissionError as e:
        raise Exception(f"No permission to operate path: {file_path}. Error:\n{str(e)}") from e
    except OSError as e:
        raise Exception(f"Failed to create file: {file_path}. Error:\n{str(e)}") from e


def removeDirectory(path):
    """
    function: remove a directory with the recursive "rm" command
    input: path, which may contain "*" wildcards
    output: True (an Exception is raised when the command fails)
    """
    if "*" in path:
        # Quote everything except the wildcard components so that the
        # shell still expands them.
        path = withAsteriskPath(path)
        target = path
    else:
        target = "'%s'" % path
    cmd = "%s %s" % (getRemoveCmd("directory"), target)
    status, output = subprocess.getstatusoutput(cmd)
    if status == 0:
        return True
    raise Exception(ErrorCode.GAUSS_502["GAUSS_50209"] % path +
                    " Error:\n%s." % output + "The cmd is %s" % cmd)

def withAsteriskPath(path):
    """
    function: quote a path for the shell while leaving "*" components
              outside the single quotes
    input: the path to deal with
    output: the quoted path string
    """
    segments = os.path.realpath(path).split(os.path.sep)[1:]
    pieces = ["'"]
    for segment in segments:
        rooted = os.path.sep + segment
        if "*" in segment:
            # close the running quote before the wildcard component and
            # reopen it right after
            rooted = "'" + rooted + "'"
        pieces.append(rooted)
    quoted = "".join(pieces)
    # A trailing quote was opened by a final wildcard component and must be
    # dropped; otherwise the running quote still has to be closed.
    if quoted[-1] == "'":
        return quoted[:-1]
    return quoted + "'"

def getRemoveCmd(path_type):
    """
    function: build the "rm" command prefix for the given kind of path
    input  : path_type, "file" or "directory"; any other value gives no
             option
    output : str, the rm executable followed by its option and a space
    """
    if path_type == "file":
        return findCmdInPath('rm') + " -f "
    if path_type == "directory":
        return findCmdInPath('rm') + " -rf "
    return findCmdInPath('rm') + " "

def findCmdInPath(cmd, additional_paths=None, print_error=True):
    """
    function: locate the executable of a command
    input: cmd, additional_paths, print_error
    output: the full path of the command

    A command found in CMD_PATH is remembered in CMD_CACHE; a command found
    in additional_paths is returned without being cached. "killall" falls
    back to the script shipped under GPHOME. CommandNotFoundException is
    raised when nothing is found.
    """
    if cmd in CMD_CACHE:
        return CMD_CACHE[cmd]

    extra_dirs = [] if additional_paths is None else additional_paths

    # Search additional paths and don't add to cache.
    for directory in extra_dirs:
        candidate = os.path.join(directory, cmd)
        if os.path.exists(candidate):
            return candidate

    for directory in CMD_PATH:
        candidate = os.path.join(directory, cmd)
        if os.path.exists(candidate):
            CMD_CACHE[cmd] = candidate
            return candidate

    if cmd == "killall":
        gphome = os.getenv("GPHOME")
        if gphome is None or \
                not os.path.exists(os.path.join(gphome, "script/killall")):
            gphome = os.path.dirname(os.path.realpath(__file__)) \
                        + "/../../.."
        gphome = gphome.replace("\\", "\\\\").replace('"', '\\"\\"')
        # NOTE: the injection check on gphome is disabled in this file.
        killall_script = os.path.join(gphome, "script/killall")
        if gphome != "" and os.path.exists(killall_script):
            return killall_script
        raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % "killall")

    if print_error:
        print('Command %s not found' % cmd)
    search_path = CMD_PATH[:]
    search_path.extend(extra_dirs)
    raise CommandNotFoundException(cmd, search_path)

def getEnvironmentParameterValue(environment_parameter_name, user, env_file=None):
    """
    function : Get the value of an environment parameter after sourcing
               the user profile (env_file when given, otherwise the file
               returned by getMpprcFile)
    input : String,String
    output : String, the first output line with backslashes and double
             quotes escaped, or "" when the command fails
    """
    user_profile = getMpprcFile() if env_file is None else env_file
    # build the shell command
    # NOTE: the injection check on the parameter name is disabled in this file.
    echo_cmd = "echo $%s" % environment_parameter_name
    cmd = getExecuteCmdWithUserProfile(user, user_profile, echo_cmd)
    status, output = subprocess.getstatusoutput(cmd)
    if status != 0:
        return ""
    first_line = output.split("\n")[0]
    return first_line.replace("\\", "\\\\").replace('"', '\\"\\"')

def getMpprcFile():
    """
    function : get mpprc file
    input : NA
    output : String

    MPPDB_ENV_SEPARATE_PATH wins when it is set; it must be an absolute,
    existing path. Otherwise "/etc/profile" is returned for uid 0 and the
    .bashrc of the current user in every other case. Except for
    "/etc/profile", the result must be a regular file.
    """
    # get mpp file by env parameter MPPDB_ENV_SEPARATE_PATH
    mpprc_file = getEnv("MPPDB_ENV_SEPARATE_PATH")
    if mpprc_file is None or mpprc_file == "":
        if os.getuid() == 0:
            return "/etc/profile"
        mpprc_file = os.path.join(getUserHomePath(), ".bashrc")
    else:
        if not os.path.isabs(mpprc_file):
            raise Exception(ErrorCode.GAUSS_512["GAUSS_51206"] % mpprc_file)
        if not os.path.exists(mpprc_file):
            raise Exception(ErrorCode.GAUSS_502["GAUSS_50201"] % mpprc_file)
    if os.path.isfile(mpprc_file):
        return mpprc_file
    raise Exception(ErrorCode.GAUSS_502["GAUSS_50210"] % mpprc_file)

def getUserHomePath():
    """
    Return the home path of the current user, as expanded by the shell
    from "~".
    """
    # let the shell turn "~" into an absolute path
    exitCode, homePath = subprocess.getstatusoutput("echo ~ 2>/dev/null")
    if exitCode == 0:
        return homePath
    raise Exception(ErrorCode.GAUSS_502["GAUSS_50219"] % "user home")

def getExecuteCmdWithUserProfile(user, user_profile, execute_cmd,
                                     ignore_error=True):
    """
    function : build a shell command that sources user_profile and then
               runs execute_cmd; when called as root (uid 0) with a
               non-empty user, the command is wrapped in "su - <user> -c"
    input : user, user_profile, execute_cmd, ignore_error
    output : String, with " 2>/dev/null" appended when ignore_error is set
    """
    run_as_other_user = (user != "") and (os.getuid() == 0)
    if run_as_other_user:
        template = "su - %s -c 'source %s; %s'"
        cmd = template % (user, user_profile, execute_cmd)
    else:
        template = "source %s; %s"
        cmd = template % (user_profile, execute_cmd)
    return cmd + " 2>/dev/null" if ignore_error else cmd

def getEnv(env_param, default_value=None):
    """
    function: get the filter environment variable
    input:envparam: String
            default_value: String
    output:envValue

    Returns default_value when the variable is not set, including falsy
    defaults such as "". Otherwise returns the value with backslashes
    doubled and each double quote replaced by two escaped double quotes.
    """
    env_value = os.getenv(env_param)

    if env_value is None:
        # Return the default as given; a falsy default such as "" used to
        # be dropped in favour of None.
        return default_value

    # NOTE: the injection check on the value is disabled in this file.
    return env_value.replace("\\", "\\\\").replace('"', '\\"\\"')

def isdigit(c):
    """Return True when c compares within '0'..'9' (plain string comparison)."""
    return not (c < '0' or c > '9')

def islower(c):
    """Return True when c compares within the range 'a'..'z' inclusive."""
    return c >= 'a' and c <= 'z'

def isupper(c):
    """Return True when c compares within the range 'A'..'Z' inclusive."""
    return c >= 'A' and c <= 'Z'

def json_to_xml(json_file):
    """
    function: convert a JSON cluster configuration file into a temporary
              XML configuration file
    input: json_file: path of the JSON configuration file
    output: (xml_path, env_file, user)
            xml_path: path of the generated temporary XML file; the file
                      is not deleted automatically, the caller removes it
            env_file: value of cluster.env_file ('' when absent)
            user: value of cluster.user ('' when absent)
            (None, None, None) is returned when the JSON file cannot be
            read or parsed
    raises: KeyError when a mandatory key (cluster, nodes, name,
            install_path, and per node name/ip/gr_port) is missing
    """
    # Local import: the escaping helper is only needed by this converter.
    from xml.sax.saxutils import escape

    def attr(value):
        """Escape a value for use inside a double-quoted XML attribute."""
        return escape(str(value), {'"': '&quot;'})

    def param(indent, name, value):
        """Render one <PARAM/> element indented by `indent` spaces."""
        return f'{" " * indent}<PARAM name="{name}" value="{attr(value)}"/>'

    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except Exception as e:
        print(f"Error reading JSON file: {e}")
        return None, None, None

    # Optional settings handed back to the caller alongside the XML path.
    env_file = config.get('cluster', {}).get('env_file', '')
    user = config.get('cluster', {}).get('user', '')

    cluster = config['cluster']
    nodes = cluster['nodes']
    # NOTE(review): sub-paths are appended by plain concatenation, so
    # install_path is presumably expected to end with '/' - confirm.
    install_path = cluster['install_path']

    # Cluster-wide comma separated lists.
    node_names = [member['name'] for member in nodes]
    back_ips = [member['ip'] for member in nodes]
    back_rest_ips = [member.get('rest_ip', '') for member in nodes]
    gr_nodes_list = []
    rest_nodes_list = []
    for i, member in enumerate(nodes):
        gr_nodes_list.append(f"{i}:{member['ip']}:{member['gr_port']}")
        rest_ip = member.get('rest_ip', '')
        rest_port = member.get('rest_port', '')
        # Only nodes that define both REST settings join the REST list.
        if rest_ip and rest_port:
            rest_nodes_list.append(f"{i}:{rest_ip}:{rest_port}")

    lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<ROOT>',
        '    <CLUSTER>',
        param(8, "clusterName", cluster['name']),
        param(8, "nodeNames", ','.join(node_names)),
        param(8, "installPath", install_path),
        param(8, "backIp1s", ','.join(back_ips)),
        param(8, "backRestIp1s", ','.join(back_rest_ips)),
        param(8, "gr_nodes_list", ','.join(gr_nodes_list)),
        param(8, "rest_nodes_list", ','.join(rest_nodes_list)),
        param(8, "wormPath", cluster.get('worm_path', '/tmp/')),
        param(8, "caPath", cluster.get('ca_path', 'CA')),
        '    </CLUSTER>',
        '    <DEVICELIST>',
    ]

    # One DEVICE section per node.
    for i, node in enumerate(nodes):
        lines.append(f'        <DEVICE sn="{attr(node["name"])}">')
        lines.extend([
            param(12, "name", node['name']),
            param(12, "azName", "AZ1"),
            param(12, "azPriority", "1"),
            param(12, "backIp1", node['ip']),
            param(12, "sshIp1", node['ip']),
            param(12, "cmDir", f"{install_path}cm"),
            param(12, "grIp1", node['ip']),
            param(12, "listen_addr", node['ip']),
            param(12, "listen_port", node['gr_port'] + 10),
        ])

        # The first node additionally carries the CM server / datanode
        # layout of the whole cluster.
        if i == 0:
            # dataNode1 format:
            #   <dn path>,<name of node 2>,<dn path>,<name of node 3>,...
            # A dedicated loop variable is used so the enclosing `node`
            # is not rebound while building this list.
            data_node_parts = []
            for j, member in enumerate(nodes):
                if j > 0:
                    data_node_parts.append(member['name'])
                data_node_parts.append(f"{install_path}data/dn")

            lines.extend([
                param(12, "cmsNum", "1"),
                param(12, "cmServerPortBase", nodes[0]['gr_port'] + 20),
                param(12, "cmServerListenIp1", ','.join(back_ips)),
                param(12, "cmServerlevel", "1"),
                # First node followed by all remaining nodes, in order.
                param(12, "cmServerRelation", ','.join(node_names)),
                param(12, "dataNum", "1"),
                param(12, "dataPortBase", nodes[0]['gr_port'] + 30),
                param(12, "dataNode1", ','.join(data_node_parts)),
            ])
        lines.append('        </DEVICE>')

    lines.append('    </DEVICELIST>')
    lines.append('</ROOT>')

    # delete=False: the file must outlive this function; the caller is
    # responsible for removing it.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.xml', delete=False,
                                     encoding='utf-8') as temp_xml:
        temp_xml.write('\n'.join(lines))

    return temp_xml.name, env_file, user

def _main_prepare_json_config(argv):
    """
    function: locate the first "*.json" argument and prepare it for the
              command line parser
    input: argv: argument list (normally sys.argv); when the JSON file is
                 converted, its entry is replaced in place by the path of
                 the generated XML file
    output: (json_file, xml_file, json_env_file, json_user); every item
            is None when it does not apply
    note: exits with status 1 when the JSON to XML conversion fails
    """
    json_file = None
    xml_file = None
    json_env_file = None
    json_user = None

    if len(argv) <= 1:
        return json_file, xml_file, json_env_file, json_user

    # These commands take the JSON file as-is, so no XML conversion is
    # performed for them.
    is_json_command = any(
        command in argv
        for command in ('trust', 'uninstall', 'gr_certs', 'preinstall'))

    for i, arg in enumerate(argv):
        if not arg.endswith('.json'):
            continue
        json_file = arg
        if is_json_command:
            # env_file and user are still needed from the JSON content.
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    config = json.load(f)
                json_env_file = config.get('cluster', {}).get('env_file', '')
                json_user = config.get('cluster', {}).get('user', '')
            except Exception as e:
                print(f"Error reading JSON file: {e}")
            break
        # Convert the JSON file to XML and hand the XML path to argparse.
        xml_file, json_env_file, json_user = json_to_xml(json_file)
        if not xml_file:
            print(f"Failed to convert JSON file: {json_file}")
            sys.exit(1)
        print(f"Converted JSON config '{json_file}' to XML: {xml_file}")
        argv[i] = xml_file
        break

    return json_file, xml_file, json_env_file, json_user


def _main_build_parser():
    """Build the argument parser with one sub-command per operation."""
    parser = argparse.ArgumentParser(description='oGRecorder OM Tool')
    subparsers = parser.add_subparsers(dest='command', required=True)

    # View command
    view_parser = subparsers.add_parser('view', help='View cluster information')
    view_parser.add_argument('xmlFile', help='XML configuration file (or JSON file)')

    # Generate-xml command
    gen_parser = subparsers.add_parser('generate-xml', help='Generate XML configuration')
    gen_parser.add_argument('xmlFile', help='XML configuration file (or JSON file)')

    # Install All command (GR + CM)
    install_all_parser = subparsers.add_parser('install', help='Install both GR and CM packages')
    install_all_parser.add_argument('-X', dest='xmlFile', required=True, help='XML configuration file (or JSON file)')
    install_all_parser.add_argument('--grpkg', dest='gr_package', required=True, help='Path to GR installation package')
    install_all_parser.add_argument('--cmpkg', dest='cm_package', required=True, help='Path to CM package')
    install_all_parser.add_argument('--restpkg', dest='rest_package', required=False, help='Path to CM-RESTAPI package')
    install_all_parser.add_argument('-n', '--nodeids', dest='target_nodeids', nargs='+', type=int, default=None,
        help='(Not supported for install, only for upgrade)')

    # GR uninstall command
    uninstall_parser = subparsers.add_parser('uninstall', help='Uninstall cluster')
    uninstall_parser.add_argument('-X', dest='config_file', required=True, help='JSON configuration file path')

    # GR certs command
    gr_certs_parser = subparsers.add_parser('gr_certs', help='Generate and distribute GR certs, then reload')
    gr_certs_parser.add_argument('-X', dest='config_file', required=True, help='JSON configuration file path')

    # GR preinstall command
    preinstall_parser = subparsers.add_parser('preinstall', help='Preinstall cluster')
    preinstall_parser.add_argument('-X', dest='config_file', required=True, help='JSON configuration file path')

    # GR / CM upgrade command
    upgrade_parser = subparsers.add_parser(
        'upgrade',
        help='Upgrade GR/CM to new version. '
             'With --nodeids: online GR upgrade on specific nodes; '
             'without --nodeids: cluster upgrade (GR, CM or both).'
    )
    upgrade_parser.add_argument(
        '-X', dest='xmlFile', required=True,
        help='XML configuration file (or JSON file)'
    )
    upgrade_parser.add_argument(
        '--grpkg', dest='gr_package', required=False,
        help='Path to new GR installation package'
    )
    upgrade_parser.add_argument(
        '--cmpkg', dest='cm_package', required=False,
        help='Path to new CM package (only valid in cluster upgrade mode without --nodeids)'
    )
    upgrade_parser.add_argument(
        '-n', '--nodeids', dest='target_nodeids', nargs='+', type=int, default=None,
        help='List of node IDs to upgrade (starting from 1, default: all nodes)'
    )

    # SSH trust command
    trust_parser = subparsers.add_parser('trust', help='Automatically configure SSH mutual trust for cluster')
    trust_parser.add_argument('-X', dest='config_file', required=True, help='JSON configuration file path')

    return parser


def _main_run_upgrade(clusterInfo, args):
    """Verify the supplied packages, then run the matching upgrade flow."""
    # ---------- package verification ----------
    print("=" * 60)
    print("PACKAGE VERIFICATION")
    print("=" * 60)
    try:
        if args.gr_package:
            print(f"  [1/2] Verifying GR package: {args.gr_package}")
            clusterInfo._verify_package_type(args.gr_package, 'gr')
            print("        ✓ GR package verification passed.")
        else:
            print("  [1/2] GR package: Not specified (skip)")

        if args.cm_package:
            print(f"  [2/2] Verifying CM package: {args.cm_package}")
            clusterInfo._verify_package_type(args.cm_package, 'cm')
            print("        ✓ CM package verification passed.")
        else:
            print("  [2/2] CM package: Not specified (skip)")

        print("=" * 60)
        print("All package verifications passed!")
        print("=" * 60)
        print()

    except PackageVerificationError as e:
        print()
        print(e.get_user_friendly_message())
        print()
        print("Upgrade aborted due to package verification failure.")
        sys.exit(e.error_code)

    # ---------- upgrade flow ----------
    if args.cm_package:
        # Cluster upgrade mode: selected whenever --cmpkg is given.
        if args.target_nodeids:
            print("Error: -n/--nodeids cannot be used with --cmpkg (CM upgrade requires full-cluster stop)")
            sys.exit(1)
        if args.gr_package:
            # GR and CM together: one stop, one start, full rollback.
            print("Combined upgrade: upgrading GR + CM together (single stop/start, full rollback on failure).")
            clusterInfo.gr_cm_combined_upgrade(args.xmlFile, args.gr_package, args.cm_package)
        else:
            # CM only.
            clusterInfo.cm_cluster_upgrade(args.xmlFile, args.cm_package)
    elif args.target_nodeids:
        # Online node-level upgrade: GR only.
        if not args.gr_package:
            print("Error: --grpkg is required when -n/--nodeids is specified (online node upgrade mode)")
            sys.exit(1)
        clusterInfo.gr_upgrade(args.xmlFile, args.gr_package, args.target_nodeids)
    elif args.gr_package:
        # Cluster-level GR upgrade without node ids.
        clusterInfo.gr_upgrade(args.xmlFile, args.gr_package, target_nodeids=None)
    else:
        print("Error: at least one of --grpkg or --cmpkg must be specified")
        sys.exit(1)


def _main_exit_status(code):
    """Map a SystemExit code to an integer process exit status."""
    if code is None:
        return 0
    if isinstance(code, int):
        return code
    # sys.exit("message") semantics: print the message, status 1.
    print(code, file=sys.stderr)
    return 1


def _main_remove_temp_xml(xml_file):
    """Best-effort removal of the temporary XML file, if one was created."""
    if xml_file and os.path.exists(xml_file):
        try:
            os.unlink(xml_file)
            print(f"Cleaned up temporary XML file: {xml_file}")
        except OSError as e:
            # Cleanup must never mask the real outcome of the command.
            print(f"Warning: failed to remove temporary XML file {xml_file}: {e}")


def main():
    """
    function: command line entry point of the OM tool
    input: NA (arguments are read from sys.argv)
    output: NA

    Flow: convert a JSON configuration argument to a temporary XML file
    when required, parse the command line, copy env_file/user from the
    JSON configuration into the parsed arguments, dispatch to the handler
    of the selected command, and always remove the temporary XML file.

    The 'upgrade' command terminates through os._exit() to skip the
    normal interpreter shutdown (which was observed to trigger a core
    dump in library teardown). The real exit status is preserved: 0 on
    success, the sys.exit() code or 1 on failure.
    """
    clusterInfo = dbClusterInfo()

    json_file, xml_file, json_env_file, json_user = \
        _main_prepare_json_config(sys.argv)

    args = None
    exit_code = 0
    try:
        # Parsing and validation run inside the try block so that the
        # temporary XML file is removed even when they exit early.
        args = _main_build_parser().parse_args()

        # Use the values from the JSON file and check mandatory ones.
        if args.command in ['install', 'gr_certs', 'preinstall']:
            if not json_env_file:
                print("Error: env_file not found in JSON configuration")
                sys.exit(1)
            args.envFile = json_env_file
            print(f"Using env file from JSON config: {json_env_file}")

        if args.command == 'preinstall':
            if not json_user:
                print("Error: user not found in JSON configuration")
                sys.exit(1)
            args.user = json_user
            print(f"Using user from JSON config: {json_user}")

        # Dispatch to the handler of the selected command.
        if args.command == 'view':
            clusterInfo.initFromXml(args.xmlFile)
            file_path = "%s/bin/cluster_static_config" % (clusterInfo.appPath)
            clusterInfo.printStaticConfig(args.xmlFile, file_path)
        elif args.command == 'generate-xml':
            clusterInfo.initFromXml(args.xmlFile)
            clusterInfo.doRebuildConf(args.xmlFile)
        elif args.command == 'install':
            if args.target_nodeids:
                print("Error: -n/--nodeids is only supported for 'upgrade' command, not 'install'")
                sys.exit(1)
            clusterInfo.gr_install(args.xmlFile, args.gr_package, args.rest_package, json_file)
            install = Install()
            install.xmlFile = args.xmlFile
            install.envFile = args.envFile
            install.cmpkg = args.cm_package
            install.restpkg = args.rest_package
            install.run()
        elif args.command == 'preinstall':
            clusterInfo.preinstall(args.config_file, args.user, args.envFile)
        elif args.command == 'uninstall':
            clusterInfo.uninstall(args.config_file)
        elif args.command == 'gr_certs':
            clusterInfo.distribute_gr_certs(args.config_file, args.envFile)
        elif args.command == 'upgrade':
            _main_run_upgrade(clusterInfo, args)
        elif args.command == 'trust':
            clusterInfo.setup_ssh_trust(args.config_file)
        else:
            print("Invalid command")
            sys.exit(1)
    except SystemExit as exc:
        if args is None or args.command != 'upgrade':
            raise
        # Remember the requested status; the forced exit below uses it.
        exit_code = _main_exit_status(exc.code)
    except Exception:
        if args is None or args.command != 'upgrade':
            raise
        # The forced exit bypasses the default traceback printing, so
        # report the failure explicitly.
        import traceback
        traceback.print_exc()
        exit_code = 1
    finally:
        _main_remove_temp_xml(xml_file)

    if args.command == 'upgrade':
        # os._exit() does not flush stdio buffers, so flush them first.
        sys.stdout.flush()
        sys.stderr.flush()
        os._exit(exit_code)

# Script entry point: run the command line tool only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()