/* Bbowenliu1030 CM patch */
/* 7ab5b2eb created on 2022-11-07, commit history */
/*

 * Copyright (c) 2021 Huawei Technologies Co.,Ltd.

 *

 * CM is licensed under Mulan PSL v2.

 * You can use this software according to the terms and conditions of the Mulan PSL v2.

 * You may obtain a copy of Mulan PSL v2 at:

 *

 *          http://license.coscl.org.cn/MulanPSL2

 *

 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,

 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,

 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.

 * See the Mulan PSL v2 for more details.

 * -------------------------------------------------------------------------

 *

 * cms_az_check_network.cpp

 *    AZ net check main

 *

 * IDENTIFICATION

 *    src/cm_server/cms_az_check_network.cpp

 *

 * -------------------------------------------------------------------------

 */



#include "cm/cm_elog.h"

#include "cms_global_params.h"

#include "cms_ddb.h"

#include "cms_az.h"

#include "cms_process_messages.h"

#include "cms_common.h"



/*
 * Scan g_cmAzInfo and record the name and priority of the current AZ and of the other ("leaf1") AZ.
 *
 * Every entry is logged. Entries whose dnCount is 0 are skipped. The entry whose azName equals
 * g_currentNode->azName is copied into curAz; any other remaining entry is copied into leaf1Az,
 * so if several such entries exist the last one visited is what stays in leaf1Az.
 *
 * leaf1Az: out, receives azName and azPriority of the non-current AZ.
 * curAz:   out, receives azName and azPriority of the entry matching g_currentNode->azName.
 */
static void GetLeafAzName(ConnCheck *leaf1Az, ConnCheck *curAz)
{
    for (int32 azIdx = 0; azIdx < AZ_MEMBER_MAX_COUNT; ++azIdx) {
        write_runlog(LOG, "azName(%s), dnCount is %u, azPriority is %u.\n",
            g_cmAzInfo[azIdx].azName, g_cmAzInfo[azIdx].dnCount, g_cmAzInfo[azIdx].azPriority);
        if (g_cmAzInfo[azIdx].dnCount == 0) {
            continue;
        }

        /* Choose the destination once, then share the copy logic for both cases. */
        const bool isCurrentAz = (strcmp(g_cmAzInfo[azIdx].azName, g_currentNode->azName) == 0);
        ConnCheck *target = isCurrentAz ? curAz : leaf1Az;

        errno_t copyRc = memcpy_s(target->azName, CM_AZ_NAME, g_cmAzInfo[azIdx].azName, CM_AZ_NAME);
        securec_check_errno(copyRc, (void)copyRc);
        target->azPriority = g_cmAzInfo[azIdx].azPriority;

        if (isCurrentAz) {
            write_runlog(LOG, "curAz azname(%s), azPriority is %u.\n", curAz->azName, curAz->azPriority);
        } else {
            write_runlog(LOG, "leaf1Az azname(%s), azPriority is %u.\n", leaf1Az->azName, leaf1Az->azPriority);
        }
    }
}



/*
 * Translate an AZ priority into an AZ role using the g_az_master / g_az_slave / g_az_arbiter thresholds:
 *
 *   g_az_master  <= azPriority < g_az_slave    -> AZMaster
 *   g_az_slave   <= azPriority < g_az_arbiter  -> AZSlave
 *   g_az_arbiter <= azPriority                 -> AZArbiter
 *
 * The ranges are tested in that order. *azRole is written only when one of them matches.
 *
 * Returns true when a role was assigned, false when azPriority fell into none of the ranges.
 */
static bool GetAzRoleByAzPriority(uint32 azPriority, AZRole *azRole)
{
    const bool inMasterRange = (azPriority >= g_az_master) && (azPriority < g_az_slave);
    const bool inSlaveRange = (azPriority >= g_az_slave) && (azPriority < g_az_arbiter);
    const bool inArbiterRange = (azPriority >= g_az_arbiter);

    if (inMasterRange) {
        *azRole = AZMaster;
    } else if (inSlaveRange) {
        *azRole = AZSlave;
    } else if (inArbiterRange) {
        *azRole = AZArbiter;
    } else {
        return false;
    }
    return true;
}



/*
 * Derive azRole from azPriority for leaf1Az and then for curAz.
 *
 * leaf1Az is resolved first; if that fails, an error is logged and curAz is not touched.
 *
 * Returns true when both roles were resolved, false (after logging) as soon as one cannot be.
 */
static bool GetLeafAzRole(ConnCheck *leaf1Az, ConnCheck *curAz)
{
    if (!GetAzRoleByAzPriority(leaf1Az->azPriority, &(leaf1Az->azRole))) {
        write_runlog(
            ERROR, "leaf1Az az(%s) can not get azRole by azPriority(%u).\n", leaf1Az->azName, leaf1Az->azPriority);
        return false;
    }

    if (GetAzRoleByAzPriority(curAz->azPriority, &(curAz->azRole))) {
        return true;
    }
    write_runlog(ERROR, "curAz az(%s) can not get azRole by azPriority(%u).\n", curAz->azName, curAz->azPriority);
    return false;
}



/*
 * Verify that each of the three ConnCheck records carries a non-empty azName and a non-zero azPriority.
 *
 * The records are checked in the order leaf1Az, leaf2Az, curAz. The first invalid one is logged
 * and ends the check.
 *
 * Returns true when all three records are valid, false otherwise.
 */
static bool CheckLeafAzName(const ConnCheck *leaf1Az, const ConnCheck *leaf2Az, const ConnCheck *curAz)
{
    /* An empty string is detected by its first byte instead of measuring the whole name. */
    const bool leaf1Valid = (leaf1Az->azName[0] != '\0') && (leaf1Az->azPriority != 0);
    if (!leaf1Valid) {
        write_runlog(
            ERROR, "leaf1Az az name(%s) or azPriority(%u) is invalid.\n", leaf1Az->azName, leaf1Az->azPriority);
        return false;
    }

    const bool leaf2Valid = (leaf2Az->azName[0] != '\0') && (leaf2Az->azPriority != 0);
    if (!leaf2Valid) {
        write_runlog(
            ERROR, "leaf2Az az name(%s) or azPriority(%u) is invalid.\n", leaf2Az->azName, leaf2Az->azPriority);
        return false;
    }

    const bool curValid = (curAz->azName[0] != '\0') && (curAz->azPriority != 0);
    if (!curValid) {
        write_runlog(ERROR, "curAz az name(%s) or azPriority(%u) is invalid.\n", curAz->azName, curAz->azPriority);
        return false;
    }
    return true;
}



/*
 * Populate the three ConnCheck records used by the AZ connectivity check.
 *
 * leaf1Az and curAz get their name and priority from GetLeafAzName() and their role from
 * GetLeafAzRole(). leaf2Az is filled from g_node[nodeIdx] and is always given the AZArbiter role.
 *
 * nodeIdx: index into g_node of the node that describes leaf2Az.
 *
 * Returns true when all three records are valid and the roles were resolved, false otherwise.
 */
static bool InitConnCheck(ConnCheck *leaf1Az, ConnCheck *leaf2Az, ConnCheck *curAz, int32 nodeIdx)
{
    GetLeafAzName(leaf1Az, curAz);

    errno_t copyRc = memcpy_s(leaf2Az->azName, CM_AZ_NAME, g_node[nodeIdx].azName, CM_AZ_NAME);
    securec_check_errno(copyRc, (void)copyRc);
    leaf2Az->azPriority = g_node[nodeIdx].azPriority;
    leaf2Az->azRole = AZArbiter;

    /* Names and priorities are validated before roles are derived from the priorities. */
    if (!CheckLeafAzName(leaf1Az, leaf2Az, curAz) || !GetLeafAzRole(leaf1Az, curAz)) {
        return false;
    }

    write_runlog(LOG, "leaf1Az(%s %u:%d), curAz(%s %u:%d), leaf2Az(%s %u:%d).\n",
        leaf1Az->azName, leaf1Az->azPriority, leaf1Az->azRole, curAz->azName, curAz->azPriority, curAz->azRole,
        leaf2Az->azName, leaf2Az->azPriority, leaf2Az->azRole);
    return true;
}



/*
 * Run one round of AzPingCheck() against leaf1Az, curAz and leaf2Az and decide whether the caller
 * should act on the result now.
 *
 * AzPingCheck() is handed a pointer to each record's curConn, so curConn holds this round's result.
 * NOTE(review): the exact meaning of AzPingCheck()'s boolean return is defined elsewhere; here a
 * false return from any of the three calls resets *checkTimes and postpones the decision.
 *
 * When all three calls return true, *checkTimes is incremented, and only every 5th consecutive such
 * round lets the function continue (the counter is then reset to 0). At that point, if any curConn
 * differs from its lastConn, or g_dbConn.modId == MOD_ALL, the lastConn values are refreshed and a
 * new DDB connection session is created.
 *
 * checkTimes: in/out, counter of consecutive rounds in which all three calls returned true.
 *
 * Returns WAIT_NEXT_TIME when the caller should sleep and retry, CONTINUE_EXECTING otherwise.
 */
static AzPingCheckRes AzAndInnerConnectCheck(ConnCheck *leaf1Az, ConnCheck *leaf2Az, ConnCheck *curAz,
    int32 *checkTimes)
{
    const int32 checkConnMax = 5;
    bool leaf1AzConnectOK = AzPingCheck(&(leaf1Az->curConn), leaf1Az->azName);
    bool curAZConnectOK = AzPingCheck(&(curAz->curConn), curAz->azName);
    bool leaf2AzConnectOK = AzPingCheck(&(leaf2Az->curConn), leaf2Az->azName);

    if ((!leaf1Az->curConn) || (!curAz->curConn) || (!leaf2Az->curConn)) {
        /* Report the current status of all three AZs; leaf2Az used to print lastConn by mistake. */
        write_runlog(LOG, "The AZ Conn Status %s:%d, %s:%d, %s:%d Changed this time  %d, try next time.\n",
            leaf1Az->azName, leaf1Az->curConn, curAz->azName, curAz->curConn, leaf2Az->azName, leaf2Az->curConn,
            *checkTimes);
    }

    if ((!leaf1AzConnectOK) || (!curAZConnectOK) || (!leaf2AzConnectOK)) {
        *checkTimes = 0;
        return WAIT_NEXT_TIME;
    }

    /* Only every checkConnMax-th consecutive good round is allowed through. */
    if (((++(*checkTimes)) % checkConnMax) != 0) {
        return WAIT_NEXT_TIME;
    }
    *checkTimes = 0;

    if ((leaf1Az->curConn != leaf1Az->lastConn) || (curAz->curConn != curAz->lastConn) ||
        (leaf2Az->curConn != leaf2Az->lastConn) || (g_dbConn.modId == MOD_ALL)) {
        write_runlog(LOG,
            "leaf1Az(%s %d: %d), curAz(%s %d: %d), leaf2Az(%s %d: %d), will open "
            "new ddb Connect.\n",
            leaf1Az->azName, leaf1Az->lastConn, leaf1Az->curConn, curAz->azName, curAz->lastConn, curAz->curConn,
            leaf2Az->azName, leaf2Az->lastConn, leaf2Az->curConn);
        leaf1Az->lastConn = leaf1Az->curConn;
        curAz->lastConn = curAz->curConn;
        leaf2Az->lastConn = leaf2Az->curConn;
        CreateDdbConnSession(leaf1Az->lastConn, leaf2Az->lastConn, curAz->lastConn);
    }
    return CONTINUE_EXECTING;
}



void UnlinkStopFile(const int type)

{

    int rc = 0;

    char execPath[MAX_PATH_LEN] = {0};

    char stopFlagFile[MAX_PATH_LEN] = {0};

    struct stat statBuf = {0};

    if (GetHomePath(execPath, sizeof(execPath)) != 0) {

        return;

    }

    if (type == SINGLEAZ_TYPE) {

        rc = snprintf_s(stopFlagFile, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/%s", execPath, "az_node_instances_stop");

    } else if (type == SINGLENODE_TYPE) {

        rc = snprintf_s(stopFlagFile, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/%s", execPath, "node_instances_stop");

    }

    securec_check_intval(rc, (void)rc);

    if (stat(stopFlagFile, &statBuf) == 0) {

        if (unlink(stopFlagFile) != 0) {

            write_runlog(ERROR, "delete cms-node stop instances flag file: %s failed.\n", stopFlagFile);

        }

    }

}



/*
 * Restart whatever was previously stopped, based on the stop flag files that are present.
 *
 * - If curAz->curConn is set and the SINGLEAZ_TYPE flag file exists, the current AZ is started
 *   and that flag file is removed.
 * - Independently, if the SINGLENODE_TYPE flag file exists, the instances of the current node
 *   are started and that flag file is removed.
 */
static void DoStartCurNodeOrCurAz(const ConnCheck *curAz)
{
    /* The AZ flag file is only consulted when curConn is set. */
    const bool startAz = curAz->curConn && CheckStopFileExist(SINGLEAZ_TYPE);
    if (startAz) {
        write_runlog(LOG, "check the single_az file, we should start current az.\n");
        StartOrStopAZ(START_AZ, curAz->azName);
        UnlinkStopFile(SINGLEAZ_TYPE);
    }

    const bool startNode = CheckStopFileExist(SINGLENODE_TYPE);
    if (startNode) {
        write_runlog(LOG, "check the single_node file, We only need start current node(%u).\n", g_currentNode->node);
        StartOrStopNodeInstanceByCommand(START_AZ, g_currentNode->node);
        UnlinkStopFile(SINGLENODE_TYPE);
    }
}



static void CheckStopAzByArbitrate(const ConnCheck *leafAz, const ConnCheck *curAz, int32 nodeIdx)

{

    bool leafAzHasBeenStop = GetStopAzFlagFromDdb(leafAz->azRole);

    bool leafHealthState = doCheckAzStatus(g_node[nodeIdx].sshChannel[0], leafAz->azRole);

    if (leafHealthState && !leafAz->curConn) {

        if (SetStopAzFlagToDdb(leafAz->azRole, true)) {

            StopAZ(g_node[nodeIdx].sshChannel[0], leafAz->azRole);

            write_runlog(LOG, "%s and %s is disconnected, %s is available, stop %s.\n",

                leafAz->azName, curAz->azName, curAz->azName, leafAz->azName);

        } else {

            write_runlog(ERROR, "set stop %s key failed, can not stop %s.\n", leafAz->azName, leafAz->azName);

        }

    }

    if ((!leafHealthState) && (leafAz->curConn)) {

        if (leafAzHasBeenStop) {

            write_runlog(LOG, "%s and %s connection is ok, start %s now.\n", leafAz->azName,

                curAz->azName, leafAz->azName);

            if (SetStopAzFlagToDdb(leafAz->azRole, false)) {

                StartAZ(leafAz->azRole);

            } else {

                write_runlog(ERROR, "clear az1 stop flag failed, can not start %s.\n", leafAz->azName);

            }

        } else {

            write_runlog(LOG, "az1 may be stopped by user with cm_ctl stop -z.\n");

        }

    }

}



/*
 * Start/stop handling used by the caller when the local CM server role is primary.
 *
 * First restarts the current node or AZ if a stop flag file asks for it. Then, if leaf2Az->curConn
 * is set, runs the stop/start decision for leaf1Az; otherwise only logs that the decision cannot
 * be made.
 *
 * nodeIdx: index into g_node, forwarded to CheckStopAzByArbitrate().
 */
static void CheckStartAndStopInPrimary(
    const ConnCheck *leaf1Az, const ConnCheck *curAz, const ConnCheck *leaf2Az, int32 nodeIdx)
{
    DoStartCurNodeOrCurAz(curAz);

    if (leaf2Az->curConn) {
        CheckStopAzByArbitrate(leaf1Az, curAz, nodeIdx);
        return;
    }

    write_runlog(LOG, "%s and %s is disConnected, can not check and do stop %s.\n",
        leaf2Az->azName, curAz->azName, leaf1Az->azName);
}



/*
 * Stop either the whole current AZ or only the instances of the current node.
 *
 * - curAz->curConn set: stop the current AZ and create the SINGLEAZ_TYPE flag file.
 * - curAz->curConn not set: stop the instances of the current node and create the SINGLENODE_TYPE
 *   flag file.
 *
 * In both cases nothing is done when the corresponding flag file already exists. A failure to
 * create the flag file is logged but does not prevent the final log message.
 */
static void DoStopCurNodeOrCurAz(const ConnCheck *curAz)
{
    if (curAz->curConn) {
        if (CheckStopFileExist(SINGLEAZ_TYPE)) {
            return;
        }
        StartOrStopAZ(STOP_AZ, curAz->azName);
        if (CreateStopNodeInstancesFlagFile(SINGLEAZ_TYPE) == -1) {
            write_runlog(ERROR, "Create stop cms node FlagFile failed.\n");
        }
        write_runlog(LOG, "The current az(%s) is isolated and it is stopped.\n", curAz->azName);
        return;
    }

    if (CheckStopFileExist(SINGLENODE_TYPE)) {
        return;
    }
    StartOrStopNodeInstanceByCommand(STOP_AZ, g_currentNode->node);
    if (CreateStopNodeInstancesFlagFile(SINGLENODE_TYPE) == -1) {
        write_runlog(ERROR, "Create stop cms node FlagFile failed.\n");
    }
    write_runlog(LOG, "The %s CMS is disconnected, and the ping result is %d.\n", curAz->azName, curAz->curConn);
}



/*
 * Start/stop handling used by the caller when the local CM server role is not primary.
 *
 * - Neither leaf1Az nor leaf2Az has curConn set: stop the current node or AZ.
 * - Both have curConn set: start the current node or AZ again.
 * - Only one of them has curConn set: nothing is done.
 */
static void CheckStartAndStopInStandby(const ConnCheck *leaf1Az, const ConnCheck *leaf2Az, const ConnCheck *curAz)
{
    const bool leaf1Up = leaf1Az->curConn ? true : false;
    const bool leaf2Up = leaf2Az->curConn ? true : false;

    if (leaf1Up && leaf2Up) {
        DoStartCurNodeOrCurAz(curAz);
    } else if (!leaf1Up && !leaf2Up) {
        DoStopCurNodeOrCurAz(curAz);
    }
}



void *BothAzConnectStateCheckMain(void *arg)

{

    if (GetAzDeploymentType(false) != TWO_AZ_DEPLOYMENT) {

        write_runlog(LOG, "BothAzConnectStateCheckMain exit.\n");

        return NULL;

    }



    int arbitNodeIdx = GetNodeIndexByAzRole(AZArbiter);

    if (arbitNodeIdx == -1) {

        write_runlog(ERROR, "can not get node in az3, BothAzConnectStateCheckMain exit.\n");

        return NULL;

    }

    thread_name = "BothAzCheck";

    uint32 cnt = g_loopState.count;

    g_loopState.count++;

    g_loopState.execStatus[cnt] = 1;

    write_runlog(LOG, "[reload] BothAzConnectStateCheckMain thread loop-index:%u.\n", cnt);



    ConnCheck leaf1Az = {0};

    ConnCheck leaf2Az = {0};

    ConnCheck curAz = {0};

    bool res = InitConnCheck(&leaf1Az, &leaf2Az, &curAz, arbitNodeIdx);

    if (!res) {

        write_runlog(ERROR, "can not InitConnCheck, BothAzConnectStateCheckMain exit.\n");

        return NULL;

    }

    int32 checkTimes = 0;

    AzPingCheckRes pingRes = CONTINUE_EXECTING;

    for (;;) {

        if (g_inReload) {

            cm_sleep(AZ_START_STOP_INTERVEL);

            continue;

        }

        g_loopState.execStatus[cnt] = 0;

        pingRes = AzAndInnerConnectCheck(&leaf1Az, &leaf2Az, &curAz, &checkTimes);

        if (pingRes == WAIT_NEXT_TIME) {

            g_loopState.execStatus[cnt] = 1;

            cm_sleep(AZ_START_STOP_INTERVEL);

            continue;

        }

        if (g_HA_status->local_role == CM_SERVER_PRIMARY) {

            CheckStartAndStopInPrimary(&leaf1Az, &curAz, &leaf2Az, arbitNodeIdx);

            g_loopState.execStatus[cnt] = 1;

            cm_sleep(AZ_START_STOP_INTERVEL);

            continue;

        }

        CheckStartAndStopInStandby(&leaf1Az, &leaf2Az, &curAz);

        g_loopState.execStatus[cnt] = 1;

        cm_sleep(AZ_START_STOP_INTERVEL);

    }

}