* Copyright (c) 2021 Huawei Technologies Co.,Ltd.
*
* CM is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
* -------------------------------------------------------------------------
*
* cms_az.cpp
* cms az functions
*
* IDENTIFICATION
* src/cm_server/cms_az.cpp
*
* -------------------------------------------------------------------------
*/
#include "cm/cm_elog.h"
#include "cms_global_params.h"
#include "cms_ddb.h"
#include "cms_process_messages.h"
#include "cms_common.h"
#include "cms_az.h"
#include "cm_ip.h"
static uint32 GetCurrentAZnodeNum(const char *azName);
static void StartOrStopInstanceByCommand(OperateType operateType, uint32 node, const char *instanceDataPath,
int32 timeOut = 0);
static bool IsCnDeleted(uint32 nodeId);
static void DoMultiAzStartDecision(
bool isLeaf1AZConnectOK, const char *leaf1AzName, bool isLeaf2AZConnectOK, const char *leaf2AzName);
static void DoMultiAzStopDecision(bool isLeaf1AZConnectOK, const char *leaf1AzName, bool isLeaf2AZConnectOK,
const char *leaf2AzName, bool isCmsConnectOK);
static bool SetIsolatedAzToDdb(const char *disconAzName, const char *conAzName);
static void DdbKeyOfAzConnectStatus(
const char *azName, char *azConnectStatusKey, uint32 keyLen, const char *peerAzName);
static bool SetOrGetDdbKeyValueOfAzConnectStatus(
DdbOperateType ddbOperateType, const char *key, int value, bool *operResult);
static void StartCmsNodeInstances(bool isLeaf1AZConnectOK, bool isLeaf2AZConnectOK);
static void CleanMultiConnState(const char *azName1, const char *azName2);
DdbConn g_dbConn = {0};
const int32 TRY_TIMES = 3;
const int DELAY_TIME_TO_AUTO_SWITCHOVER = 3;
az_role_string az_role_map_string[] = {{"AZ1", AZMaster}, {"AZ2", AZSlave}, {"AZ3", AZArbiter}};
* @brief Set the Stop Az Flag To Ddb object
*
* @param azRole My Param doc
* @param stopFlag My Param doc
* @return true
* @return false
*/
bool SetStopAzFlagToDdb(AZRole azRole, bool stopFlag)
{
char status_key[MAX_PATH_LEN] = {0};
char value[DDB_MIN_VALUE_LEN] = {0};
int rc = snprintf_s(status_key,
MAX_PATH_LEN,
MAX_PATH_LEN - 1,
"/%s/CMServer/StopAz/%s",
pw->pw_name,
az_role_map_string[azRole].role_string);
securec_check_intval(rc, (void)rc);
rc = snprintf_s(value, DDB_MIN_VALUE_LEN, DDB_MIN_VALUE_LEN - 1, "%d", (int)stopFlag);
securec_check_intval(rc, (void)rc);
int tryTimes = 3;
DdbConn ddbConn = g_dbConn;
status_t st = CM_SUCCESS;
do {
st = SetKVWithConn(&ddbConn, status_key, MAX_PATH_LEN, value, DDB_MIN_VALUE_LEN);
if (st != CM_SUCCESS) {
write_runlog(ERROR, "Ddb set failed. key=%s, value=%s.\n", status_key, value);
cm_sleep(1);
}
tryTimes--;
} while (st != CM_SUCCESS && tryTimes > 0);
write_runlog(LOG, "Ddb set key=%s, value=%s, result=%d.\n", status_key, value, st);
return (st == CM_SUCCESS);
}
* @brief Get the Stop Az Flag From Ddb object
*
* @param azRole My Param doc
* @return true
* @return false
*/
bool GetStopAzFlagFromDdb(AZRole azRole)
{
int rc;
char status_key[MAX_PATH_LEN] = {0};
char value[DDB_MIN_VALUE_LEN] = {0};
rc = snprintf_s(status_key, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/%s/CMServer/StopAz/%s",
pw->pw_name, az_role_map_string[azRole].role_string);
securec_check_intval(rc, (void)rc);
DdbOption option = {SUCCESS_GET_VALUE, DEBUG1};
int32 tryTimes = TRY_TIMES;
status_t st = CM_SUCCESS;
static DDB_RESULT lastDdbResult = SUCCESS_GET_VALUE;
int32 logLevel = DEBUG1;
do {
st = GetKVConAndLog(&g_dbConn, status_key, value, DDB_MIN_VALUE_LEN, &option);
if (option.ddbResult != lastDdbResult) {
lastDdbResult = option.ddbResult;
logLevel = LOG;
}
if (option.ddbResult == CAN_NOT_FIND_THE_KEY) {
write_runlog(logLevel, "get stop az(%s) flag from Ddb, message is: %d\n",
status_key, (int)option.ddbResult);
break;
}
if (st != CM_SUCCESS) {
cm_sleep(1);
--tryTimes;
}
} while (st != CM_SUCCESS && tryTimes > 0);
if (st != CM_SUCCESS) {
logLevel = (option.ddbResult == CAN_NOT_FIND_THE_KEY) ? DEBUG1 : LOG;
write_runlog(logLevel, "get stop az(%s) flag from Ddb: %d\n", status_key, (int)option.ddbResult);
return false;
}
if (strtol(value, NULL, 10) == 1) {
return true;
}
return false;
}
* @brief
*
* @param azPriority My Param doc
* @param azRole My Param doc
* @return true
* @return false
*/
bool isAZPrioritySatisfyAZRole(uint32 azPriority, AZRole azRole)
{
bool result = false;
switch (azRole) {
case AZMaster:
result = (azPriority >= g_az_master && azPriority < g_az_slave);
break;
case AZSlave:
result = (azPriority >= g_az_slave && azPriority < g_az_arbiter);
break;
case AZArbiter:
result = (azPriority >= g_az_arbiter);
break;
default:
break;
}
return result;
}
* @brief Get the Node Index By Az Role object
*
* @param azRole My Param doc
* @return int
*/
int GetNodeIndexByAzRole(AZRole azRole)
{
uint32 node_index;
for (node_index = 0; node_index < g_node_num; node_index++) {
if (isAZPrioritySatisfyAZRole(g_node[node_index].azPriority, azRole)) {
break;
}
}
if (node_index == g_node_num) {
write_runlog(ERROR, "can not get node for az%d.\n", (azRole + 1));
return -1;
}
return (int)node_index;
}
* @brief
*
* @param azRole My Param doc
*/
void StartAZ(AZRole azRole)
{
char startAzCmd[MAX_PATH_LEN] = {0};
int node_index = GetNodeIndexByAzRole(azRole);
if (node_index == -1) {
write_runlog(ERROR, "StartAZ: can not get node for az%d.\n", (azRole + 1));
return;
}
int rc = snprintf_s(startAzCmd, MAX_PATH_LEN, MAX_PATH_LEN - 1,
"nohup cm_ctl start -z %s > /dev/null 2>&1 &",
g_node[node_index].azName);
securec_check_intval(rc, (void)rc);
rc = system(startAzCmd);
if (rc != 0) {
write_runlog(ERROR, "StartAZ failed: %s, errnum=%d, errno=%d.\n", startAzCmd, rc, errno);
} else {
write_runlog(LOG, "StartAZ success: %s.\n", startAzCmd);
}
}
* @brief
*
* @param ArbiterAZIp My Param doc
* @param azRole My Param doc
*/
void StopAZ(const char *ArbiterAZIp, AZRole azRole)
{
char stopAzCmd[MAX_PATH_LEN] = {0};
int node_index = GetNodeIndexByAzRole(azRole);
if (node_index == -1) {
write_runlog(ERROR, "StopAZ: can not get node for az%d.\n", (azRole + 1));
return;
}
int rc = snprintf_s(stopAzCmd, MAX_PATH_LEN, MAX_PATH_LEN - 1,
"pssh %s -s -H %s \"nohup cm_ctl stop -z %s > /dev/null 2>&1 &\" ",
PSSH_TIMEOUT, ArbiterAZIp, g_node[node_index].azName);
securec_check_intval(rc, (void)rc);
rc = system(stopAzCmd);
if (rc != 0) {
write_runlog(ERROR, "StopAZ failed: %s, errnum=%d, errno=%d.\n", stopAzCmd, rc, errno);
} else {
write_runlog(LOG, "StopAZ success: %s.\n", stopAzCmd);
}
}
* @brief
*
* @param sshIp My Param doc
* @param azRole My Param doc
* @return true
* @return false
*/
bool doPingAzNodes(const char *sshIp, AZRole azRole)
{
uint32 i = 0;
char pingCommand[CM_MAX_COMMAND_LEN] = {0};
int rc = 0;
int tryTimes = AZ1_AZ2_CONNECT_PING_TRY_TIMES;
do {
for (i = 0; i < g_node_num; i++) {
if (isAZPrioritySatisfyAZRole(g_node[i].azPriority, azRole)) {
const char *pingStr = GetPingStr(GetIpVersion(g_node[i].cmAgentIP[0]));
if (sshIp != NULL) {
rc = snprintf_s(pingCommand, CM_MAX_COMMAND_LEN, CM_MAX_COMMAND_LEN - 1,
"pssh %s -s -H %s \"%s %s %s\" ",
PSSH_TIMEOUT, sshIp, pingStr, g_node[i].cmAgentIP[0], PING_TIMEOUT_OPTION);
} else {
rc = snprintf_s(pingCommand, CM_MAX_COMMAND_LEN, CM_MAX_COMMAND_LEN - 1,
"%s %s %s", pingStr, g_node[i].cmAgentIP[0], PING_TIMEOUT_OPTION);
}
securec_check_intval(rc, (void)rc);
rc = system(pingCommand);
if (rc != 0) {
write_runlog(ERROR,
"Execute %s failed: system result is %d, shell result is %d, errno=%d.\n",
pingCommand, rc, WEXITSTATUS(rc), errno);
} else {
return true;
}
}
}
cm_sleep(2);
tryTimes--;
} while (tryTimes > 0);
return false;
}
* @brief do ssh az3 ssh az1 to check start file
*
* @param sshIp My Param doc
* @param azRole My Param doc
* @return true
* @return false
*/
bool doCheckAzStatus(const char *sshIp, AZRole azRole)
{
int rc = 0;
int count = 0;
int totalCount = 0;
char checkStartFile[CM_MAX_COMMAND_LONG_LEN] = {0};
for (uint32 i = 0; i < g_node_num; i++) {
const char *ping_ip = g_node[i].cmAgentIP[0];
const char *pingStr = GetPingStr(GetIpVersion(ping_ip));
if (isAZPrioritySatisfyAZRole(g_node[i].azPriority, azRole)) {
if (sshIp == NULL) {
rc = snprintf_s(checkStartFile,
CM_MAX_COMMAND_LONG_LEN,
CM_MAX_COMMAND_LONG_LEN - 1,
"%s %s %s;if [ $? == 0 ];then pssh %s -s -H %s \"ls %s \";fi;",
pingStr,
ping_ip,
PING_TIMEOUT_OPTION,
PSSH_TIMEOUT,
g_node[i].sshChannel[0],
g_cmManualStartPath);
} else {
rc = snprintf_s(checkStartFile,
CM_MAX_COMMAND_LONG_LEN,
CM_MAX_COMMAND_LONG_LEN - 1,
"%s %s %s;"
"if [ $? == 0 ];then pssh %s -s -H [%s] \" "
" echo '%s %s %s;if [ $\"\"? -eq 0 ];then touch %s/azIpcheck.flag;fi;' > %s/azIpcheck.sh;"
" sh %s/azIpcheck.sh;rm -f %s/azIpcheck.sh;"
" if test -e %s/azIpcheck.flag;then rm -f %s/azIpcheck.flag;pssh %s -s -H %s \"ls %s \";fi;"
"\";fi;",
pingStr,
sshIp,
PING_TIMEOUT_OPTION,
PSSH_TIMEOUT,
sshIp,
pingStr,
ping_ip,
PING_TIMEOUT_OPTION,
sys_log_path,
sys_log_path,
sys_log_path,
sys_log_path,
sys_log_path,
sys_log_path,
PSSH_TIMEOUT,
g_node[i].sshChannel[0],
g_cmManualStartPath);
}
securec_check_intval(rc, (void)rc);
rc = system(checkStartFile);
if (rc != -1 && WEXITSTATUS(rc) == 0) {
write_runlog(LOG, "Execute %s may success, start file exist.\n", checkStartFile);
} else if (rc != -1 && WEXITSTATUS(rc) != 0) {
write_runlog(DEBUG1,
"Execute %s may failed, start file don't exist, system result is %d, shell result is %d,"
" errno=%d.\n",
checkStartFile,
rc,
WEXITSTATUS(rc),
errno);
count++;
} else {
write_runlog(LOG, "Execute %s failed, system result is %d.\n", checkStartFile, rc);
}
if (count > AZ1_AND_AZ2_CHECK_SUCCESS_NODE_LIMIT) {
break;
}
totalCount++;
}
}
if (count > totalCount / 2 || count > AZ1_AND_AZ2_CHECK_SUCCESS_NODE_LIMIT) {
return true;
}
return false;
}
static int32 UpdateAzDnCount(int *azDnCount, int32 len, int32 azIndex)
{
if (azIndex >= len || azIndex < 0) {
return -1;
}
++(*(azDnCount + azIndex));
return 0;
}
bool GetAzIndexByGroupAndMemberIdx(int32 *azIndex, bool inCurSyncList, bool isVoteAz, uint32 groupIdx, int32 memIdx)
{
cm_instance_role_status *dnRole = &(g_instance_role_group_ptr[groupIdx].instanceMember[memIdx]);
if (g_only_dn_cluster && strlen(dnRole->azName) == 0) {
*azIndex = AZ1_INDEX;
return true;
}
bool doResult = (isVoteAz && IsCurInstanceInVoteAz(groupIdx, memIdx));
if (doResult) {
return false;
}
doResult = IsInstanceIdInSyncList(
dnRole->instanceId, &(g_instance_group_report_status_ptr[groupIdx].instance_status.currentSyncList));
if (inCurSyncList && !doResult) {
return false;
}
uint32 priority = dnRole->azPriority;
if (priority < g_az_master) {
write_runlog(ERROR, "Invalid priority: az name is %s, priority=%u.\n", dnRole->azName, priority);
*azIndex = -1;
} else if (priority >= g_az_master && priority < g_az_slave) {
*azIndex = AZ1_INDEX;
} else if (priority >= g_az_slave && priority < g_az_arbiter) {
*azIndex = AZ2_INDEX;
} else {
*azIndex = AZ3_INDEX;
}
return true;
}
* @brief Get the Dn Count Of A Z object
*
* @param azDnCount My Param doc
* @return int
*/
int GetDnCountOfAZ(int *azDnCount, int32 len, bool inCurSyncList, bool isVoteAz)
{
if (!g_multi_az_cluster) {
return -1;
}
bool doResult = false;
for (uint32 i = 0; i < g_dynamic_header->relationCount; i++) {
if (g_instance_role_group_ptr[i].instanceMember[0].instanceType != INSTANCE_TYPE_DATANODE) {
continue;
}
for (int32 j = 0; j < g_instance_role_group_ptr[i].count; ++j) {
int32 azIndex = 0;
doResult = GetAzIndexByGroupAndMemberIdx(&azIndex, inCurSyncList, isVoteAz, i, j);
if (!doResult) {
continue;
}
if (UpdateAzDnCount(azDnCount, len, azIndex) != 0) {
return -1;
}
}
}
return 0;
}
* @brief
*
* @return true
* @return false
*/
int GetAzDeploymentType(bool isVoteAz)
{
if (!g_multi_az_cluster) {
return (int)UNKNOWN_AZ_DEPLOYMENT;
}
int azDnCount[AZ_MEMBER_MAX_COUNT] = {0, 0, 0};
int ret = GetDnCountOfAZ(azDnCount, AZ_MEMBER_MAX_COUNT, false, isVoteAz);
write_runlog(DEBUG1,
"GetDnCountOfAZ: ret=%d, Az1DnCount=%d, Az2DnCount=%d, Az3DnCount=%d.\n",
ret,
azDnCount[AZ1_INDEX],
azDnCount[AZ2_INDEX],
azDnCount[AZ3_INDEX]);
if (ret == -1) {
return (int)UNKNOWN_AZ_DEPLOYMENT;
}
if (azDnCount[AZ1_INDEX] > 0 && azDnCount[AZ2_INDEX] > 0 && azDnCount[AZ3_INDEX] == 0) {
return (int)TWO_AZ_DEPLOYMENT;
} else if (azDnCount[AZ1_INDEX] > 0 && azDnCount[AZ2_INDEX] > 0 && azDnCount[AZ3_INDEX] > 0) {
return (int)THREE_AZ_DEPLOYMENT;
}
return (int)UNKNOWN_AZ_DEPLOYMENT;
}
* @brief PingIpThrdFuncMain: thread main function which is used to ping other AZ
*
* @param arg: thread parameter
*
* @return void*
*/
static void *PingIpThrdFuncMain(void *arg)
{
char command[MAXPGPATH] = {0};
char buf[MAXPGPATH];
PingCheckThreadParmInfo *info = (PingCheckThreadParmInfo *)arg;
if (info == NULL) {
write_runlog(ERROR, "PingIpThrdFuncMain: invalid argument (info is NULL)\n");
return NULL;
}
uint32 threadIndex = info->threadIdx;
uint32 nodeIndex = 0;
int ret = find_node_index_by_nodeid(info->azNode, &nodeIndex);
int rc;
if (ret != 0) {
write_runlog(ERROR, "PingIpThrdFuncMain: get node index failed!\n");
return NULL;
}
const char *pingStr = GetPingStr(GetIpVersion(g_node[nodeIndex].cmAgentIP[0]));
rc = snprintf_s(command,
MAXPGPATH,
MAXPGPATH - 1,
"%s -c 1 -w 1 %s > /dev/null;if [ $? == 0 ];then echo success;else echo fail;fi;",
pingStr, g_node[nodeIndex].cmAgentIP[0]);
securec_check_intval(rc, (void)rc);
write_runlog(DEBUG1, "ping command is %s.\n", command);
FILE *fp = popen(command, "r");
if (fp == NULL) {
write_runlog(ERROR, "popen failed\n.");
return NULL;
}
if (fgets(buf, sizeof(buf), fp) != NULL) {
if (strstr(buf, "success") != NULL) {
info->pingResultArrayRef[threadIndex] = 1;
} else {
info->pingResultArrayRef[threadIndex] = 0;
}
}
(void)pclose(fp);
return NULL;
}
* @brief CheckPingReulst: Check the ping results
*
* @param pingResultArray: a array for storing the ping results of each thread
* @param pthreadNum: the num of thread
*
* @return bool
*/
static bool CheckPingReulst(const uint32 *pingResultArray, uint32 pthreadNum)
{
bool checkResult = false;
for (uint32 i = 0; i < pthreadNum; i++) {
if (pingResultArray[i] > 0) {
checkResult = true;
break;
}
}
return checkResult;
}
* @brief MulAzThread: Create multiple thread according to the pthreadNum
*
* @param pthreadNum: the num of thread
* @param azNodes: nodes array of the az
*
* @return bool
*/
bool MulAzThread(const uint32 pthreadNum, const uint32* azNodes)
{
int err = 0;
pthread_t thr_id[MAX_PING_NODE_NUM];
uint32 threadIndex;
uint32 pingCheckResult[MAX_PING_NODE_NUM];
for (uint32 ii = 0; ii < MAX_PING_NODE_NUM; ii++) {
pingCheckResult[ii] = 0;
}
PingCheckThreadParmInfo pthreadInfo[MAX_PING_NODE_NUM];
for (uint32 j = 0; j < pthreadNum; j++) {
pthreadInfo[j].azNode = azNodes[j];
pthreadInfo[j].threadIdx = j;
pthreadInfo[j].pingResultArrayRef = pingCheckResult;
}
for (threadIndex = 0; threadIndex < pthreadNum; threadIndex++) {
err = pthread_create(&thr_id[threadIndex], NULL, PingIpThrdFuncMain, &pthreadInfo[threadIndex]);
if (err != 0) {
write_runlog(ERROR, "create thread failed.\n");
return true;
} else {
write_runlog(DEBUG1, "create thread successfully.\n");
}
}
for (threadIndex = 0; threadIndex < pthreadNum; threadIndex++) {
(void)pthread_join(thr_id[threadIndex], NULL);
}
if (CheckPingReulst(pingCheckResult, pthreadNum)) {
return true;
} else {
return false;
}
}
* Get nodes(tempNodesArray) in AZ(azName)
* @azName: AZ
* @tempNodesArray : nodes in AZ
*
*/
static void GetAzNodes(const char *azName, uint32 *tempNodesArray, uint32 arrLen)
{
uint32 azNodesArrray[arrLen];
size_t len = sizeof(uint32) * arrLen;
errno_t ret = memset_s(azNodesArrray, len, 0, len);
securec_check_errno(ret, (void)ret);
for (uint32 azIndex = 0; azIndex < g_azNum; azIndex++) {
if (strcmp(azName, g_azArray[azIndex].azName) != 0) {
continue;
}
uint32 azNodeIndex = 0;
uint32 nodeIdx = 0;
while (g_azArray[azIndex].nodes[azNodeIndex] != 0) {
if (g_azArray[azIndex].nodes[azNodeIndex] == g_currentNode->node) {
azNodeIndex++;
continue;
}
azNodesArrray[nodeIdx] = g_azArray[azIndex].nodes[azNodeIndex];
azNodeIndex++;
nodeIdx++;
if (azNodeIndex >= CM_NODE_MAXNUM) {
break;
}
}
break;
}
if (arrLen > MAX_PING_NODE_NUM) {
srand((unsigned int)time(0));
for (uint32 nodeIndex = 0; nodeIndex < MAX_PING_NODE_NUM; nodeIndex++) {
tempNodesArray[nodeIndex] = azNodesArrray[rand() % MAX_PING_NODE_NUM];
}
} else {
ret = memcpy_s(tempNodesArray, sizeof(azNodesArrray), azNodesArrray, sizeof(azNodesArrray));
securec_check_errno(ret, (void)ret);
}
}
* Get the number of nodes in current AZ
* @azName: AZ
* @currentAzNodeNum : the num of nodes in current AZ
*
*/
static uint32 GetCurrentAZnodeNum(const char *azName)
{
uint32 currentAzNodeNum = 0;
azInfo *targetAz = NULL;
uint32 nodeIdx = 0;
for (uint32 ii = 0; ii < g_azNum; ii++) {
if (strcmp(azName, g_azArray[ii].azName) == 0) {
targetAz = &g_azArray[ii];
break;
}
}
if (targetAz == NULL) {
write_runlog(ERROR, "We cannot find the target AZ(%s).\n", azName);
return 0;
}
while (targetAz->nodes[nodeIdx] != 0) {
currentAzNodeNum++;
nodeIdx++;
if (nodeIdx >= CM_NODE_MAXNUM) {
break;
}
}
return currentAzNodeNum;
}
* Get CMS node in Az
*/
static void GetCmsNode(const char *azName, uint32 *cmsNodeArray, uint32 arrLen)
{
uint32 ii;
uint32 jj;
uint32 kk;
uint32 currentAzNodeNum = GetCurrentAZnodeNum(azName);
uint32 nodeIndex;
for (kk = 0; kk < g_azNum; kk++) {
if (strcmp(azName, g_azArray[kk].azName) == 0) {
break;
}
}
uint32 cmsNodeIdx = 0;
for (ii = 0; ii < currentAzNodeNum; ii++) {
int ret = find_node_index_by_nodeid(g_azArray[kk].nodes[ii], &nodeIndex);
if (ret != 0) {
write_runlog(ERROR, "GetCmsNode: get node index failed!\n");
return;
}
for (jj = 0; jj < CM_IP_NUM; jj++) {
if (strcmp(g_node[nodeIndex].cmAgentIP[0], g_node[nodeIndex].cmServer[jj]) != 0 ||
strcmp(g_node[nodeIndex].azName, azName) != 0) {
continue;
}
write_runlog(DEBUG1, "The cms node is %u in %s.\n", nodeIndex + 1, azName);
if (cmsNodeIdx >= arrLen) {
write_runlog(ERROR, "cmsNodeIdx(%u) is more than arrlen(%u).\n", cmsNodeIdx, arrLen);
break;
}
cmsNodeArray[cmsNodeIdx] = nodeIndex + 1;
cmsNodeIdx++;
}
}
return;
}
static uint32 GetCmsPrimaryAZ(char *azName)
{
uint32 cmsPrimaryNodeId;
char value[DDB_MIN_VALUE_LEN] = {0};
errno_t rc;
char primary_key[MAX_PATH_LEN] = {0};
uint32 ii;
uint32 tryTimes = 2;
uint32 currentAzNodeNum;
uint32 nodeIndex;
rc = snprintf_s(primary_key, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/%s/CMServer/primary_node_id", pw->pw_name);
securec_check_intval(rc, (void)rc);
DdbConn dbConn = g_dbConn;
DDB_RESULT dbResult = SUCCESS_GET_VALUE;
status_t st = CM_SUCCESS;
while (tryTimes > 0) {
st = GetKVWithCon(&dbConn, primary_key, value, DDB_MIN_VALUE_LEN, &dbResult);
if (st != CM_SUCCESS) {
write_runlog(ERROR, "/%s/CMServer/primary_node_id get Ddb error: %d\n", pw->pw_name, dbResult);
} else {
break;
}
tryTimes--;
}
if (st != CM_SUCCESS) {
return 0;
}
cmsPrimaryNodeId = (uint32)strtol(value, NULL, 10);
for (ii = 0; ii < g_azNum; ii++) {
currentAzNodeNum = GetCurrentAZnodeNum(g_azArray[ii].azName);
uint32 cmsNodeArray[CM_NODE_MAXNUM] = {0};
GetCmsNode(g_azArray[ii].azName, cmsNodeArray, CM_NODE_MAXNUM);
nodeIndex = 0;
while (cmsNodeArray[nodeIndex] != 0) {
if (cmsNodeArray[nodeIndex] == cmsPrimaryNodeId) {
rc = memcpy_s(azName, CM_AZ_NAME, g_azArray[ii].azName, CM_AZ_NAME);
write_runlog(LOG, "The cms(%u) primay az is %s.\n", cmsNodeArray[nodeIndex], azName);
securec_check_errno(rc, (void)rc);
return 1;
}
nodeIndex++;
if (nodeIndex >= currentAzNodeNum) {
break;
}
}
}
return 0;
}
* Do the operation of ping AZ
* @azName: AZ
*
*/
bool DoPingAz(const char *azName)
{
uint32 currenAzNodeNum;
int rc;
currenAzNodeNum = GetCurrentAZnodeNum(azName);
if (currenAzNodeNum == 0) {
return true;
}
* If the num of AZ-nodes more than 10, we create 10 pthreads to do the ping opereation.
* Otherwise we we create multiple pthreads to do the ping opereation according to the actual number of nodes.
*/
uint32 tempAzNodeNum;
if (currenAzNodeNum < MAX_PING_NODE_NUM) {
tempAzNodeNum = currenAzNodeNum;
} else {
tempAzNodeNum = MAX_PING_NODE_NUM;
}
* considering the result of ping self is always ok, we have not to ping self.
*/
if (strcmp(azName, g_currentNode->azName) == 0) {
currenAzNodeNum--;
tempAzNodeNum--;
if (tempAzNodeNum == 0) {
return true;
}
}
uint32 azNodes[tempAzNodeNum];
rc = memset_s(azNodes, sizeof(azNodes), 0, sizeof(azNodes));
securec_check_errno(rc, (void)rc);
GetAzNodes(azName, azNodes, currenAzNodeNum);
return MulAzThread(tempAzNodeNum, azNodes);
}
* Set the name of ddb-key about az connection status
* @azName: AZ
* @azConnectStatusKey: the ddb key of az connection status
* @peerAzName : peer az
*/
static void DdbKeyOfAzConnectStatus(const char *azName, char *azConnectStatusKey, uint32 keyLen, const char *peerAzName)
{
errno_t rcs;
char tempAzName[CM_AZ_NAME] = {0};
if (peerAzName == NULL) {
rcs = memcpy_s(tempAzName, CM_AZ_NAME, g_currentNode->azName, CM_AZ_NAME);
securec_check_errno(rcs, (void)rcs);
} else {
rcs = memcpy_s(tempAzName, CM_AZ_NAME, peerAzName, CM_AZ_NAME);
securec_check_errno(rcs, (void)rcs);
}
if (strcmp(tempAzName, azName) < 0) {
rcs = snprintf_s(azConnectStatusKey, keyLen, keyLen - 1, "%sAnd%s", tempAzName, azName);
securec_check_intval(rcs, (void)rcs);
} else if (strcmp(tempAzName, azName) > 0) {
rcs = snprintf_s(azConnectStatusKey, keyLen, keyLen - 1, "%sAnd%s", azName, tempAzName);
securec_check_intval(rcs, (void)rcs);
}
return;
}
static bool SetDdbKeyValueOfAzConnectStatus(const char *key, int value)
{
errno_t rc;
char azConnectStatusKey[MAX_PATH_LEN] = {0};
char azConnectStatusValue[DDB_MIN_VALUE_LEN] = {0};
rc = snprintf_s(azConnectStatusKey, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/%s/%s", pw->pw_name, key);
securec_check_intval(rc, (void)rc);
rc = snprintf_s(azConnectStatusValue, DDB_MIN_VALUE_LEN, DDB_MIN_VALUE_LEN - 1, "%d", value);
securec_check_intval(rc, (void)rc);
int32 tryTimes = TRY_TIMES;
status_t st = CM_SUCCESS;
do {
st = SetKVWithConn(&g_dbConn, azConnectStatusKey, MAX_PATH_LEN, azConnectStatusValue, DDB_MIN_VALUE_LEN);
if (st != CM_SUCCESS) {
cm_sleep(1);
--tryTimes;
}
} while (st != CM_SUCCESS && tryTimes > 0);
if (st != CM_SUCCESS) {
write_runlog(ERROR,
"ddb set(SetOnlineStatusToDdb) failed. key=%s, value=%s.\n",
azConnectStatusKey,
azConnectStatusValue);
} else {
write_runlog(DEBUG1,
"ddb set(SetOnlineStatusToDdb) successfully. key=%s, value=%s.\n",
azConnectStatusKey,
azConnectStatusValue);
return true;
}
return false;
}
static bool GetDdbKeyValueOfAzConnectStatus(const char *key, int32 value, bool *operResult)
{
DDB_RESULT dbResult = SUCCESS_GET_VALUE;
char azConnectStatusKey[MAX_PATH_LEN] = {0};
char azConnectStatusValue[DDB_MIN_VALUE_LEN] = {0};
errno_t rc = snprintf_s(azConnectStatusKey, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/%s/%s", pw->pw_name, key);
securec_check_intval(rc, (void)rc);
status_t st = CM_SUCCESS;
int32 tryTimes = TRY_TIMES;
do {
st = GetKVWithCon(&g_dbConn, azConnectStatusKey, azConnectStatusValue, DDB_MIN_VALUE_LEN, &dbResult);
if (dbResult == CAN_NOT_FIND_THE_KEY) {
break;
}
if (st != CM_SUCCESS) {
cm_sleep(1);
--tryTimes;
}
} while (st != CM_SUCCESS && tryTimes > 0);
if (st != CM_SUCCESS) {
write_runlog(ERROR,
"ddb get(SetOnlineStatusToDdb) failed. key=%s, value=%s, %d.\n",
azConnectStatusKey,
azConnectStatusValue,
dbResult);
*operResult = false;
return false;
} else {
write_runlog(DEBUG1,
"ddb get(SetOnlineStatusToDdb) successfully. key=%s, value=%s.\n",
azConnectStatusKey,
azConnectStatusValue);
*operResult = true;
}
int32 tempValue = (int32)strtol(azConnectStatusValue, NULL, 10);
if (tempValue == value) {
write_runlog(LOG, "The azConnectStatusValue is %d\n", tempValue);
return true;
} else {
write_runlog(LOG, "The azConnectStatusValue is %d\n", tempValue);
return false;
}
}
* Set or Get the key of AzConnectStatus to ddb
* @ddbOperateType: Set or Get
* @Key: the ddb key of az connection status
* @value : the ddb value of az connection status
*/
static bool SetOrGetDdbKeyValueOfAzConnectStatus(
DdbOperateType ddbOperateType, const char *key, int value, bool *operResult)
{
if (ddbOperateType == SET_DDB_AZ) {
return SetDdbKeyValueOfAzConnectStatus(key, value);
} else if (ddbOperateType == GET_DDB_AZ) {
return GetDdbKeyValueOfAzConnectStatus(key, value, operResult);
}
write_runlog(ERROR, "We do not know the specific optrate.\n");
return false;
}
* Set isolated az connect status to ddb
* @ddbOperateType: Set or Get
* @Key: the ddb key of az connection status
* @value : the ddb value of az connection status
*/
static bool SetIsolatedAzToDdb(const char *disconAzName, const char *conAzName)
{
* If the AZ is isolated, we cannot write key-value to ddb.
* We use other normal AZ to record its connection status.
*/
int rc;
char keyOfMulAzConnectStatus[MAX_PATH_LEN] = {0};
bool isSetOk;
bool isGetOk1 = false;
bool isGetOk2 = false;
int value;
static uint32 consistentTimes = 0;
static bool lastLeaf1AzSetted = false;
static bool lastLeaf2AzSetted = false;
const uint32 maxConsistentTimes = 5;
cm_sleep(AZ_START_STOP_INTERVEL);
DdbKeyOfAzConnectStatus(disconAzName, keyOfMulAzConnectStatus, MAX_PATH_LEN, NULL);
bool isLeaf1AzSetted = SetOrGetDdbKeyValueOfAzConnectStatus(GET_DDB_AZ, keyOfMulAzConnectStatus, 1, &isGetOk1);
rc = memset_s(keyOfMulAzConnectStatus, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
DdbKeyOfAzConnectStatus(disconAzName, keyOfMulAzConnectStatus, MAX_PATH_LEN, conAzName);
bool isLeaf2AzSetted = SetOrGetDdbKeyValueOfAzConnectStatus(GET_DDB_AZ, keyOfMulAzConnectStatus, 1, &isGetOk2);
if (isLeaf1AzSetted == lastLeaf1AzSetted && isLeaf2AzSetted == lastLeaf2AzSetted) {
consistentTimes++;
} else {
write_runlog(LOG, "[%s] Leaf1AzSetted[%d:%d] Leaf2AzSetted[%d:%d], ConsistentTimes is %u.\n", __FUNCTION__,
isLeaf1AzSetted, lastLeaf1AzSetted, isLeaf2AzSetted, lastLeaf2AzSetted, consistentTimes);
lastLeaf1AzSetted = isLeaf1AzSetted;
lastLeaf2AzSetted = isLeaf2AzSetted;
consistentTimes = 0;
}
if (consistentTimes < maxConsistentTimes) {
return false;
}
write_runlog(LOG, "[%s] Leaf1AzSetted[%d] Leaf2AzSetted[%d].\n", __FUNCTION__, isLeaf1AzSetted, isLeaf2AzSetted);
if (isLeaf1AzSetted && isLeaf2AzSetted) {
value = 1;
} else if (isGetOk1 && isGetOk2) {
value = 0;
} else {
write_runlog(ERROR, "Can't get edge status value from ddb.\n");
return false;
}
isSetOk = SetOrGetDdbKeyValueOfAzConnectStatus(SET_DDB_AZ, disconAzName, value, NULL);
if (!isSetOk) {
write_runlog(ERROR, "Set the isolated AZ status failed, value %d.\n", value);
return false;
}
write_runlog(LOG, "Set the isolated AZ status successfully, value %d.\n", value);
return true;
}
* Create stop node flag file when CMS node is stoppped
*/
int CreateStopNodeInstancesFlagFile(int type)
{
int rc;
int ret;
char exec_path[MAX_PATH_LEN] = {0};
char stopFlagFile[MAX_PATH_LEN] = {0};
char cmd[MAX_PATH_LEN] = {0};
if (GetHomePath(exec_path, sizeof(exec_path)) != 0) {
return -1;
}
if (type == SINGLENODE_TYPE) {
rc = snprintf_s(stopFlagFile, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/%s", exec_path, "node_instances_stop");
} else {
rc = snprintf_s(stopFlagFile, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/%s", exec_path, "az_node_instances_stop");
}
securec_check_intval(rc, (void)rc);
ret = snprintf_s(cmd, MAX_PATH_LEN, MAX_PATH_LEN - 1, "touch %s;chmod 600 %s", stopFlagFile, stopFlagFile);
securec_check_intval(ret, (void)ret);
ret = system(cmd);
if (ret != 0) {
write_runlog(ERROR, "CreateStopNodeInstancesFlagFile failed:%s, errnum=%d, errno=%d..\n", cmd, ret, errno);
return -1;
}
write_runlog(LOG, "CreateStopNodeInstancesFlagFile success: %s.\n", cmd);
return 0;
}
bool CheckStopFileExist(int type)
{
int rc = 0;
char exec_path[MAX_PATH_LEN] = {0};
char stopFlagFile[MAX_PATH_LEN] = {0};
if (GetHomePath(exec_path, sizeof(exec_path)) != 0) {
return false;
}
if (type == SINGLENODE_TYPE) {
rc = snprintf_s(stopFlagFile, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/%s", exec_path, "node_instances_stop");
} else if (type == SINGLEAZ_TYPE) {
rc = snprintf_s(stopFlagFile, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/%s", exec_path, "az_node_instances_stop");
}
securec_check_intval(rc, (void)rc);
struct stat stat_buf = {0};
if (stat(stopFlagFile, &stat_buf) == 0) {
return true;
}
return false;
}
static void CheckAndDoAzStop(const char *azName)
{
bool isGetOk = false;
cm_sleep(AZ_STOP_DELAY);
bool isAzStopped = SetOrGetDdbKeyValueOfAzConnectStatus(GET_DDB_AZ, azName, MULTIAZ_STOPPING_STATUS, &isGetOk);
if (!isGetOk) {
return;
}
if (isAzStopped) {
write_runlog(LOG, "AZ(%s) have been stopped, and we need not to stop the current AZ.\n", azName);
return;
}
StartOrStopAZ(STOP_AZ, g_currentNode->azName);
bool isSetOk =
SetOrGetDdbKeyValueOfAzConnectStatus(SET_DDB_AZ, g_currentNode->azName, MULTIAZ_STOPPING_STATUS, NULL);
if (!isSetOk) {
write_runlog(ERROR, "set ddb value failed.\n");
}
if (CreateStopNodeInstancesFlagFile(SINGLEAZ_TYPE) == -1) {
write_runlog(ERROR, "Create stop cms node FlagFile failed.\n");
}
write_runlog(LOG, "The current az(%s) has been stopped.\n", g_currentNode->azName);
return;
}
static void CheckNumAndDoAzStop(const char *leaf1AzName)
{
bool isGetOk = false;
bool isSetOk = false;
cm_sleep(AZ_STOP_DELAY);
bool isAzStopped = SetOrGetDdbKeyValueOfAzConnectStatus(GET_DDB_AZ, leaf1AzName, MULTIAZ_STOPPING_STATUS, &isGetOk);
if (!isGetOk) {
return;
}
if (isAzStopped) {
write_runlog(LOG, "AZ(%s) have been stopped, and we need not to stop the current AZ.\n", leaf1AzName);
return;
}
if (CurAzIsNeedToStop(leaf1AzName)) {
StartOrStopAZ(STOP_AZ, g_currentNode->azName);
isSetOk =
SetOrGetDdbKeyValueOfAzConnectStatus(SET_DDB_AZ, g_currentNode->azName, MULTIAZ_STOPPING_STATUS, NULL);
if (!isSetOk) {
write_runlog(ERROR, "set ddb value failed.\n");
}
} else {
write_runlog(LOG, "After check peer AZ status, Do not decide to stop the AZ(%s).\n", g_currentNode->azName);
return;
}
if (CreateStopNodeInstancesFlagFile(SINGLEAZ_TYPE) == -1) {
write_runlog(ERROR, "Create stop cms node FlagFile failed.\n");
}
write_runlog(LOG, "The current az(%s) has been stopped.\n", g_currentNode->azName);
return;
}
* Start Az instances when the az network connection restored
*/
static void DoMultiAzStopSingleEdge(
bool isLeaf1AZConnectOK, const char *leaf1AzName, bool isLeaf2AZConnectOK, const char *leaf2AzName)
{
uint32 ret;
char cmsPrimayAz[CM_AZ_NAME] = {0};
ret = GetCmsPrimaryAZ(cmsPrimayAz);
if (ret == 0) {
write_runlog(ERROR, "Cannot get cms-primary Az.\n");
return;
}
* In 3*AZ deployment (current_az, leaf1Az, leaf2Az)
*
* If leaf1Az or leaf2Az(disconnected) has been stopped, we do not have to further stop current_az,
* as there is no enough information to indicate current_az is "fully-isolated" fromother nodes.
*/
cm_sleep(AZ_START_STOP_INTERVEL);
bool cond1 = !isLeaf1AZConnectOK && isLeaf2AZConnectOK && (strcmp(cmsPrimayAz, leaf1AzName) == 0);
bool cond2 = isLeaf1AZConnectOK && !isLeaf2AZConnectOK && (strcmp(cmsPrimayAz, leaf2AzName) == 0);
bool cond3 = !isLeaf1AZConnectOK && isLeaf2AZConnectOK && (strcmp(cmsPrimayAz, leaf2AzName) == 0);
bool cond4 = isLeaf1AZConnectOK && !isLeaf2AZConnectOK && (strcmp(cmsPrimayAz, leaf1AzName) == 0);
if (cond1) {
CheckAndDoAzStop(leaf1AzName);
return;
} else if (cond2) {
CheckAndDoAzStop(leaf2AzName);
return;
} else if (cond3) {
CheckNumAndDoAzStop(leaf1AzName);
return;
} else if (cond4) {
CheckNumAndDoAzStop(leaf2AzName);
return;
} else {
write_runlog(LOG, "Do not need to stop any AZ.\n");
}
return;
}
* Start Az instances when the az network connection restored
*/
static void DoMultiAzStopDecision(bool isLeaf1AZConnectOK, const char *leaf1AzName, bool isLeaf2AZConnectOK,
const char *leaf2AzName, bool isCmsConnectOK)
{
* The AZ where the CMS is located is disconnected, we stop the current AZ.
*
* Perform ping detection on the current AZ where the current CMS node is located.
* If the current AZ has only one node (the node where the CMS is located),
* then the disconnection of the CMS is the disconnection of the current AZ, and we have not to ping self.
* Otherwise, we need to ping current AZ to check and deal with whether CMS-Node is disconnected
* or the current AZ is disconnected.
*/
if (CheckStopFileExist(SINGLEAZ_TYPE)) {
write_runlog(LOG, "az stop file exist, return.\n");
return;
}
if (!isLeaf1AZConnectOK && !isLeaf2AZConnectOK && !isCmsConnectOK) {
if (CheckStopFileExist(SINGLENODE_TYPE)) {
write_runlog(LOG, "node stop file exist, return.\n");
return;
}
StartOrStopNodeInstanceByCommand(STOP_AZ, g_currentNode->node);
if (CreateStopNodeInstancesFlagFile(SINGLENODE_TYPE) == -1) {
write_runlog(ERROR, "Create stop cms node FlagFile failed.\n");
}
write_runlog(
LOG, "The %s CMS is disconnected, and the ping result is %d.\n", g_currentNode->azName, isCmsConnectOK);
return;
} else if (!isLeaf1AZConnectOK && !isLeaf2AZConnectOK) {
StartOrStopAZ(STOP_AZ, g_currentNode->azName);
if (CreateStopNodeInstancesFlagFile(SINGLEAZ_TYPE) == -1) {
write_runlog(ERROR, "Create stop cms node FlagFile failed.\n");
}
write_runlog(LOG, "The current az(%s) is isolated and it is stopped.\n", g_currentNode->azName);
return;
}
DoMultiAzStopSingleEdge(isLeaf1AZConnectOK, leaf1AzName, isLeaf2AZConnectOK, leaf2AzName);
return;
}
* Reset the key-value of stopped Az to 0
*/
static void ResetStoppedAz()
{
int rc;
char exec_path[MAX_PATH_LEN] = {0};
char stopFlagFile[MAX_PATH_LEN] = {0};
struct stat stat_buf = {0};
if (GetHomePath(exec_path, sizeof(exec_path)) != 0) {
return;
}
rc = snprintf_s(stopFlagFile, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/%s", exec_path, "az_node_instances_stop");
securec_check_intval(rc, (void)rc);
if (stat(stopFlagFile, &stat_buf) == 0) {
if (unlink(stopFlagFile) != 0) {
write_runlog(ERROR, "delete cms-node stop instances flag file: %s failed.\n", stopFlagFile);
}
}
bool isSetOk = SetOrGetDdbKeyValueOfAzConnectStatus(SET_DDB_AZ, g_currentNode->azName, AZ_STATUS_RUNNING, NULL);
if (!isSetOk) {
write_runlog(ERROR, "Set the started AZ(%s) failed.\n", g_currentNode->azName);
} else {
write_runlog(DEBUG1, "Set the started AZ(%s) successfully.\n", g_currentNode->azName);
}
return;
}
static void CheckAzStoppedStatus(const char *azName, bool *isAzStopped)
{
bool isGetOk = false;
*isAzStopped = SetOrGetDdbKeyValueOfAzConnectStatus(GET_DDB_AZ, azName, AZ_STAUTS_STOPPED, &isGetOk);
if (isGetOk && !*isAzStopped) {
write_runlog(LOG, "AZ(%s) no stopped flag in ddb.\n", azName);
} else {
write_runlog(LOG, "Get AZ(%s) stopped flag in ddb.\n", azName);
}
return;
}
* Stop Az instances when the az network is disconnected
*/
static void DoMultiAzStartDecision(
bool isLeaf1AZConnectOK, const char *leaf1AzName, bool isLeaf2AZConnectOK, const char *leaf2AzName)
{
bool doStart = false;
bool isCurAzStopped = false;
bool isLeft1AzStopped = false;
bool isLeft2AzStopped = false;
bool isExist = CheckStopFileExist(SINGLEAZ_TYPE);
if (!isExist) {
write_runlog(LOG, "No stop file exist, There is no any AZ to start.\n");
return;
} else {
write_runlog(LOG, "Stop file exist, try to start Az(%s).\n", g_currentNode->azName);
}
CheckAzStoppedStatus(g_currentNode->azName, &isCurAzStopped);
CheckAzStoppedStatus(leaf1AzName, &isLeft1AzStopped);
CheckAzStoppedStatus(leaf2AzName, &isLeft2AzStopped);
* In 3*AZ deployment (current_az, leaf1Az, leaf2Az)
*
* If the connection of current Az with Leaf1Az and Leaf2Az is OK, we have to restart the current Az.
* Or, if the leaf1Az or leaf2Az(disconnected) has been stopped, we have to restart currentAz
* to ensure that only the isolated Az is stopped.
*/
char cmsPrimayAz[CM_AZ_NAME] = {0};
uint32 ret = GetCmsPrimaryAZ(cmsPrimayAz);
if (ret == 0) {
write_runlog(ERROR, "Cannot get cms-primary Az.\n");
return;
}
if (isLeaf1AZConnectOK && isLeaf2AZConnectOK) {
StartOrStopAZ(START_AZ, g_currentNode->azName);
ResetStoppedAz();
CleanMultiConnState(g_currentNode->azName, NULL);
doStart = true;
} else if ((!isLeaf1AZConnectOK && isLeaf2AZConnectOK) ||
(isLeaf1AZConnectOK && !isLeaf2AZConnectOK)) {
if ((strcmp(cmsPrimayAz, g_currentNode->azName) == 0) || isLeft1AzStopped || isLeft2AzStopped) {
StartOrStopAZ(START_AZ, g_currentNode->azName);
ResetStoppedAz();
CleanMultiConnState(g_currentNode->azName, NULL);
doStart = true;
} else {
write_runlog(ERROR,
"cmsPrimayAz is %s, leaf1AZ is (%d: %d), leaf2AZ is (%d: %d), "
"so the current Az(%s) cannot be restarted.\n",
cmsPrimayAz,
isLeaf1AZConnectOK,
isLeft1AzStopped,
isLeaf2AZConnectOK,
isLeft2AzStopped,
g_currentNode->azName);
return;
}
}
if (doStart) {
write_runlog(LOG, "The current Az(%s) is started.\n", g_currentNode->azName);
} else {
write_runlog(LOG, "waitting Az(%s) network recovery.\n", g_currentNode->azName);
}
return;
}
bool AzPingCheck(bool *preConnStatusAZ, const char *azName1)
{
bool isAZConnectOK = (cm_server_start_mode != MAJORITY_START) || DoPingAz(azName1);
if (!isAZConnectOK) {
write_runlog(LOG, "The %s is disconnected, and the ping result is %d.\n", azName1, isAZConnectOK);
} else if (cm_server_start_mode != MAJORITY_START) {
write_runlog(
DEBUG1, "The %s connected OK, cause start mode(%d) is not majority.\n", azName1, cm_server_start_mode);
} else {
write_runlog(DEBUG1, "The %s connected OK, and the ping result is %d.\n", azName1, isAZConnectOK);
}
if (*preConnStatusAZ != isAZConnectOK) {
*preConnStatusAZ = isAZConnectOK;
return false;
}
return true;
}
static int SetMultiAzConnectStatus(const char *leaf1Az, int value)
{
char keyOfMulAzConnectStatus[MAX_PATH_LEN] = {0};
DdbKeyOfAzConnectStatus(leaf1Az, keyOfMulAzConnectStatus, MAX_PATH_LEN, NULL);
bool isSetKeyValueOK = SetOrGetDdbKeyValueOfAzConnectStatus(SET_DDB_AZ, keyOfMulAzConnectStatus, value, NULL);
if (!isSetKeyValueOK) {
write_runlog(ERROR, "Set the ddb key %s failed.\n", keyOfMulAzConnectStatus);
return -1;
}
return 0;
}
void StopCurrentAz()
{
StartOrStopAZ(STOP_AZ, g_currentNode->azName);
if (CreateStopNodeInstancesFlagFile(SINGLEAZ_TYPE) == -1) {
write_runlog(ERROR, "Create stop cms node FlagFile failed.\n");
}
write_runlog(LOG, "The current az(%s) is isolated and it is stopped.\n", g_currentNode->azName);
return;
}
static void CleanMultiConnState(const char *azName1, const char *azName2)
{
char azConnectStatusKey[MAX_PATH_LEN] = {0};
char keyOfMulAzConnectStatus[MAX_PATH_LEN] = {0};
errno_t rc = 0;
if (azName2 == NULL) {
rc = memcpy_s(keyOfMulAzConnectStatus, MAX_PATH_LEN, azName1, CM_AZ_NAME);
securec_check_errno(rc, (void)rc);
} else {
DdbKeyOfAzConnectStatus(azName1, keyOfMulAzConnectStatus, MAX_PATH_LEN, azName2);
}
DdbConn *dbCon = &g_dbConn;
if (dbCon->modId == MOD_ALL) {
dbCon = GetNextDdbConn();
}
rc = snprintf_s(azConnectStatusKey, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/%s/%s", pw->pw_name, keyOfMulAzConnectStatus);
securec_check_intval(rc, (void)rc);
status_t st = CM_SUCCESS;
int32 tryTimes = TRY_TIMES;
do {
st = DelKeyWithConn(dbCon, azConnectStatusKey, MAX_PATH_LEN);
if (st != CM_SUCCESS) {
--tryTimes;
cm_sleep(1);
}
} while (st != CM_SUCCESS && tryTimes > 0);
if (st != CM_SUCCESS) {
write_runlog(ERROR, "ddb delete (SetOnlineStatusToDdb) failed. key=%s.\n", azConnectStatusKey);
} else {
write_runlog(LOG, "ddb delete (SetOnlineStatusToDdb) successfully. key=%s.\n", azConnectStatusKey);
}
return;
}
status_t GetDdbSessionInAz(DdbConn *dbConn, int32 timeOut, const char *azNames)
{
DdbInitConfig config;
errno_t rc = memset_s(&config, sizeof(DdbInitConfig), 0, sizeof(DdbInitConfig));
securec_check_errno(rc, (void)rc);
config.type = g_dbType;
status_t st = InitDdbCfgApi(config.type, &(config.drvApiInfo), timeOut, azNames);
CM_RETURN_IFERR(st);
st = InitDdbConn(dbConn, &config);
ClearDdbCfgApi(&config.drvApiInfo, g_dbType);
return st;
}
void CreateDdbConnSession(bool lastLeft1Conn, bool lastLeft2Conn, bool lastCurAzConn)
{
if (!IsNeedSyncDdb()) {
return;
}
if (g_dbConn.modId != MOD_ALL) {
RestDdbConn(&g_dbConn, CM_ERROR, NULL);
return;
}
char *azNames = NULL;
bool lastRes = lastLeft1Conn && lastLeft2Conn && lastCurAzConn;
if (!lastRes) {
azNames = g_currentNode->azName;
}
const int32 timeOut = 6000;
status_t res = GetDdbSessionInAz(&g_dbConn, timeOut, azNames);
if (res != CM_SUCCESS) {
errno_t rc = memset_s(&g_dbConn, sizeof(DdbConn), 0, sizeof(DdbConn));
securec_check_errno(rc, (void)rc);
}
}
static bool IsNeedAzConnectStateCheck()
{
if (!g_multi_az_cluster) {
write_runlog(LOG, "The current cluster is not multi-az cluster.\n");
return false;
}
if (GetAzDeploymentType(false) != THREE_AZ_DEPLOYMENT) {
write_runlog(LOG, "The current deployment is not a CBG 3AZ scenario.\n");
return false;
}
if (g_azNum == 1) {
write_runlog(LOG, "We cannot stop single AZ.\n");
return false;
}
return true;
}
static void InitAZName(char *leaf1Az, uint32 len1, char *leaf2Az, uint32 len2)
{
int rc;
for (uint32 ii = 0; ii < g_azNum; ii++) {
if (strcmp(g_azArray[ii].azName, g_currentNode->azName) != 0 && leaf1Az[0] == '\0') {
rc = memcpy_s(leaf1Az, len1, g_azArray[ii].azName, len1);
write_runlog(DEBUG1, "The leaf1 AZ name is %s.\n", leaf1Az);
securec_check_errno(rc, (void)rc);
} else if (strcmp(g_azArray[ii].azName, g_currentNode->azName) != 0 && leaf2Az[0] == '\0') {
rc = memcpy_s(leaf2Az, len2, g_azArray[ii].azName, len2);
write_runlog(DEBUG1, "The leaf2 AZ name is %s.\n", leaf2Az);
securec_check_errno(rc, (void)rc);
}
}
}
* @brief MultiAzConnectStateCheckMain: The Thread main function of multiple AZ network connection status detection
*
* @param arg:thread parameters
*
* @return void
*/
void *MultiAzConnectStateCheckMain(void *arg)
{
if (!IsNeedAzConnectStateCheck()) {
return NULL;
}
uint32 cnt = g_loopState.count;
g_loopState.count++;
g_loopState.execStatus[cnt] = 1;
write_runlog(LOG, "[reload] MultiAzConnectStateCheckMain thread loop-index:%u.\n", cnt);
bool isLeaf1AZConnectOK = true;
bool isLeaf2AZConnectOK = true;
bool currConnStatus = true;
bool checkLeft1AZConnectOK = true;
bool checkLeft2AZConnectOK = true;
bool checkCurrAZConnectOK = true;
bool lastLeft1Conn = false;
bool lastLeft2Conn = false;
bool lastCurAzConn = false;
uint32 checkConnTimes = 0;
uint32 checkConnMax = 5;
char keyOfMulAzConnectStatus[MAX_PATH_LEN];
struct timeval beginPing = {0, 0};
struct timeval endPing = {0, 0};
long totalTime = 21;
long intervalTime = 0;
int rc;
int isSetKeyValueOK;
uint32 failedWriteTimes = 0;
uint32 maxRetryTime = 15;
thread_name = "MultiAzCheck";
char Leaf1Az[CM_AZ_NAME] = {0};
char Leaf2Az[CM_AZ_NAME] = {0};
InitAZName(Leaf1Az, CM_AZ_NAME, Leaf2Az, CM_AZ_NAME);
CleanMultiConnState(g_currentNode->azName, NULL);
for (;;) {
if (g_inReload) {
cm_sleep(AZ_START_STOP_INTERVEL);
continue;
}
checkConnTimes++;
g_loopState.execStatus[cnt] = 0;
rc = memset_s(keyOfMulAzConnectStatus, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
(void)gettimeofday(&beginPing, NULL);
checkLeft1AZConnectOK = AzPingCheck(&isLeaf1AZConnectOK, Leaf1Az);
checkLeft2AZConnectOK = AzPingCheck(&isLeaf2AZConnectOK, Leaf2Az);
checkCurrAZConnectOK = AzPingCheck(&currConnStatus, g_currentNode->azName);
if ((!isLeaf1AZConnectOK) || (!isLeaf2AZConnectOK) || (!currConnStatus)) {
write_runlog(LOG, "The AZ Conn Status %s:%d, %s:%d, %s:%d Changed this time %u, try next time.\n", Leaf1Az,
isLeaf1AZConnectOK, Leaf2Az, isLeaf2AZConnectOK, g_currentNode->azName, currConnStatus, checkConnTimes);
}
bool needContinue = false;
if ((!checkLeft1AZConnectOK) || (!checkLeft2AZConnectOK) || (!checkCurrAZConnectOK)) {
checkConnTimes = 0;
cm_sleep(AZ_START_STOP_INTERVEL);
g_loopState.execStatus[cnt] = 1;
needContinue = true;
} else {
if (checkConnTimes < checkConnMax) {
cm_sleep(AZ_START_STOP_INTERVEL);
g_loopState.execStatus[cnt] = 1;
needContinue = true;
} else {
checkConnTimes = 0;
}
}
if (isLeaf1AZConnectOK != lastLeft1Conn || isLeaf2AZConnectOK != lastLeft2Conn ||
currConnStatus != lastCurAzConn || g_dbConn.modId == MOD_ALL) {
write_runlog(LOG,
"left1(%s %d: %d), left2(%s %d: %d), cur(%s %d: %d), will open "
"new ddb Connect.\n",
Leaf1Az, lastLeft1Conn, isLeaf1AZConnectOK, Leaf2Az, lastLeft2Conn, isLeaf2AZConnectOK,
g_currentNode->azName, lastCurAzConn, currConnStatus);
lastLeft1Conn = isLeaf1AZConnectOK;
lastLeft2Conn = isLeaf2AZConnectOK;
lastCurAzConn = currConnStatus;
CreateDdbConnSession(lastLeft1Conn, lastLeft2Conn, lastCurAzConn);
}
if (!isLeaf1AZConnectOK && !isLeaf2AZConnectOK) {
* If both the isLeaf1AZConnectOK and isLeaf1AZConnectOK are false, the current az is network isolatedd.
* So, we cannot set this az connection status to ddb.
*/
} else if (!isLeaf1AZConnectOK && isLeaf2AZConnectOK) {
isSetKeyValueOK = SetMultiAzConnectStatus(Leaf1Az, MULTIAZ_STOPPING_STATUS);
if (isSetKeyValueOK == 0) {
failedWriteTimes = 0;
} else {
failedWriteTimes++;
write_runlog(ERROR, "Set the IsolatedAz %s failed.\n", Leaf1Az);
g_loopState.execStatus[cnt] = 1;
continue;
}
} else if (isLeaf1AZConnectOK && !isLeaf2AZConnectOK) {
isSetKeyValueOK = SetMultiAzConnectStatus(Leaf2Az, MULTIAZ_STOPPING_STATUS);
if (isSetKeyValueOK == 0) {
failedWriteTimes = 0;
} else {
failedWriteTimes++;
write_runlog(ERROR, "Set the IsolatedAz %s failed.\n", Leaf2Az);
g_loopState.execStatus[cnt] = 1;
continue;
}
} else if (isLeaf1AZConnectOK && isLeaf2AZConnectOK) {
failedWriteTimes = 0;
isSetKeyValueOK = SetMultiAzConnectStatus(Leaf1Az, MULTIAZ_RUNNING_STATUS);
if (isSetKeyValueOK != 0) {
write_runlog(ERROR, "Set the IsolatedAz %s failed.\n", Leaf1Az);
g_loopState.execStatus[cnt] = 1;
continue;
}
isSetKeyValueOK = SetMultiAzConnectStatus(Leaf2Az, MULTIAZ_RUNNING_STATUS);
if (isSetKeyValueOK != 0) {
write_runlog(ERROR, "Set the IsolatedAz %s failed.\n", Leaf2Az);
g_loopState.execStatus[cnt] = 1;
continue;
}
}
bool isSetIsolatedAzOk = true;
if (!isLeaf1AZConnectOK && isLeaf2AZConnectOK) {
isSetIsolatedAzOk = SetIsolatedAzToDdb(Leaf1Az, Leaf2Az);
if (!isSetIsolatedAzOk) {
write_runlog(LOG, "Try to merge the IsolatedAz %s failed.\n", Leaf1Az);
g_loopState.execStatus[cnt] = 1;
continue;
}
} else if (isLeaf1AZConnectOK && !isLeaf2AZConnectOK) {
isSetIsolatedAzOk = SetIsolatedAzToDdb(Leaf2Az, Leaf1Az);
if (!isSetIsolatedAzOk) {
write_runlog(LOG, "Try to merge the IsolatedAz %s failed.\n", Leaf2Az);
g_loopState.execStatus[cnt] = 1;
continue;
}
}
if (needContinue) {
continue;
}
if (!isLeaf1AZConnectOK || !isLeaf2AZConnectOK) {
write_runlog(LOG, "failedWriteTimes = %u, local_role = %d \n", failedWriteTimes, g_HA_status->local_role);
if (failedWriteTimes >= maxRetryTime && (g_HA_status->local_role != CM_SERVER_PRIMARY)) {
if (!CheckStopFileExist(SINGLEAZ_TYPE) && !CheckStopFileExist(SINGLENODE_TYPE)) {
write_runlog(ERROR, "ddb write failed reach max times, restart current az.\n");
StopCurrentAz();
}
cm_sleep(AZ_START_STOP_INTERVEL);
g_loopState.execStatus[cnt] = 1;
continue;
}
} else {
failedWriteTimes = 0;
}
if (isLeaf1AZConnectOK && isLeaf2AZConnectOK && currConnStatus) {
if (!CheckStopFileExist(SINGLEAZ_TYPE) && !CheckStopFileExist(SINGLENODE_TYPE)) {
cm_sleep(AZ_START_STOP_INTERVEL);
g_loopState.execStatus[cnt] = 1;
continue;
}
}
DoMultiAzStopDecision(isLeaf1AZConnectOK, Leaf1Az, isLeaf2AZConnectOK, Leaf2Az, currConnStatus);
cm_sleep(AZ_START_STOP_INTERVEL);
DoMultiAzStartDecision(isLeaf1AZConnectOK, Leaf1Az, isLeaf2AZConnectOK, Leaf2Az);
StartCmsNodeInstances(isLeaf1AZConnectOK, isLeaf2AZConnectOK);
if (isLeaf1AZConnectOK && isLeaf2AZConnectOK && currConnStatus) {
cm_sleep(AZ_START_STOP_INTERVEL);
CleanMultiConnState(Leaf1Az, g_currentNode->azName);
CleanMultiConnState(Leaf2Az, g_currentNode->azName);
CleanMultiConnState(g_currentNode->azName, NULL);
}
(void)gettimeofday(&endPing, NULL);
intervalTime = endPing.tv_sec - beginPing.tv_sec;
if (intervalTime < totalTime) {
write_runlog(DEBUG1, "The ping opretation takes time %ld seconds.\n", intervalTime);
cm_sleep((unsigned int)(totalTime - intervalTime));
} else {
write_runlog(DEBUG1, "The ping opretation takes time %ld seconds.\n", intervalTime);
cm_sleep(5);
}
g_loopState.execStatus[cnt] = 1;
}
return NULL;
}
* @brief Start Cms node when this node network connection restored
*
* @param isLeaf1AZConnectOK: Leaf1AZ connection status with current AZ
* @param isLeaf2AZConnectOK: Leaf2AZ connection status with current AZ
*
* @return void
*/
static void StartCmsNodeInstances(bool isLeaf1AZConnectOK, bool isLeaf2AZConnectOK)
{
int rc;
char exec_path[MAX_PATH_LEN] = {0};
char stopFlagFile[MAX_PATH_LEN] = {0};
if (GetHomePath(exec_path, sizeof(exec_path)) != 0) {
return;
}
rc = snprintf_s(stopFlagFile, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/%s", exec_path, "node_instances_stop");
securec_check_intval(rc, (void)rc);
struct stat stat_buf = {0};
if (isLeaf1AZConnectOK && isLeaf2AZConnectOK) {
if (stat(stopFlagFile, &stat_buf) == 0) {
write_runlog(LOG, "We only need start current node(%u).\n", g_currentNode->node);
StartOrStopNodeInstanceByCommand(START_AZ, g_currentNode->node);
if (unlink(stopFlagFile) != 0) {
write_runlog(ERROR, "delete cms-node stop instances flag file: %s failed.\n", stopFlagFile);
} else {
write_runlog(LOG, "delete cms-node stop instances flag file: %s successfully.\n", stopFlagFile);
}
}
}
return;
}
void StartOrStopAZ(OperateType operateType, const char *azName)
{
for (uint32 ii = 0; ii < g_azNum; ii++) {
if (strcmp(azName, g_azArray[ii].azName) != 0) {
continue;
}
uint32 jj = 0;
while (g_azArray[ii].nodes[jj] != 0) {
StartOrStopNodeInstanceByCommand(operateType, g_azArray[ii].nodes[jj]);
jj++;
if (jj >= CM_NODE_MAXNUM) {
break;
}
}
break;
}
return;
}
void StartOrStopNodeInstanceByCommand(OperateType operateType, uint32 nodeId)
{
uint32 nodeIndex;
int ret = find_node_index_by_nodeid(nodeId, &nodeIndex);
if (ret != 0) {
write_runlog(ERROR, "[%s] get node index failed!\n", __FUNCTION__);
return;
}
if (g_node[nodeIndex].coordinate == 1) {
if (operateType == START_AZ && IsCnDeleted(nodeId)) {
const int32 timeOut = 30;
StartOrStopInstanceByCommand(operateType, nodeId, g_node[nodeIndex].DataPath, timeOut);
} else {
StartOrStopInstanceByCommand(operateType, nodeId, g_node[nodeIndex].DataPath);
}
}
if (g_node[nodeIndex].gtm == 1) {
StartOrStopInstanceByCommand(operateType, nodeId, g_node[nodeIndex].gtmLocalDataPath);
}
for (uint32 ii = 0; ii < g_node[nodeIndex].datanodeCount; ii++) {
StartOrStopInstanceByCommand(
operateType, nodeId, g_node[nodeIndex].datanode[ii].datanodeLocalDataPath);
}
return;
}
static void StartOrStopInstanceByCommand(OperateType operateType, uint32 node, const char *instanceDataPath,
int32 timeOut)
{
int ret;
errno_t rc;
char cmd[MAXPGPATH] = {0};
uint32 tryTimes = 2;
if (operateType == START_AZ) {
if (timeOut == 0) {
rc = snprintf_s(cmd, MAXPGPATH, MAXPGPATH - 1, "cm_ctl start -n %u -D %s > /dev/null 2>&1 &", node,
instanceDataPath);
} else {
rc = snprintf_s(cmd, MAXPGPATH, MAXPGPATH - 1, "cm_ctl start -n %u -D %s -t %d > /dev/null 2>&1 &", node,
instanceDataPath, timeOut);
}
} else if (operateType == STOP_AZ) {
rc = snprintf_s(cmd, MAXPGPATH, MAXPGPATH - 1, "cm_ctl stop -n %u -D %s -m i > /dev/null 2>&1 &", node,
instanceDataPath);
} else {
write_runlog(ERROR, "Invalid start-stop command, please recheck it.\n");
return;
}
securec_check_intval(rc, (void)rc);
while (tryTimes > 0) {
ret = system(cmd);
write_runlog(DEBUG1, "Call system command(%s) to execute node start and stop.\n", cmd);
if (ret != 0) {
write_runlog(ERROR, "StartOrStopInstanceByCommand failed:%s, errnum:%d, errno=%d.\n", cmd, ret, errno);
tryTimes--;
cm_sleep(1);
} else {
write_runlog(LOG, "StartOrStopInstanceByCommand successfully: %s.\n", cmd);
break;
}
}
return;
}
* @brief IsCnDeleted: Judge whether the CN of the node has been deleted
*
* @param nodeId: node
*
* @return bool
*/
static bool IsCnDeleted(uint32 nodeId)
{
cm_instance_role_status *member = NULL;
for (uint32 i = 0; i < g_dynamic_header->relationCount; i++) {
if (g_instance_role_group_ptr[i].instanceMember[0].node == nodeId &&
g_instance_role_group_ptr[i].instanceMember[0].instanceType == INSTANCE_TYPE_COORDINATE) {
member = &g_instance_role_group_ptr[i].instanceMember[0];
break;
}
}
if (member == NULL) {
write_runlog(ERROR, "can't find cn in node %u.\n", nodeId);
return false;
}
if (member->role == INSTANCE_ROLE_DELETED || member->role == INSTANCE_ROLE_DELETING) {
write_runlog(LOG, "The CN of node %u have been deleted.\n", nodeId);
return true;
}
write_runlog(LOG, "The CN of node %u have not been deleted.\n", nodeId);
return false;
}