/*
 * Copyright (c) 2021 Huawei Technologies Co.,Ltd.
 *
 * CM is licensed under Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 * http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 * -------------------------------------------------------------------------
 *
 * monitor_main.cpp
 * om_monitor checks cm_agent and etcd.
 *
 * IDENTIFICATION
 * src/cm_monitor/monitor_main.cpp
 *
 * -------------------------------------------------------------------------
 */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>
#include <dirent.h>
#include <sys/procfs.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/file.h>
#include "cm/pqsignal.h"
#include "cm/stringinfo.h"
#include "cm/cm_elog.h"
#include "cm/cm_cgroup.h"
#include "cm/cm_misc.h"
#include "common/config/cm_config.h"
#include "getopt_long.h"
#include "alarm/alarm.h"
#include <sys/mman.h>
#include <mntent.h>
#include "utils/syscall_lock.h"
#include "alarm/alarm_log.h"
#include "config.h"
#include "cm/cm_c.h"
/* PID returned by the most recent start_cm_agent() call made from server_loop(). */
pid_t g_cmAgentPid = 0;
/* File name appended to "<home>/bin/" to build g_logicClusterListPath in get_prog_path(). */
#define LOGIC_CLUSTER_LIST "logic_cluster_name.txt"
/* Size of the buffer that holds a textual etcd port when the etcd command line is assembled. */
#define MAX_PORT_LEN (8)
#define TRY_COUNT_FOR_KILL_ETCD_REPLACE (5)
/* Buffer size for the "%Y-%m-%d_%H%M%S" stamp produced by get_current_timestamp(): 17 characters plus NUL. */
#define LEN_TIMESTAMP (18)
/* File name of the cm_agent pid/lock file that server_loop() places under g_agentConfigPath. */
#define CM_AGENT_PID_FILE "cm_agent.pid"
#define CM_AGENT_CONFIG "cm_agent.conf"
/* Argument passed to cm_sleep() between two iterations of server_loop(). */
#define MONITOR_CHECK_INTERVAL (1)
/* Number of consecutive stopped/traced ('T') observations after which check_process_status() kills the process. */
#define HANG_T_DETECT_MAX_TIMES (3)
/*
 * Kinds of process that check_process_status() can look for; GetProcessName() maps each
 * value to the process name searched for in /proc. PROCKIND_ITRAN (the "ltran" process)
 * only exists when ENABLE_MULTIPLE_NODES is not defined. PROCKIND_MAX is the number of
 * kinds and sizes the per-kind counter array in check_process_status().
 */
#ifdef ENABLE_MULTIPLE_NODES
typedef enum { PROCKIND_ETCD, PROCKIND_CMAGENT, PROCKIND_MONITOR, PROCKIND_MAX } ProcessKind;
#else
typedef enum { PROCKIND_ETCD, PROCKIND_CMAGENT, PROCKIND_MONITOR, PROCKIND_ITRAN, PROCKIND_MAX } ProcessKind;
#endif
/*
 * Paths derived from the home directory; all of them are (re)built by get_prog_path().
 * The "*ManualStart*" entries are flag files whose presence is tested with stat()/access().
 */
static char g_cmAgentBinPath[MAX_PATH_LEN];
static char g_etcdBinPath[MAX_PATH_LEN];
char g_cmManualStartPath[MAX_PATH_LEN];
static char g_etcdManualStartPath[MAX_PATH_LEN];
#ifndef ENABLE_MULTIPLE_NODES
char g_ltranManualStartPath[MAX_PATH_LEN];
char g_libnetManualStartFile[MAX_PATH_LEN];
#endif
static char g_etcdReplacedPath[MAX_PATH_LEN];
static char g_cmUpgradeManualStartPath[MAX_PATH_LEN];
static char g_cmRollbackManualStartPath[MAX_PATH_LEN];
static char g_cmStaticConfigChangeFlagFilePath[MAX_PATH_LEN];
char g_cmStaticConfigurePath[MAX_PATH_LEN];
char g_logicClusterListPath[MAX_PATH_LEN];
static char g_noCgroupFlag[MAX_PATH_LEN];
/* Directory in which server_loop() keeps the cm_agent pid file; it is filled in outside this part of the file. */
static char g_agentConfigPath[MAXPGPATH];
static char g_alarmConfigPath[MAX_PATH_LEN];
/*
 * Startup alarms allocated by StartupAlarmItemInitialize(): the last element is the
 * cm_agent alarm, any element before it is the etcd alarm.
 */
Alarm *g_startupAlarmList = NULL;
int g_startupAlarmListSize = 0;
/* Set by server_loop() while g_cmManualStartPath exists; start_cm_agent() uses it to choose "normal" vs "abnormal". */
bool g_isStart = false;
static bool g_isAttachToCgroup = false;
/* check_process_status() compares these with every /proc entry; they are assigned outside this part of the file. */
static int g_myProcPid = 0;
int g_tcpKeepalivesIdle = 0;
int g_tcpKeepalivesInterval = 0;
int g_tcpKeepalivesCount = 0;
static uid_t g_myUid = 0;
/* Last non-zero wait status of cm_agent and the number of non-zero exits in a row, maintained by server_loop(). */
static int g_previousStatus = 0;
static int g_agentFaultCount = 0;
/* Lock file path built by get_prog_path(); server_loop() returns once this file can no longer be stat()ed. */
static char g_monitorLockfile[MAX_PATH_LEN] = {0};
FILE *g_lockfile = NULL;
const char *g_progname;
/* Logging state defined in other translation units. */
extern char g_curLogFileName[MAXPGPATH];
extern char sys_log_path[MAX_PATH_LEN];
extern FILE *syslogFile;
extern volatile int maxLogFileSize;
extern bool g_logFileSet;
extern char system_alarm_log[MAXPGPATH];
char *g_logFile;
/* etcd TLS certificate/key paths, filled in by get_prog_path(). */
EtcdTlsAuthPath g_tlsPath = {0};
/* Current etcd log file and the directory that contains it, both set by CreateEtcdLogPath(). */
static char g_curEtcdLogFile[MAXPGPATH] = {0};
static char g_etcdLogPath[MAX_PATH_LEN] = {0};
/* Counter examined by CheckStartEtcdCount() to decide on alarms and path reloads. */
static int g_startEtcdCount = 0;
static int g_replaceEtcdCount = 0;
/* Checks the local etcd process from server_loop(); userName is used when the etcd command line is built. */
static void check_ETCD_process_status(AlarmAdditionalParam *additionalParam, const char *userName);
/*
 * Scans /proc for a process of the given kind. isKillProcess is optional (defaults to NULL)
 * and is set to true when a process stuck in the stopped state was killed.
 */
int check_process_status(ProcessKind type, pid_t parentPid, bool *isKillProcess = NULL);
/* Monitor lock helpers; their definitions are not in this part of the file. */
static int MonitorLock(bool isKillProcess);
static int MonitorUnlock(void);
/**
 * @brief
 * Check whether the CM Agent meets the startup conditions.
 *
 * @return
 * Return whether the CM Agent meets the startup conditions.
 */
static bool check_start_request();
/*
 * Copy the value of an environment variable into a caller-supplied buffer.
 *
 * env_var:          name of the variable; NULL is rejected.
 * output_env_value: destination buffer.
 * env_value_len:    size of the destination buffer in bytes.
 *
 * Returns EOK on success. Returns -1 (after printing a diagnostic to stderr) when
 * env_var is NULL, when the variable is unset or empty, or when the copy fails.
 * The lookup, validation and copy all run while g_cmEnvLock is held.
 */
int cmmonitor_getenv(const char *env_var, char *output_env_value, uint32 env_value_len)
{
    if (env_var == NULL) {
        (void)fprintf(stderr, "cmmonitor_getenv: invalid env_var !\n");
        return -1;
    }

    int result = -1;
    (void)syscalllockAcquire(&g_cmEnvLock);

    char *value = getenv(env_var);
    if (value == NULL || value[0] == '\0') {
        (void)fprintf(stderr,
            "cmmonitor_getenv: failed to get environment variable:%s. Please check and make sure it is configured!\n",
            env_var);
    } else {
        CheckEnvValue(value);
        if (strcpy_s(output_env_value, env_value_len, value) != EOK) {
            (void)fprintf(stderr,
                "cmmonitor_getenv: failed to get environment variable:%s, variable length:%lu.\n",
                env_var,
                strlen(value));
        } else {
            result = EOK;
        }
    }

    /* Single release point for every path that took the lock. */
    (void)syscalllockRelease(&g_cmEnvLock);
    return result;
}
/*
 * Allocate and initialise the startup alarm list for the current node.
 *
 * The list always holds the cm_agent alarm in its last slot; when the node runs etcd,
 * one extra slot in front of it holds the etcd alarm. The process exits with status 1
 * if the allocation fails.
 */
void StartupAlarmItemInitialize(const staticNodeConfig *currentNode)
{
    g_startupAlarmListSize = currentNode->etcd ? 2 : 1;

    g_startupAlarmList = (Alarm *)malloc(sizeof(Alarm) * (size_t)g_startupAlarmListSize);
    if (g_startupAlarmList == NULL) {
        AlarmLog(ALM_LOG, "Out of memory: StartupAlarmItemInitialize failed.\n");
        exit(1);
    }

    /* The cm_agent alarm goes last; every slot before it is an etcd alarm. */
    const int cmaSlot = g_startupAlarmListSize - 1;
    AlarmItemInitialize(&(g_startupAlarmList[cmaSlot]), ALM_AI_AbnormalCMAProcess, ALM_AS_Init, NULL);
    for (int etcdSlot = cmaSlot - 1; etcdSlot >= 0; --etcdSlot) {
        AlarmItemInitialize(&(g_startupAlarmList[etcdSlot]), ALM_AI_AbnormalETCDProcess, ALM_AS_Init, NULL);
    }
}
/*
 * Select the log destination from the first command-line argument.
 *
 * argv[1], when present, is parsed as a base-10 number. LOG_DESTION_SYSLOG and
 * LOG_DESTION_DEV_NULL are honoured; every other value (including LOG_DESTION_STDERR)
 * results in LOG_DESTION_FILE. Without an argument the current setting is left untouched.
 */
void GetCmdlineOpt(int argc, char *const argv[])
{
    const int base = 10;

    Assert(log_destion_choice == LOG_DESTION_FILE);
    if (argc <= 1) {
        return;
    }

    const long requested = strtol(argv[1], NULL, base);
    if (requested == LOG_DESTION_SYSLOG) {
        log_destion_choice = LOG_DESTION_SYSLOG;
    } else if (requested == LOG_DESTION_DEV_NULL) {
        log_destion_choice = LOG_DESTION_DEV_NULL;
    } else {
        /* stderr, file and unrecognised values all end up writing to the log file. */
        log_destion_choice = LOG_DESTION_FILE;
    }
}
/*
 * Translate a ProcessKind into the process name that is searched for in /proc.
 * Values without a mapping yield the placeholder "UNKOWN".
 */
static const char *GetProcessName(ProcessKind type)
{
    const char *name = "UNKOWN";

    switch (type) {
        case PROCKIND_ETCD:
            name = "etcd";
            break;
        case PROCKIND_CMAGENT:
            name = "cm_agent";
            break;
        case PROCKIND_MONITOR:
            name = "om_monitor";
            break;
#ifndef ENABLE_MULTIPLE_NODES
        case PROCKIND_ITRAN:
            name = "ltran";
            break;
#endif
        default:
            break;
    }
    return name;
}
static bool IsNeedKillProcess(const char *processName, int pid, int ppid)
{
if (strcmp(processName, "cm_agent") != 0) {
return true;
}
eg: check_cmd in etcd_disk_quota_check(), commandstr in IsMyPostmasterPid(), called by cmagent by popen() */
if (ppid == g_cmAgentPid) {
return false;
}
char cmdPath[MAX_PATH_LEN] = {0};
char getBuff[MAX_PATH_LEN] = {0};
const char *parameter[] = {"-h", "--help", "-?", "-V", "--version", NULL};
int rcs = snprintf_s(cmdPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/proc/%d/cmdline", pid);
securec_check_intval(rcs, (void)rcs);
FILE *fp = fopen(cmdPath, "r");
if (fp == NULL) {
return true;
}
bool isNeedKillProcess = true;
if (fgets(getBuff, MAX_PATH_LEN - 1, fp) != NULL) {
char *temp = getBuff + strlen(getBuff) + 1;
for (int i = 0; parameter[i] != NULL; i++) {
if (strcmp(temp, parameter[i]) == 0) {
isNeedKillProcess = false;
break;
}
}
}
(void)fclose(fp);
return isNeedKillProcess;
}
/* Report through the optional out-parameter that a process was killed; a NULL pointer is ignored. */
static void SetKillProcessValue(bool *isKillProcess)
{
    if (isKillProcess == NULL) {
        return;
    }
    *isKillProcess = true;
}
/*
 * Scan /proc for a process of the given kind and report whether one is running.
 *
 * type:          which process to look for (see GetProcessName()).
 * parentPid:     when non-zero, a matching process whose parent differs from this pid is
 *                killed, unless IsNeedKillProcess() spares it.
 * isKillProcess: optional; set to true when a process was killed because it stayed in the
 *                stopped/traced state for HANG_T_DETECT_MAX_TIMES observations.
 *
 * Returns -1 when /proc cannot be opened, PROCESS_UNKNOWN when /proc contained no
 * per-process entry at all, PROCESS_RUNNING when a matching process was accepted, and
 * PROCESS_NOT_EXIST otherwise.
 */
int check_process_status(ProcessKind type, pid_t parentPid, bool *isKillProcess)
{
    struct dirent *de;
    char pid_path[MAX_PATH_LEN];
    FILE *fp = NULL;
    char getBuff[MAX_PATH_LEN];
    char paraName[MAX_PATH_LEN];
    char paraValue[MAX_PATH_LEN];
    int tgid = 0;
    int spid = 0;
    int pid = 0;
    int ppid = 0;
    char state = '0';
    uid_t uid = 0;
    uid_t uid1 = 0;
    uid_t uid2 = 0;
    uid_t uid3 = 0;
    bool nameFound = false;
    bool nameGet = false;
    bool tgidGet = false;
    bool spidGet = false;
    bool ppidGet = false;
    bool stateGet = false;
    bool haveFound = false;
    bool uidGet = false;
    errno_t rc;
    int rcs;
    const char *processName = GetProcessName(type);
    bool isProcessFile = false;
    DIR *dir = opendir("/proc");
    if (dir == NULL) {
        write_runlog(ERROR, "opendir(/proc) failed! \n ");
        return -1;
    }
    while ((de = readdir(dir)) != NULL) {
        /*
         * judging whether the directory name is composed by digitals, if so, we will
         * check whether there are files under the directory , these files includes
         * all detailed information about the process.
         */
        if (CM_is_str_all_digit(de->d_name) != 0) {
            continue;
        }
        isProcessFile = true;
    /* Re-entry point used to re-read the same pid after a stopped om_monitor was observed. */
    MONITOR_RETRIES:
        rc = memset_s(pid_path, MAX_PATH_LEN, 0, MAX_PATH_LEN);
        securec_check_errno(rc, (void)rc);
        pid = (int)strtol(de->d_name, NULL, 10);
        {
            rcs = snprintf_s(pid_path, MAX_PATH_LEN, MAX_PATH_LEN - 1, "/proc/%d/status", pid);
            securec_check_intval(rcs, (void)rcs);
        }
        fp = fopen(pid_path, "r");
        if (fp == NULL) {
            /* The status file could not be opened; move on to the next entry. */
            continue;
        }
        /* Reset all per-process parse state before reading this status file. */
        nameGet = false;
        tgidGet = false;
        spidGet = false;
        ppidGet = false;
        stateGet = false;
        uidGet = false;
        rc = memset_s(paraValue, MAX_PATH_LEN, 0, MAX_PATH_LEN);
        securec_check_errno(rc, (void)rc);
        tgid = 0;
        spid = 0;
        ppid = 0;
        state = '0';
        uid = 0;
        rc = memset_s(getBuff, MAX_PATH_LEN, 0, MAX_PATH_LEN);
        securec_check_errno(rc, (void)rc);
        nameFound = false;
        /* Each field is taken from the first line that contains its key. */
        while (fgets(getBuff, MAX_PATH_LEN - 1, fp) != NULL) {
            rc = memset_s(paraName, MAX_PATH_LEN, 0, MAX_PATH_LEN);
            securec_check_errno(rc, (void)rc);
            if (!nameGet && (strstr(getBuff, "Name:") != NULL)) {
                nameGet = true;
                rcs = sscanf_s(getBuff, "%s %s", paraName, MAX_PATH_LEN, paraValue, MAX_PATH_LEN);
                check_sscanf_s_result(rcs, 2);
                securec_check_intval(rcs, (void)rcs);
                if (strcmp(processName, paraValue) == 0) {
                    nameFound = true;
                } else {
                    /* Different process name: the rest of the file is irrelevant. */
                    break;
                }
            }
            if (!tgidGet && (strstr(getBuff, "Tgid:") != NULL)) {
                tgidGet = true;
                rcs = sscanf_s(getBuff, "%s %d", paraName, MAX_PATH_LEN, &tgid);
                check_sscanf_s_result(rcs, 2);
                securec_check_intval(rcs, (void)rcs);
            }
            /* "Pid:" is also a substring of "PPid:"; the spidGet latch makes only the first hit count. */
            if (!spidGet && (strstr(getBuff, "Pid:") != NULL)) {
                spidGet = true;
                rcs = sscanf_s(getBuff, "%s %d", paraName, MAX_PATH_LEN, &spid);
                check_sscanf_s_result(rcs, 2);
                securec_check_intval(rcs, (void)rcs);
            }
            if (!ppidGet && (strstr(getBuff, "PPid:") != NULL)) {
                ppidGet = true;
                rcs = sscanf_s(getBuff, "%s %d", paraName, MAX_PATH_LEN, &ppid);
                check_sscanf_s_result(rcs, 2);
                securec_check_intval(rcs, (void)rcs);
            }
            if (!stateGet && (strstr(getBuff, "State:") != NULL)) {
                stateGet = true;
                rcs = sscanf_s(getBuff, "%s %c", paraName, MAX_PATH_LEN, &state, 1);
                check_sscanf_s_result(rcs, 2);
                securec_check_intval(rcs, (void)rcs);
            }
            if (!uidGet && (strstr(getBuff, "Uid:") != NULL)) {
                uidGet = true;
                /* Only the first of the four uid columns is used below. */
                rcs =
                    sscanf_s(getBuff, "%s %u %u %u %u", paraName, MAX_PATH_LEN, &uid, &uid1, &uid2, &uid3);
                check_sscanf_s_result(rcs, 5);
                securec_check_intval(rcs, (void)rcs);
            }
            if (nameGet && tgidGet && spidGet && ppidGet && stateGet && uidGet) {
                break;
            }
        }
        (void)fclose(fp);
        /*
         * Only handle an entry that meets all four conditions:
         * (1) its name matches the requested process;
         * (2) it is not om_monitor itself;
         * (3) its thread group id equals its pid;
         * (4) it belongs to the current user.
         */
        if (nameFound && g_myProcPid != spid && tgid == spid && g_myUid == uid) {
            if ((parentPid != 0) && (ppid != parentPid) && IsNeedKillProcess(processName, pid, ppid)) {
                write_runlog(LOG,
                    "kill process %s, tgid is %d, pid is %d, ppid:%d is not equal to parentPid:%d \n",
                    processName,
                    tgid,
                    spid,
                    ppid,
                    parentPid);
                if (kill(spid, SIGKILL) < 0) {
                    write_runlog(LOG, "failed to kill process (%s:%d)\n", processName, spid);
                }
                /* A process with the wrong parent never counts as "found". */
                continue;
            } else if (stateGet) {
                /* Consecutive 'T' observations per process kind; kept across calls. */
                static int persistTTimes[PROCKIND_MAX] = {0};
                if (state == 'T' || state == 't') {
                    persistTTimes[type]++;
                    if (persistTTimes[type] >= HANG_T_DETECT_MAX_TIMES) {
                        write_runlog(LOG,
                            "kill process (%s:%d)"
                            " due to STOPPED status!\n",
                            processName,
                            spid);
                        if (kill(spid, SIGKILL) < 0) {
                            write_runlog(LOG, "failed to kill process (%s:%d)\n", processName, spid);
                        } else {
                            SetKillProcessValue(isKillProcess);
                        }
                    } else if (type == PROCKIND_MONITOR) {
                        /* For om_monitor, wait a second and re-read the same pid right away. */
                        cm_sleep(1);
                        goto MONITOR_RETRIES;
                    } else {
                        write_runlog(LOG,
                            "Process (%s:%d)'s state is T (TASK_STOPPED"
                            " or TASK_TRACED), times=%d\n",
                            processName,
                            spid,
                            persistTTimes[type]);
                        haveFound = true;
                    }
                    continue;
                } else if (state == 'D' || state == 'd') {
                    write_runlog(LOG, "Process (%s:%d)'s state is D (TASK_UNINTERRUPTIBLE)\n", processName, spid);
                } else if (state == 'Z' || state == 'z') {
                    write_runlog(LOG, "Process (%s:%d)'s state is Z (TASK_DEAD)\n", processName, spid);
                }
                /* Any state other than 'T' resets the stopped-state counter. */
                persistTTimes[type] = 0;
            }
            haveFound = true;
        }
    }
    (void)closedir(dir);
    if (!isProcessFile) {
        write_runlog(LOG, "the process files may not exist in /proc.\n");
        return PROCESS_UNKNOWN;
    }
    return haveFound ? PROCESS_RUNNING : PROCESS_NOT_EXIST;
}
int get_prog_path()
{
char execPath[MAX_PATH_LEN] = {0};
errno_t rc = memset_s(g_cmManualStartPath, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
rc = memset_s(g_etcdManualStartPath, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
#ifndef ENABLE_MULTIPLE_NODES
rc = memset_s(g_ltranManualStartPath, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
rc = memset_s(g_libnetManualStartFile, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
#endif
rc = memset_s(g_etcdReplacedPath, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
rc = memset_s(g_cmAgentBinPath, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
rc = memset_s(g_etcdBinPath, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
rc = memset_s(g_cmStaticConfigChangeFlagFilePath, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
rc = memset_s(g_cmStaticConfigurePath, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
rc = memset_s(g_alarmConfigPath, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
rc = memset_s(g_tlsPath.etcd_ca_path, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
rc = memset_s(g_tlsPath.client_crt_path, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
rc = memset_s(g_tlsPath.client_key_path, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
rc = memset_s(g_cmUpgradeManualStartPath, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
rc = memset_s(g_cmRollbackManualStartPath, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
rc = memset_s(g_logicClusterListPath, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
rc = memset_s(g_monitorLockfile, MAX_PATH_LEN, 0, MAX_PATH_LEN);
securec_check_errno(rc, (void)rc);
int rcs = GetHomePath(execPath, sizeof(execPath));
if (rcs != EOK) {
(void)fprintf(stderr, "Get GAUSSHOME failed, please check.\n");
return -1;
} else {
canonicalize_path(execPath);
rcs = snprintf_s(g_cmManualStartPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/cluster_manual_start", execPath);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(g_etcdManualStartPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/etcd_manual_start", execPath);
securec_check_intval(rcs, (void)rcs);
#ifndef ENABLE_MULTIPLE_NODES
rcs = snprintf_s(g_ltranManualStartPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/ltran_manual_start", execPath);
securec_check_intval(rcs, (void)rcs);
rcs =
snprintf_s(g_libnetManualStartFile, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/libnet_manual_start", execPath);
securec_check_intval(rcs, (void)rcs);
#endif
rcs = snprintf_s(g_etcdReplacedPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/etcd_replaced", execPath);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(g_cmAgentBinPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/cm_agent", execPath);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(g_etcdBinPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/etcd", execPath);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(g_cmStaticConfigChangeFlagFilePath,
MAX_PATH_LEN,
MAX_PATH_LEN - 1,
"%s/bin/cluster_dilatation_status",
execPath);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(
g_cmStaticConfigurePath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/cluster_static_config", execPath);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(g_alarmConfigPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/alarmConfig.conf", execPath);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(
g_tlsPath.etcd_ca_path, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/share/sslcert/etcd/etcdca.crt", execPath);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(
g_tlsPath.client_crt_path, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/share/sslcert/etcd/client.crt", execPath);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(
g_tlsPath.client_key_path, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/share/sslcert/etcd/client.key", execPath);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(g_cmUpgradeManualStartPath,
MAX_PATH_LEN,
MAX_PATH_LEN - 1,
"%s/bin/cluster_upgrade_manual_start",
execPath);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(g_cmRollbackManualStartPath,
MAX_PATH_LEN,
MAX_PATH_LEN - 1,
"%s/bin/cluster_rollback_manual_start",
execPath);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(g_noCgroupFlag, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/no_cm_cgroup", execPath);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(
g_logicClusterListPath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/%s", execPath, LOGIC_CLUSTER_LIST);
securec_check_intval(rcs, (void)rcs);
rc = snprintf_s(g_monitorLockfile, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/bin/om_monitor.lock", execPath);
securec_check_intval(rc, (void)rcs);
check_input_for_security(g_monitorLockfile);
canonicalize_path(g_monitorLockfile);
}
return 0;
}
bool CheckOfflineNode()
{
char env[MAX_PATH_LEN] = {0};
if (cmmonitor_getenv("DORADO_REARRANGE", env, sizeof(env)) != EOK) {
write_runlog(LOG, "Line:%d Get DORADO_REARRANGE failed, please check.\n", __LINE__);
return false;
}
if (strcmp(env, "offline") == 0) {
write_runlog(LOG, "Line:%d DORADO_REARRANGE is offline.\n", __LINE__);
return true;
}
return false;
}
/*
 * Work out the etcd log directory and current log file name, then create the directory.
 *
 * For an offline node (see CheckOfflineNode()) the directory is "<etcdDataPath>/etcdlog/etcd";
 * otherwise it is "<GAUSSLOG>/cm/etcd". The results are stored in g_etcdLogPath and
 * g_curEtcdLogFile. The process exits when GAUSSLOG is not available. When the node index
 * cannot be resolved for an offline node the function returns early and leaves
 * g_curEtcdLogFile untouched.
 */
void CreateEtcdLogPath()
{
    char gausslog[MAXPGPATH] = {0};
    /* Clear with the real buffer size; the buffer is declared with MAX_PATH_LEN, not MAXPGPATH. */
    errno_t rc = memset_s(g_etcdLogPath, sizeof(g_etcdLogPath), 0, sizeof(g_etcdLogPath));
    securec_check_errno(rc, (void)rc);
    int rcs = cmmonitor_getenv("GAUSSLOG", gausslog, sizeof(gausslog));
    if (rcs != EOK) {
        (void)fprintf(stderr, "FATAL The environment variable 'GAUSSLOG' was not specified.\n");
        exit(-1);
    }
    CheckEnvValue(gausslog);
    if (CheckOfflineNode()) {
        uint32 nodeIndex = 0;
        int ret = find_node_index_by_nodeid(g_nodeHeader.node, &nodeIndex);
        if (ret != 0) {
            write_runlog(ERROR, "create etcd directory get node index failed!\n");
            return;
        }
        rcs = snprintf_s(g_etcdLogPath,
            sizeof(g_etcdLogPath),
            sizeof(g_etcdLogPath) - 1,
            "%s/etcdlog/",
            g_node[nodeIndex].etcdDataPath);
        securec_check_intval(rcs, (void)rcs);
        /* Create the intermediate "etcdlog" directory; an existing one is not an error. */
        if (mkdir(g_etcdLogPath, S_IRWXU) == 0) {
            write_runlog(LOG, "create etcd directory(%s) successfully.\n", g_etcdLogPath);
        } else if (errno == EEXIST) {
            write_runlog(LOG, "etcd directory(%s) already exists.\n", g_etcdLogPath);
        } else {
            write_runlog(ERROR, "create directory(%s) failed, errno: %d.\n", g_etcdLogPath, errno);
        }
        rcs = strncat_s(g_etcdLogPath, sizeof(g_etcdLogPath), "etcd", strlen("etcd"));
        securec_check_errno(rcs, (void)rcs);
    } else {
        rcs = snprintf_s(g_etcdLogPath, sizeof(g_etcdLogPath), sizeof(g_etcdLogPath) - 1, "%s/cm/etcd", gausslog);
        securec_check_intval(rcs, (void)rcs);
    }
    rcs = snprintf_s(g_curEtcdLogFile, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", g_etcdLogPath, "etcd", curLogFileMark);
    securec_check_intval(rcs, (void)rcs);
    check_input_for_security(g_curEtcdLogFile);
    canonicalize_path(g_curEtcdLogFile);
    /* Create the final log directory; an existing one is not an error. */
    if (mkdir(g_etcdLogPath, S_IRWXU) == 0) {
        write_runlog(LOG, "create etcd directory(%s) successfully.\n", g_etcdLogPath);
    } else if (errno == EEXIST) {
        write_runlog(LOG, "etcd directory(%s) already exists.\n", g_etcdLogPath);
    } else {
        write_runlog(ERROR, "create etcd directory(%s) failed, errno: %d.\n", g_etcdLogPath, errno);
    }
}
int start_cm_agent(void)
{
pid_t pid;
int status;
bool inUpgrade = false;
bool inRollback = false;
struct stat stat_buf = {0};
pid = fork();
if (pid == 0) {
int fd = open(g_curLogFileName, O_RDWR | O_APPEND | O_CREAT, 0600);
if (fd == -1) {
char errBuffer[ERROR_LIMIT_LEN];
write_runlog(ERROR,
"can not open execl call log file: %s %s\n",
g_curLogFileName,
strerror_r(errno, errBuffer, ERROR_LIMIT_LEN));
} else {
(void)dup2(fd, STDOUT_FILENO);
(void)dup2(fd, STDERR_FILENO);
(void)close(fd);
}
if (stat(g_cmUpgradeManualStartPath, &stat_buf) == 0) {
inUpgrade = true;
}
if (stat(g_cmRollbackManualStartPath, &stat_buf) == 0) {
inRollback = true;
}
if (g_isStart || inUpgrade || inRollback) {
status = execl(g_cmAgentBinPath, g_cmAgentBinPath, "normal", (char *)0);
} else {
status = execl(g_cmAgentBinPath, g_cmAgentBinPath, "abnormal", (char *)0);
}
if (status < 0) {
write_runlog(FATAL, "execl cm_agent faild! path is %s\n", g_cmAgentBinPath);
_exit(1);
}
_exit(1);
}
return (int)pid;
}
#ifdef ENABLE_LLT
/* Coverage hook declared with C linkage; server_loop() calls it after starting cm_agent in ENABLE_LLT builds. */
extern "C" {
extern void HLLT_Coverage_SaveCoverageData();
}
#endif
/*
 * Format the current local time as "%Y-%m-%d_%H%M%S" (17 characters plus NUL).
 *
 * timestamp: destination buffer.
 * len:       size of the destination buffer in bytes.
 *
 * Returns 0 on success and -1 when the local time cannot be obtained or the
 * buffer is too small to hold the formatted text.
 */
static int get_current_timestamp(char *timestamp, size_t len)
{
    time_t currentTime = time(NULL);
    struct tm *t = localtime(&currentTime);
    if (t == NULL) {
        return -1;
    }
    /* strftime() returns 0 when the result does not fit; the buffer content is then unusable. */
    if (strftime(timestamp, len, "%Y-%m-%d_%H%M%S", t) == 0) {
        return -1;
    }
    return 0;
}
static void CreateEtcdLog()
{
if (access(g_curEtcdLogFile, F_OK) != -1) {
return;
}
char createTime[LEN_TIMESTAMP] = {0};
char buff[LOG_MAX_TIMELEN];
size_t counter;
int rcs;
mode_t oumask;
if (get_current_timestamp(createTime, LEN_TIMESTAMP) != 0) {
write_runlog(ERROR, "create etcd log get timestamp error\n");
return;
}
rcs = snprintf_s(buff, LOG_MAX_TIMELEN, LOG_MAX_TIMELEN - 1, "log_file_create_time=%s\n", createTime);
securec_check_intval(rcs, (void)rcs);
oumask = umask((mode_t)((~(mode_t)(S_IRUSR | S_IWUSR | S_IXUSR)) & (S_IRWXU | S_IRWXG | S_IRWXO)));
FILE *etcdLogFile = fopen(g_curEtcdLogFile, "w+");
(void)umask(oumask);
if (etcdLogFile == NULL) {
write_runlog(ERROR, "create etcd log file failed! errno is %s\n", strerror(errno));
return;
}
counter = fwrite(buff, sizeof(char), LOG_MAX_TIMELEN, etcdLogFile);
write_runlog(LOG, "counter is %lu\n", counter);
(void)fclose(etcdLogFile);
}
static void switch_ETCD_logfile()
{
struct stat statBuf = {0};
if (stat(g_curEtcdLogFile, &statBuf) == 0 && statBuf.st_size > maxLogFileSize) {
char command[MAXPGPATH * 4] = {0};
char hstEtcdLogFile[MAXPGPATH] = {0};
int rcs;
char createTime[LEN_TIMESTAMP] = {0};
if (get_current_timestamp(createTime, LEN_TIMESTAMP) != 0) {
write_runlog(ERROR, "get timestamp error\n");
return;
}
rcs = snprintf_s(hstEtcdLogFile, MAXPGPATH, MAXPGPATH - 1, "%s/%s-%s.log", g_etcdLogPath, "etcd", createTime);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(command,
4 * MAXPGPATH,
(4 * MAXPGPATH) - 1,
"cp %s %s;echo \"log_file_create_time=%s\" > %s;",
g_curEtcdLogFile, hstEtcdLogFile, createTime, g_curEtcdLogFile);
securec_check_intval(rcs, (void)rcs);
rcs = system(command);
if (rcs != 0) {
write_runlog(ERROR,
"failed to switch ETCD logfile. cmd:%s. return:(%d,%d), errno=%d.\n",
command,
rcs,
WEXITSTATUS(rcs),
errno);
} else {
write_runlog(LOG, "switch ETCD logfile successfully. cmd:%s.\n", command);
}
}
}
#ifndef ENABLE_MULTIPLE_NODES
/*
 * Kill every ltran process by running "killall ltran" in the background, with the
 * command's output appended to the current log file.
 *
 * The command is formatted with the real size of the buffer; the previous code told
 * snprintf_s the buffer was twice as large as it is.
 */
static void KillLtranProcess()
{
    char command[MAXPGPATH] = {0};
    int ret = snprintf_s(command,
        sizeof(command),
        sizeof(command) - 1,
        SYSTEMQUOTE "killall ltran >> \"%s\" 2>&1 &" SYSTEMQUOTE,
        g_curLogFileName);
    securec_check_ss_c(ret, "", "");
    write_runlog(LOG, "kill ltran! command=%s \n", command);
    ret = system(command);
    if (ret != 0) {
        write_runlog(ERROR, "run system command failed %d! %s, errno=%d.\n", ret, command, errno);
    }
}
static void CheckLtranProcessStatus()
{
char command[MAXPGPATH] = {0};
char ltranConfDir[MAX_PATH_LEN] = {0};
int rcs = strcpy_s(ltranConfDir, MAX_PATH_LEN, "/usr/share/libnet/ltran.conf");
securec_check_errno(rcs, (void)rcs);
if (access(g_ltranManualStartPath, 0) != 0) {
if (check_process_status(PROCKIND_ITRAN, 0) == PROCESS_NOT_EXIST) {
write_runlog(DEBUG1, "The result for checking libnet nic is sucessfully\n");
int rc = snprintf_s(command,
MAXPGPATH,
MAXPGPATH - 1,
"ltran --config-file %s >> \"%s\" 2>&1 &",
ltranConfDir,
g_curLogFileName);
securec_check_intval(rc, (void)rc);
write_runlog(LOG, "ltran START system(command:%s)\n", command);
rc = system(command);
if (rc != 0) {
write_runlog(ERROR, "run system command failed %d! %s, errno=%d.\n", rcs, command, errno);
}
}
} else {
if (check_process_status(PROCKIND_ITRAN, 0) == PROCESS_RUNNING) {
KillLtranProcess();
}
}
}
#endif
/*
 * Main supervision loop of the monitor.
 *
 * Each iteration reaps exited children, checks that the monitor lock file still exists,
 * restarts cm_agent when it is missing and check_start_request() allows it, raises or
 * resumes the cm_agent startup alarm, checks etcd, rotates the etcd log, optionally
 * supervises ltran, cleans the alarm log and then sleeps for MONITOR_CHECK_INTERVAL.
 *
 * Returns -1 when the current user name cannot be determined and 1 when the monitor
 * lock file has disappeared; otherwise it loops forever.
 */
int server_loop(void)
{
    int status;
    int pid;
    struct stat stat_buf = {0};
    char agentPidFile[MAXPGPATH] = {0};
    /* Failed-start counter for cm_agent; reset whenever cm_agent is seen running. */
    int startCMACount = 0;
    int startRetryTimes = 3;
    AlarmAdditionalParam tempAdditionalParam;
    struct passwd *pw = getpwuid(getuid());
    errno_t rc = snprintf_s(agentPidFile, MAXPGPATH, MAXPGPATH - 1, "%s/%s", g_agentConfigPath, CM_AGENT_PID_FILE);
    securec_check_intval(rc, (void)rc);
    check_input_for_security(agentPidFile);
    canonicalize_path(agentPidFile);
    if (pw == NULL || pw->pw_name == NULL) {
        write_runlog(FATAL, "can not get current user name.\n");
        return -1;
    }
    for (;;) {
        /* Non-blocking reap of any exited child. */
        pid = waitpid(-1, &status, WNOHANG);
        if (pid > 0) {
            write_runlog(LOG, "child process cm_agent have die! pid is %d, exit status is %d\n ", pid, status);
            if (WIFEXITED(status)) {
                write_runlog(LOG, "cm_agent exited, status=%d\n", WEXITSTATUS(status));
            } else if (WIFSIGNALED(status)) {
                write_runlog(LOG, "cm_agent killed by signal %d\n", WTERMSIG(status));
            }
            /* Fault statistics are only updated when the reaped child is the tracked cm_agent. */
            if (pid == g_cmAgentPid) {
                if (status != 0) {
                    g_agentFaultCount++;
                    g_previousStatus = status;
                } else {
                    g_agentFaultCount = 0;
                    g_previousStatus = 0;
                }
            }
            delete_lock_file(agentPidFile);
        }
        /* Losing the monitor lock file is the signal for this monitor to stop. */
        if (stat(g_monitorLockfile, &stat_buf) != 0) {
            write_runlog(LOG, "The monitor lock file doesn't exist, process exit.\n");
            return 1;
        }
        if (stat(g_cmManualStartPath, &stat_buf) == 0) {
            g_isStart = true;
        }
        /* cm_agent processes whose parent is not this monitor may be killed inside this call. */
        status = check_process_status(PROCKIND_CMAGENT, g_myProcPid);
        if (status == PROCESS_NOT_EXIST) {
            write_runlog(DEBUG5, "child process(%d) cm_agent have exit\n ", g_cmAgentPid);
            /* Missing cm_agent only counts as a failed start when the manual-start flag is absent. */
            if (access(g_cmManualStartPath, 0) != 0) {
                ++startCMACount;
            }
            if (startCMACount >= startRetryTimes) {
                if (g_startupAlarmList != NULL) {
                    WriteAlarmAdditionalInfo(&tempAdditionalParam,
                        "cma",
                        "",
                        "",
                        "",
                        &(g_startupAlarmList[g_startupAlarmListSize - 1]),
                        ALM_AT_Fault);
                    AlarmReporter(
                        &(g_startupAlarmList[g_startupAlarmListSize - 1]), ALM_AT_Fault, &tempAdditionalParam);
                }
            }
            if (check_start_request()) {
                g_cmAgentPid = start_cm_agent();
                write_runlog(LOG, "cm_agent start, pid is %d\n ", g_cmAgentPid);
                if (create_lock_file(agentPidFile, g_agentConfigPath, g_cmAgentPid) != 0) {
                    write_runlog(WARNING, "failed to create the cm agent pid file.\n");
                }
                /* Rebuild all home-derived paths whenever the counter is a multiple of 20 (including 0). */
                if (startCMACount % 20 == 0) {
                    status = get_prog_path();
                    if (status < 0) {
                        (void)fprintf(stderr, "FATAL get_prog_path failed!\n");
                        exit(status);
                    } else {
                        write_runlog(LOG, "Reload env, agent path is %s.\n", g_cmAgentBinPath);
                    }
                }
                /* One-off diagnostic when the counter reaches exactly 100. */
                if (startCMACount == 100) {
                    write_runlog(LOG, "Monitor has started agent for 5 minutes, agent path is %s.\n", g_cmAgentBinPath);
                    char execPath[MAX_PATH_LEN] = {0};
                    int rcs = GetHomePath(execPath, sizeof(execPath));
                    if (rcs != EOK) {
                        (void)fprintf(stderr, "FATAL The environment variable 'GAUSSHOME' was not specified.\n");
                        exit(-1);
                    }
                    write_runlog(LOG, "env is %s.\n", execPath);
                }
#ifdef ENABLE_LLT
                HLLT_Coverage_SaveCoverageData();
#endif
            }
        } else if (status == PROCESS_RUNNING) {
            /* cm_agent is alive: reset the counter and report the alarm as resumed. */
            startCMACount = 0;
            if (g_startupAlarmList != NULL) {
                WriteAlarmAdditionalInfo(&tempAdditionalParam,
                    "cma",
                    "",
                    "",
                    "",
                    &(g_startupAlarmList[g_startupAlarmListSize - 1]),
                    ALM_AT_Resume);
                AlarmReporter(&(g_startupAlarmList[g_startupAlarmListSize - 1]), ALM_AT_Resume, &tempAdditionalParam);
            }
        }
        check_ETCD_process_status(&tempAdditionalParam, pw->pw_name);
        switch_ETCD_logfile();
#ifndef ENABLE_MULTIPLE_NODES
        /* ltran is only supervised when the libnet flag file exists and this node has datanodes. */
        if (stat(g_libnetManualStartFile, &stat_buf) == 0 && g_currentNode->datanodeCount > 0) {
            CheckLtranProcessStatus();
        }
#endif
        if (stat(g_cmManualStartPath, &stat_buf) != 0) {
            g_isStart = false;
        }
        clean_system_alarm_log(system_alarm_log, sys_log_path);
        cm_sleep(MONITOR_CHECK_INTERVAL);
    }
}
/*
 * etcd 3.3.23 supports "--log-output", but the option was renamed to "--log-outputs" in etcd 3.5.0.
 * The monitor must support both the etcd 3.3.23 and the etcd 3.5.0 command line while a cluster is upgraded.
 */
int GetEtcdLogOutputCmd(char *logOutPutCmd, uint32 len)
{
char command[MAXPGPATH * 2] = {0};
int rcs;
int ret;
rcs = snprintf_s(
command, sizeof(command), sizeof(command) - 1, "%s --help | grep \"\\-\\-log-outputs\"", g_etcdBinPath);
securec_check_intval(rcs, (void)rcs);
ret = system(command);
if (ret == 0) {
write_runlog(LOG, "run check etcd log-outputs command: %s success\n", command);
rcs = strcpy_s(logOutPutCmd, len, "--log-outputs");
securec_check_intval(rcs, (void)rcs);
return 0;
}
write_runlog(LOG, "run check etcd log-outputs command %s failed,try to check log-output command!\n", command);
rcs = snprintf_s(
command, sizeof(command), sizeof(command) - 1, "%s --help | grep \"\\-\\-log-output\"", g_etcdBinPath);
securec_check_intval(rcs, (void)rcs);
ret = system(command);
if (ret == 0) {
write_runlog(LOG, "run check etcd log-output command: %s success\n", command);
rcs = strcpy_s(logOutPutCmd, len, "--log-output");
securec_check_intval(rcs, (void)rcs);
return 0;
}
write_runlog(WARNING, "run check etcd log-output command %s failed %d!\n", command, ret);
return -1;
}
/*
 * React to the current value of g_startEtcdCount.
 *
 * - From 3 upwards the etcd startup alarm (slot 0 of g_startupAlarmList) is reported as a fault.
 * - On every multiple of 20 the home-derived paths are rebuilt; the process exits if that fails.
 * - At exactly 100 the etcd binary path and the home path are logged; the process exits
 *   if the home path cannot be read.
 */
void CheckStartEtcdCount(AlarmAdditionalParam *additionalParam)
{
    if (g_startEtcdCount >= 3 && g_startupAlarmList != NULL) {
        Alarm *etcdAlarm = &(g_startupAlarmList[0]);
        WriteAlarmAdditionalInfo(additionalParam, "etcd", "", "", "", etcdAlarm, ALM_AT_Fault);
        AlarmReporter(etcdAlarm, ALM_AT_Fault, additionalParam);
    }

    if (g_startEtcdCount % 20 == 0) {
        const int reloadRet = get_prog_path();
        if (reloadRet < 0) {
            (void)fprintf(stderr, "FATAL get_prog_path failed!\n");
            exit(reloadRet);
        }
        write_runlog(LOG, "Reload env, ETCD path is %s.\n", g_etcdBinPath);
    }

    if (g_startEtcdCount != 100) {
        return;
    }

    write_runlog(LOG, "Monitor has started ETCD for 5 minutes, ETCD path is %s.\n", g_etcdBinPath);
    char homePath[MAX_PATH_LEN] = {0};
    if (GetHomePath(homePath, sizeof(homePath)) != EOK) {
        (void)fprintf(stderr, "FATAL The environment variable 'GAUSSHOME' was not specified.\n");
        exit(-1);
    }
    write_runlog(LOG, "env is %s.\n", homePath);
}
/*
 * Start, supervise or stop the etcd instance that belongs to the current node.
 *
 * Nothing is done unless the current node is configured with etcd. The
 * presence of the file g_etcdManualStartPath selects the mode:
 *  - file absent: etcd is expected to run. If the process does not exist, a
 *    shell command line is assembled from the static configuration and run in
 *    the background through system(). If it is already running, the startup
 *    alarm is resumed and (ENABLE_MULTIPLE_NODES builds only) the etcd process
 *    is attached to the CM cgroup once.
 *  - file present: etcd is expected to be stopped, so a running etcd is killed.
 *
 * additionalParam: alarm parameter buffer handed to the alarm reporting calls.
 * userName: appended to the "-initial-cluster-token etcd-cluster-" option.
 */
static void check_ETCD_process_status(AlarmAdditionalParam *additionalParam, const char *userName)
{
    int status = 0;
    if (!g_currentNode->etcd) {
        return;
    }

    char command[MAXPGPATH * 2] = {0};
    int rcs;
    int ret;
    /*
     * Remember how many times we have killed etcd. Once MONITOR_CHECK_INTERVAL*90
     * seconds have passed, try to kill etcd forcibly while it is still alive.
     */
    static int haveKillEtcdCount = 0;
    static const int forceKillEtcd = 90;
    if (access(g_etcdManualStartPath, 0) != 0) {
        status = check_process_status(PROCKIND_ETCD, 0);
        if (status == PROCESS_NOT_EXIST) {
            uint32 currNodeIndex = 0;
            ret = find_node_index_by_nodeid(g_nodeHeader.node, &currNodeIndex);
            if (ret != 0) {
                write_runlog(ERROR, "check ETCD process get node index failed!\n");
                return;
            }
            char logOutPutCmd[MAXPGPATH] = {0};
            if (GetEtcdLogOutputCmd(logOutPutCmd, MAXPGPATH) != 0) {
                write_runlog(
                    ERROR, "get etcd log-output command failed, please check etcd bin file %s!\n", g_etcdBinPath);
                /* A failed attempt still counts as a start attempt. */
                ++g_startEtcdCount;
                CheckStartEtcdCount(additionalParam);
                return;
            }
            /* Comma separated list of https client URLs, one per configured listen IP. */
            char clientUrls[CM_IP_LENGTH * CM_IP_NUM] = {0};
            for (uint32 ipnum = 0; ipnum < CM_IP_NUM; ipnum++) {
                if (strlen(g_node[currNodeIndex].etcdClientListenIPs[ipnum]) == 0) {
                    break;
                }
                char single_url[CM_IP_LENGTH] = {0};
                rcs = snprintf_s(single_url, CM_IP_LENGTH, CM_IP_LENGTH - 1,
                    SYSTEMQUOTE "https://%s:%u" SYSTEMQUOTE,
                    g_node[currNodeIndex].etcdClientListenIPs[ipnum], g_node[currNodeIndex].etcdClientListenPort);
                securec_check_intval(rcs, (void)rcs);
                if ((ipnum + 1) < g_node[currNodeIndex].etcdClientListenIPCount) {
                    rcs = strncat_s(single_url, CM_IP_LENGTH, ",", strlen(","));
                    securec_check_errno(rcs, (void)rcs);
                }
                rcs = strncat_s(clientUrls, CM_IP_LENGTH * CM_IP_NUM, single_url, strlen(single_url));
                securec_check_errno(rcs, (void)rcs);
            }
            /*
             * Fixed part of the command line. It ends with "-initial-cluster " so that the
             * member list can be appended right behind it.
             */
            rcs = snprintf_s(command,
                2 * MAXPGPATH,
                (2 * MAXPGPATH) - 1,
                SYSTEMQUOTE "umask=`umask`;umask 0077;%s -name %s --data-dir %s "
                "--client-cert-auth --trusted-ca-file %s --cert-file %s/etcd.crt --key-file %s/etcd.key "
                "--peer-client-cert-auth --peer-trusted-ca-file %s --peer-cert-file %s/etcd.crt --peer-key-file "
                "%s/etcd.key "
                "-initial-advertise-peer-urls https://%s:%u -listen-peer-urls https://%s:%u "
                "-listen-client-urls %s -advertise-client-urls %s --election-timeout 5000 "
                "--heartbeat-interval 1000 %s 'stdout' --quota-backend-bytes $((8*1024*1024*1024)) "
                "--auto-compaction-mode 'periodic' --auto-compaction-retention '1h' "
                "-initial-cluster-token etcd-cluster-%s --enable-v2=false -initial-cluster " SYSTEMQUOTE,
                g_etcdBinPath,
                g_node[currNodeIndex].etcdName,
                g_node[currNodeIndex].etcdDataPath,
                g_tlsPath.etcd_ca_path,
                g_node[currNodeIndex].etcdDataPath,
                g_node[currNodeIndex].etcdDataPath,
                g_tlsPath.etcd_ca_path,
                g_node[currNodeIndex].etcdDataPath,
                g_node[currNodeIndex].etcdDataPath,
                g_node[currNodeIndex].etcdHAListenIPs[0],
                g_node[currNodeIndex].etcdHAListenPort,
                g_node[currNodeIndex].etcdHAListenIPs[0],
                g_node[currNodeIndex].etcdHAListenPort,
                clientUrls,
                clientUrls,
                logOutPutCmd,
                userName);
            securec_check_intval(rcs, (void)rcs);
            /* Append "name=https://ip:port" for every node that hosts etcd, comma separated. */
            uint32 j = 0;
            for (uint32 i = 0; i < g_node_num; i++) {
                if (g_node[i].etcd) {
                    char port[MAX_PORT_LEN];
                    if (j++ > 0) {
                        rcs = strncat_s(command, 2 * MAXPGPATH, ",", strlen(","));
                        securec_check_errno(rcs, (void)rcs);
                    }
                    rcs = strncat_s(command, 2 * MAXPGPATH, g_node[i].etcdName, strlen(g_node[i].etcdName));
                    securec_check_errno(rcs, (void)rcs);
                    rcs = strncat_s(command, 2 * MAXPGPATH, "=https://", strlen("=https://"));
                    securec_check_errno(rcs, (void)rcs);
                    rcs = strncat_s(
                        command, 2 * MAXPGPATH, g_node[i].etcdHAListenIPs[0], strlen(g_node[i].etcdHAListenIPs[0]));
                    securec_check_errno(rcs, (void)rcs);
                    rcs = strncat_s(command, 2 * MAXPGPATH, ":", strlen(":"));
                    securec_check_errno(rcs, (void)rcs);
                    rcs = snprintf_s(port, MAX_PORT_LEN, MAX_PORT_LEN - 1, "%u", g_node[i].etcdHAListenPort);
                    securec_check_intval(rcs, (void)rcs);
                    rcs = strncat_s(command, 2 * MAXPGPATH, port, strlen(port));
                    securec_check_errno(rcs, (void)rcs);
                }
            }
            /*
             * the replaced ETCD node must be started with flag "-initial-cluster-state existing" so that
             * new node can sync data from other members.
             */
            if (access(g_etcdReplacedPath, 0) != 0) {
                rcs = strncat_s(command,
                    2 * MAXPGPATH,
                    " -initial-cluster-state new >> \"",
                    strlen(" -initial-cluster-state new >> \""));
                securec_check_errno(rcs, (void)rcs);
            } else {
                rcs = strncat_s(command,
                    2 * MAXPGPATH,
                    " -initial-cluster-state existing >> \"",
                    strlen(" -initial-cluster-state existing >> \""));
                securec_check_errno(rcs, (void)rcs);
                /* The flag file is consumed by this start attempt. */
                ret = unlink(g_etcdReplacedPath);
                if (ret != 0) {
                    write_runlog(ERROR, "could not remove etcd_replaced file: %d.\n", errno);
                }
            }
            /* Redirect etcd output to the current etcd log, run in background, restore the umask. */
            rcs = strncat_s(command, 2 * MAXPGPATH, g_curEtcdLogFile, strlen(g_curEtcdLogFile));
            securec_check_errno(rcs, (void)rcs);
            rcs = strncat_s(command, 2 * MAXPGPATH, "\" 2>&1 & umask $umask", strlen("\" 2>&1 & umask $umask"));
            securec_check_errno(rcs, (void)rcs);
            ret = system(command);
            /* Capture errno now: the logging call below may overwrite it. */
            const int systemErrno = errno;
            write_runlog(LOG, "run etcd command: %s \n", command);
            if (ret != 0) {
                write_runlog(ERROR, "run system command failed %d! %s, errno=%d.\n", ret, command, systemErrno);
            }
            /* A freshly started etcd has a new pid, so it must be attached to the cgroup again. */
            g_isAttachToCgroup = false;
            haveKillEtcdCount = 0;
            ++g_startEtcdCount;
            CheckStartEtcdCount(additionalParam);
        } else if (status == PROCESS_RUNNING) {
            g_startEtcdCount = 0;
            if (g_startupAlarmList != NULL) {
                WriteAlarmAdditionalInfo(additionalParam, "etcd", "", "", "", &(g_startupAlarmList[0]),
                    ALM_AT_Resume);
                AlarmReporter(&(g_startupAlarmList[0]), ALM_AT_Resume, additionalParam);
            }
#ifdef ENABLE_MULTIPLE_NODES
            /* Attach etcd to the CM cgroup once, unless the no-cgroup flag file exists. */
            struct stat stat_buf1 = {0};
            int retStat = stat(g_noCgroupFlag, &stat_buf1);
            if (!g_isAttachToCgroup && retStat != 0) {
                char buf[64] = {0};
                int etcd_pid = 0;
                rcs = snprintf_s(command,
                    2 * MAXPGPATH,
                    2 * MAXPGPATH - 1,
                    "status=`ps x|grep '%s' |grep -v 'grep' | awk '{print $1}'`;echo \"$status\"",
                    g_etcdBinPath);
                securec_check_intval(rcs, (void)rcs);
                write_runlog(LOG, "get etcd processid. command=%s.\n", command);
                FILE *fp = popen(command, "r");
                if (fp == NULL) {
                    write_runlog(ERROR, "get etcd processid failed. command=%s.\n", command);
                    return;
                } else {
                    /*
                     * NOTE(review): only the first output line is parsed and the result is not
                     * validated; an empty line yields pid 0 and still marks etcd as attached.
                     */
                    if (fgets(buf, sizeof(buf), fp) != NULL) {
                        write_runlog(LOG, "etcd processid is %s.\n", buf);
                        etcd_pid = (int)strtol(buf, NULL, 10);
                        char *cmcgroup_relpath = gscgroup_cm_init();
                        if (cmcgroup_relpath != NULL) {
                            gscgroup_cm_attach_task_pid(cmcgroup_relpath, etcd_pid);
                            free(cmcgroup_relpath);
                        }
                        g_isAttachToCgroup = true;
                    }
                    (void)pclose(fp);
                }
            }
#endif
        }
    } else {
        if (check_process_status(PROCKIND_ETCD, 0) == PROCESS_RUNNING) {
            /* While the replaced flag file exists, count the checks; otherwise reset the counter. */
            if (access(g_etcdReplacedPath, 0) == 0) {
                g_replaceEtcdCount++;
            } else {
                g_replaceEtcdCount = 0;
            }
            haveKillEtcdCount++;
            /*
             * Kill immediately when no replacement is pending, or once the pending
             * replacement has been seen more than TRY_COUNT_FOR_KILL_ETCD_REPLACE times.
             */
            if (g_replaceEtcdCount > TRY_COUNT_FOR_KILL_ETCD_REPLACE || g_replaceEtcdCount == 0) {
                g_replaceEtcdCount = 0;
                /* Switch to signal 9 once etcd has survived forceKillEtcd checks. */
                rcs = snprintf_s(command,
                    2 * MAXPGPATH,
                    2 * MAXPGPATH - 1,
                    SYSTEMQUOTE "killall %s etcd" SYSTEMQUOTE,
                    (haveKillEtcdCount >= forceKillEtcd) ? "-s 9" : "");
                securec_check_intval(rcs, (void)rcs);
                /* Send killall output to the monitor log only if that log file exists. */
                struct stat statBuf = {0};
                if (stat(g_curLogFileName, &statBuf) == 0) {
                    rcs = strncat_s(command, 2 * MAXPGPATH, " >> ", strlen(" >> "));
                    securec_check_errno(rcs, (void)rcs);
                    rcs = strncat_s(command, 2 * MAXPGPATH, g_curLogFileName, strlen(g_curLogFileName));
                    securec_check_errno(rcs, (void)rcs);
                    rcs = strncat_s(command, 2 * MAXPGPATH, " 2>&1", strlen(" 2>&1"));
                    securec_check_errno(rcs, (void)rcs);
                }
                write_runlog(LOG, "kill etcd! command=%s \n", command);
                ret = system(command);
                if (ret != 0) {
                    write_runlog(ERROR, "run system command failed %d! %s, errno=%d.\n", ret, command, errno);
                }
            }
        }
    }
}
/*
 * @Description: get the mount points of all cgroup subsystems.
 * @IN void
 * @Return: 0: normal -1: abnormal
 * @See also:
 */
/* Number of cgroup subsystems whose mount point is tracked (cpu and cpuset). */
#define MOUNT_SUBSYS_KINDS (2)
/* Mount option names that identify the tracked subsystems in /proc/mounts. */
#define MOUNT_CPU_NAME "cpu"
#define MOUNT_CPUSET_NAME "cpuset"
/* Path components used to build "<mount>/Gaussdb:<user>" and "<mount>/Gaussdb:<user>/Class". */
#define GSCGROUP_TOP_DATABASE "Gaussdb"
#define GSCGROUP_TOP_CLASS "Class"
/* Mount directory per subsystem: index 0 is cpu, index 1 is cpuset; an empty string means not found. */
static char g_mpoints[MOUNT_SUBSYS_KINDS][MAXPGPATH];
int OmGetMountPoints(void)
{
struct mntent *ent;
char mntentBuffer[2 * FILENAME_MAX];
struct mntent tempEnt;
int i;
const char *subsysTable[] = {MOUNT_CPU_NAME, MOUNT_CPUSET_NAME};
errno_t rc;
rc = memset_s(g_mpoints, MOUNT_SUBSYS_KINDS * MAXPGPATH, 0, MOUNT_SUBSYS_KINDS * MAXPGPATH);
securec_check_errno(rc, (void)rc);
FILE *procMount = fopen("/proc/mounts", "re");
if (procMount == NULL) {
return -1;
}
while ((ent = getmntent_r(procMount, &tempEnt, mntentBuffer, sizeof(mntentBuffer))) != NULL) {
if (strcmp(ent->mnt_type, "cgroup") != 0) {
continue;
}
for (i = 0; i < MOUNT_SUBSYS_KINDS; ++i) {
if (hasmntopt(ent, subsysTable[i]) == NULL) {
continue;
}
rc = snprintf_s(g_mpoints[i], MAXPGPATH, MAXPGPATH - 1, "%s", ent->mnt_dir);
securec_check_intval(rc, (void)fclose(procMount));
}
}
(void)fclose(procMount);
return 0;
}
void CheckCgroupInstallation(void)
{
int ret = 0;
errno_t rc;
struct passwd *passwdUser = getpwuid(geteuid());
if (passwdUser == NULL) {
write_runlog(ERROR,
"can't get the passwdUser. "
"HINT: please check the running user!\n");
return;
}
rc = memset_s(g_mpoints, MOUNT_SUBSYS_KINDS * MAXPGPATH, 0, MOUNT_SUBSYS_KINDS * MAXPGPATH);
securec_check_errno(rc, (void)rc);
if (*g_mpoints[0] == '\0' || *g_mpoints[1] == '\0') {
ret = OmGetMountPoints();
if (ret == -1 || *g_mpoints[0] == '\0' || *g_mpoints[1] == '\0') {
write_runlog(ERROR,
"can't get the mount points\n"
"Please check if cgroup has been mounted and user's cgroup data has been created!\n");
return;
}
}
if (*g_mpoints[0] && *g_mpoints[1]) {
struct stat buf = {0};
char cgpath[MAXPGPATH];
ret = 0;
for (int i = 0; i < MOUNT_SUBSYS_KINDS && ret == 0; ++i) {
rc = snprintf_s(
cgpath, MAXPGPATH, MAXPGPATH - 1, "%s/%s:%s", g_mpoints[i], GSCGROUP_TOP_DATABASE, passwdUser->pw_name);
securec_check_intval(rc, (void)rc);
ret = stat(cgpath, &buf);
if (ret == 0) {
rc = snprintf_s(cgpath,
MAXPGPATH,
MAXPGPATH - 1,
"%s/%s:%s/%s",
g_mpoints[i],
GSCGROUP_TOP_DATABASE,
passwdUser->pw_name,
GSCGROUP_TOP_CLASS);
securec_check_intval(rc, (void)rc);
ret = stat(cgpath, &buf);
if (ret != 0) {
write_runlog(ERROR,
"can't get the %s,\n"
"Please check if cgroup has been mounted and user's cgroup data has been created!\n",
cgpath);
}
} else {
write_runlog(ERROR,
"can't get the %s,\n"
"Please check if cgroup has been mounted and user's cgroup data has been created!\n",
cgpath);
}
}
}
}
/* Print a one-line hint on stderr that points the user to the --help option. */
static void DoAdvice(void)
{
    const char *prog = g_progname;

    (void)fprintf(stderr, "Try \"%s --help\" for more information.\n", prog);
}
/* Print the usage summary and the common options on stdout. */
static void DoHelp(void)
{
    const char *prog = g_progname;

    (void)printf(_("%s is a utility to start an agent or a WMP.\n\n"), prog);

    /* Supported invocation forms. */
    (void)printf(_("Usage:\n"));
    (void)printf(_(" %s 0 -L FILENAME\n"), prog);
    (void)printf(_(" %s 1\n"), prog);
    (void)printf(_(" %s 2 -L FILENAME\n"), prog);
    (void)printf(_(" %s 3\n"), prog);
    (void)printf(_(" %s -L FILENAME\n"), prog);

    /* Options shared by all forms. */
    (void)printf(_("\nCommon options:\n"));
    (void)printf(_(" -?, --help show this help, then exit\n"));
    (void)printf(_(" -V, --version output version information, then exit\n"));
}
void InitLogFiles()
{
int rcs = 0;
errno_t rc = memset_s(g_curLogFileName, MAXPGPATH, 0, MAXPGPATH);
securec_check_errno(rc, (void)rc);
rc = memset_s(sys_log_path, MAXPGPATH, 0, MAXPGPATH);
securec_check_errno(rc, (void)rc);
canonicalize_path(g_alarmConfigPath);
GetAlarmConfig(g_alarmConfigPath);
if (g_logFileSet) {
(void)logfile_init();
rcs = snprintf_s(
g_curLogFileName, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s.log", g_logFile, g_progname, curLogFileMark);
securec_check_intval(rcs, (void)rcs);
rc = strncpy_s(sys_log_path, MAX_PATH_LEN, g_logFile, strlen(g_logFile));
securec_check_errno(rc, (void)rc);
} else {
char exec_path[MAX_PATH_LEN] = {0};
rcs = GetHomePath(exec_path, sizeof(exec_path));
if (rcs != EOK) {
(void)fprintf(stderr, "FATAL Get GAUSSHOME failed, please check.\n");
exit(-1);
} else {
rcs = snprintf_s(sys_log_path, sizeof(sys_log_path), MAX_PATH_LEN - 1, "%s/bin", exec_path);
securec_check_intval(rcs, (void)rcs);
rcs = snprintf_s(
g_curLogFileName, MAXPGPATH, MAXPGPATH - 1, "%s/%s-%s.log", sys_log_path, g_progname, curLogFileMark);
securec_check_intval(rcs, (void)rcs);
}
}
check_input_for_security(g_curLogFileName);
canonicalize_path(g_curLogFileName);
(void)mkdir(sys_log_path, S_IRWXU);
syslogFile = logfile_open(sys_log_path, "a");
if (syslogFile == NULL) {
(void)printf("monitor_main,open log file failed\n");
}
}
static void CheckDirExist()
{
char gausslog[MAXPGPATH] = {0};
char cmlog[MAXPGPATH] = {0};
char monitorlog[MAXPGPATH] = {0};
char serverlog[MAXPGPATH] = {0};
char agentlog[MAXPGPATH] = {0};
int rcs = cmmonitor_getenv("GAUSSLOG", gausslog, sizeof(gausslog));
if (rcs != EOK) {
(void)fprintf(stderr, "FATAL The environment variable 'GAUSSLOG' was not specified.\n");
exit(-1);
}
if (access(gausslog, F_OK) != 0) {
write_runlog(ERROR, "FATAL access %s return error %d \n", gausslog, errno);
exit(-1);
}
rcs = snprintf_s(cmlog, MAXPGPATH, MAXPGPATH - 1, "%s/cm", gausslog);
securec_check_intval(rcs, (void)rcs);
if (access(cmlog, F_OK) != 0) {
rcs = mkdir(cmlog, S_IRWXU);
if (rcs != 0) {
write_runlog(ERROR, "FATAL mkdir %s return error %d \n", cmlog, errno);
exit(-1);
}
rcs = snprintf_s(monitorlog, MAXPGPATH, MAXPGPATH - 1, "%s/om_monitor", cmlog);
securec_check_intval(rcs, (void)rcs);
rcs = mkdir(monitorlog, S_IRWXU);
if (rcs != 0) {
write_runlog(ERROR, "FATAL mkdir %s return error %d \n", monitorlog, errno);
exit(-1);
}
rcs = snprintf_s(serverlog, MAXPGPATH, MAXPGPATH - 1, "%s/cm_server", cmlog);
securec_check_intval(rcs, (void)rcs);
rcs = mkdir(serverlog, S_IRWXU);
if (rcs != 0) {
write_runlog(ERROR, "FATAL mkdir %s return error %d \n", serverlog, errno);
exit(-1);
}
rcs = snprintf_s(agentlog, MAXPGPATH, MAXPGPATH - 1, "%s/cm_agent", cmlog);
securec_check_intval(rcs, (void)rcs);
rcs = mkdir(agentlog, S_IRWXU);
if (rcs != 0) {
write_runlog(ERROR, "FATAL mkdir %s return error %d \n", agentlog, errno);
exit(-1);
}
}
return;
}
/*
 * om_monitor entry point.
 *
 * Sequence: refuse to run as root, handle --help/--version, parse the -L
 * option, resolve program paths, open the log, make sure no other monitor is
 * running and take the monitor lock, validate the environment (log
 * directories, open-file limit), load the static cluster configuration,
 * initialise alarms and (ENABLE_MULTIPLE_NODES only) cgroups, then enter
 * server_loop(). The process exit status is the value returned by
 * server_loop(), or non-zero on any start-up failure.
 */
int main(int argc, char **argv)
{
#define LIMIT_OPEN_FILE 640000
    static struct option longOptions[] = {{"location", required_argument, NULL, 'L'}, {NULL, 0, NULL, 0}};
    int option_index;
    int c;
    int status;
    int err_no = 0;
    errno_t rc = 0;
    struct rlimit r = {0};
    g_myProcPid = getpid();
    g_myUid = getuid();
    g_progname = "om_monitor";
    prefix_name = g_progname;
    if (g_myUid == 0) {
        (void)printf("current user is the root user (uid = 0), exit.\n");
        return 1;
    }
    (void)syscalllockInit(&g_cmEnvLock);
    GetCmdlineOpt(argc, argv);
    if (argc > 1) {
        if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) {
            DoHelp();
            /* _exit() does not flush stdio buffers; flush so the text survives a pipe or file redirect. */
            (void)fflush(stdout);
            _exit(0);
        } else if (strcmp(argv[1], "-V") == 0 || strcmp(argv[1], "--version") == 0) {
            (void)puts("om_monitor " DEF_CM_VERSION);
            (void)fflush(stdout);
            _exit(0);
        }
    }
    /*
     * Parse options. getopt_long() is restarted after every handled option or
     * non-option argument by bumping optind in the outer loop.
     */
    optind = 1;
    while (optind < argc) {
        while ((c = getopt_long(argc, argv, "L:", longOptions, &option_index)) != -1) {
            if (c == 'L') {
                g_logFileSet = true;
                FREE_AND_RESET(g_logFile);
                g_logFile = strdup(optarg);
                if (g_logFile == NULL) {
                    (void)fprintf(stderr, "%s: -L file is needed.\n", g_progname);
                    DoAdvice();
                    _exit(1);
                }
                check_input_for_security(g_logFile);
                break;
            } else {
                DoAdvice();
                _exit(1);
            }
        }
        optind++;
    }
    /* A log directory makes no sense when logging goes to syslog or is discarded. */
    if (g_logFileSet && (log_destion_choice == LOG_DESTION_SYSLOG || log_destion_choice == LOG_DESTION_DEV_NULL)) {
        (void)fprintf(stderr, "%s: -L option is not needed.\n", g_progname);
        DoAdvice();
        _exit(1);
    }
    status = get_prog_path();
    if (status < 0) {
        (void)fprintf(stderr, "get_prog_path failed!\n");
        _exit(status);
    }
    InitLogFiles();
    /* Leave quietly when a monitor is already running; otherwise take the lock. */
    bool isKillProcess = false;
    status = check_process_status(PROCKIND_MONITOR, 0, &isKillProcess);
    if (status == PROCESS_RUNNING) {
        write_runlog(DEBUG5, "monitor exit\n");
        _exit(0);
    }
    if (MonitorLock(isKillProcess) == -1) {
        write_runlog(DEBUG1, "Another om_monitor command is still running, start failed !\n");
        _exit(-1);
    }
    CheckDirExist();
    print_environ();
    /* Refuse to start when the soft open-file limit is below LIMIT_OPEN_FILE. */
    if (getrlimit(RLIMIT_NOFILE, &r) == 0) {
        if (r.rlim_cur < LIMIT_OPEN_FILE) {
            write_runlog(FATAL,
                "max number of open files limit %lu less than %d, monitor start failed.\n",
                r.rlim_cur,
                LIMIT_OPEN_FILE);
            _exit(1);
        }
    } else {
        write_runlog(FATAL, "failed to getrlimit number of files\n");
        _exit(1);
    }
    init_signal_mask();
    (void)sigprocmask(SIG_SETMASK, &block_sig, NULL);
    /* Load the static cluster configuration; any read problem is fatal. */
    status = read_config_file(g_cmStaticConfigurePath, &err_no);
    char errBuffer[ERROR_LIMIT_LEN] = {0};
    switch (status) {
        case OPEN_FILE_ERROR: {
            write_runlog(FATAL,
                "%s: could not open the static config file \"%s\": %s\n",
                g_progname,
                g_cmStaticConfigurePath,
                strerror_r(err_no, errBuffer, ERROR_LIMIT_LEN));
            _exit(1);
        }
        case READ_FILE_ERROR: {
            write_runlog(FATAL,
                "%s: could not read static config file \"%s\": %s\n",
                g_progname,
                g_cmStaticConfigurePath,
                strerror_r(err_no, errBuffer, ERROR_LIMIT_LEN));
            _exit(1);
        }
        case OUT_OF_MEMORY:
            write_runlog(FATAL, "%s: out of memory\n", g_progname);
            _exit(1);
        default:
            break;
    }
    /* The logic cluster configuration is optional and only read when its list file exists. */
    if (access(g_logicClusterListPath, F_OK) == 0) {
        status = read_logic_cluster_config_files(g_logicClusterListPath, &err_no);
        char errBuf[ERROR_LIMIT_LEN] = {0};
        switch (status) {
            case OPEN_FILE_ERROR: {
                write_runlog(FATAL,
                    "%s: could not open logic cluster static config files: %s\n",
                    g_progname,
                    strerror_r(err_no, errBuf, ERROR_LIMIT_LEN));
                _exit(1);
            }
            case READ_FILE_ERROR: {
                write_runlog(FATAL,
                    "%s: could not read logic cluster static config files: %s\n",
                    g_progname,
                    strerror_r(err_no, errBuf, ERROR_LIMIT_LEN));
                _exit(1);
            }
            case OUT_OF_MEMORY:
                write_runlog(FATAL, "%s: out of memory\n", g_progname);
                _exit(1);
            default:
                break;
        }
    }
    /* Never let the name column width drop below the width of its heading. */
    max_logic_cluster_name_len = (max_logic_cluster_name_len < strlen("logiccluster_name"))
                                     ? (uint32)strlen("logiccluster_name")
                                     : max_logic_cluster_name_len;
    int ret = find_current_node_by_nodeid();
    if (ret != 0) {
        write_runlog(FATAL, "find_current_node_by_nodeid failed, nodeId=%u.\n", g_nodeHeader.node);
        _exit(1);
    }
    /* The agent configuration lives in "<cmDataPath>/<CM_AGENT_BIN_NAME>". */
    rc = memset_s(g_agentConfigPath, MAXPGPATH, 0, MAXPGPATH);
    securec_check_errno(rc, (void)rc);
    rc = snprintf_s(g_agentConfigPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s", g_currentNode->cmDataPath, CM_AGENT_BIN_NAME);
    securec_check_intval(rc, (void)rc);
#ifdef ENABLE_MULTIPLE_NODES
    CheckCgroupInstallation();
#endif
    AlarmEnvInitialize();
    StartupAlarmItemInitialize(g_currentNode);
#ifdef ENABLE_MULTIPLE_NODES
    /* Attach the monitor itself to the CM cgroup unless the no-cgroup flag file exists. */
    struct stat stat_buf = {0};
    int retStat = stat(g_noCgroupFlag, &stat_buf);
    if (retStat != 0) {
        write_runlog(LOG, "om_monitor gscgroup_cm_attach_task.\n");
        char *cmcgroup_relpath = gscgroup_cm_init();
        if (cmcgroup_relpath != NULL) {
            gscgroup_cm_attach_task(cmcgroup_relpath);
            free(cmcgroup_relpath);
        }
    }
#endif
    create_system_alarm_log(sys_log_path);
    if (g_currentNode->etcd) {
        CreateEtcdLogPath();
        CreateEtcdLog();
    }
    status = server_loop();
    (void)MonitorUnlock();
    _exit(status);
}
* @brief
* Check whether the CM Agent meets the startup conditions.
*
* 1. Normal Scenario:
* The cluster manual start file does not exist.
* The CM Agent config file can be read by current user.
* The binary file can be execute by current user.
* The config change file "cluster_dilatation_status" does not exist.
*
* 1. Abnormal Exit Scenario:
* The cluster manual start file exists.
* The CM Agent config file can be read by current user.
* The binary file can be execute by current user.
* The config change file "cluster_dilatation_status" does not exist.
* The CM Agent instance exits abnormally at last time.
*
* @return
* Return whether the CM Agent meets the startup conditions.
*/
static bool check_start_request()
{
bool startRequest = false;
static int agentRestartCount = 0;
char agentConfigFile[MAXPGPATH] = {0};
int retryTimes = 3;
int rc = snprintf_s(agentConfigFile, MAXPGPATH, MAXPGPATH - 1, "%s/%s", g_agentConfigPath, CM_AGENT_CONFIG);
securec_check_intval(rc, (void)rc);
const bool disableManualStart = (access(g_cmManualStartPath, F_OK) == 0);
const bool isConfigExist = (access(agentConfigFile, R_OK) == 0);
const bool isBinaryExist = (access(g_cmAgentBinPath, X_OK) == 0);
const bool isConfigChange = (access(g_cmStaticConfigChangeFlagFilePath, F_OK) == 0);
if (!disableManualStart && isBinaryExist && isConfigExist && !isConfigChange) {
startRequest = true;
agentRestartCount = 0;
}
* Need to restart the CM Agent even if the cluster manual start file exists.
*/
if (disableManualStart && isBinaryExist && isConfigExist && !isConfigChange && g_previousStatus != 0 &&
agentRestartCount < retryTimes) {
write_runlog(LOG,
"The CM Agent did not exit correctly last time. Restart the CM Agent"
" to complete the unfinished stop operation: start_times=%d.\n",
g_agentFaultCount);
startRequest = true;
agentRestartCount++;
}
write_runlog(LOG,
"The CM Agent startup check is complete: cluster_manual_start=%d,"
" agent_config_file_r=%d, agent_binary_file_x=%d, config_change_flag=%d, previous_status=%d,"
" start_count=%d.\n",
disableManualStart,
isConfigExist,
isBinaryExist,
isConfigChange,
g_previousStatus,
g_agentFaultCount);
return startRequest;
}
static int MonitorLock(bool isKillProcess)
{
struct stat statbuf = {0};
if (stat(g_monitorLockfile, &statbuf) != 0) {
char content[MAX_PATH_LEN] = {0};
g_lockfile = fopen(g_monitorLockfile, PG_BINARY_W);
if (g_lockfile == NULL) {
write_runlog(
DEBUG1, "%s: can't open lock file \"%s\" : %s\n", g_progname, g_monitorLockfile, strerror(errno));
exit(1);
} else {
if (fwrite(content, MAX_PATH_LEN, 1, g_lockfile) != 1) {
(void)fclose(g_lockfile);
g_lockfile = NULL;
write_runlog(
DEBUG1, "%s: can't write lock file \"%s\" : %s\n", g_progname, g_monitorLockfile, strerror(errno));
exit(1);
}
(void)fclose(g_lockfile);
(void)chmod(g_monitorLockfile, 0600);
g_lockfile = NULL;
}
}
if ((g_lockfile = fopen(g_monitorLockfile, PG_BINARY_W)) == NULL) {
write_runlog(
DEBUG1, "%s: could not open lock file \"%s\" : %s\n", g_progname, g_monitorLockfile, strerror(errno));
exit(1);
}
if (SetFdCloseExecFlag(g_lockfile) < 0) {
write_runlog(DEBUG1, "%s: can't set file flag\"%s\" : %s\n", g_progname, g_monitorLockfile, strerror(errno));
}
const int32 tryTotalTime = 10;
int32 tryTime = tryTotalTime;
int32 ret;
do {
ret = flock(fileno(g_lockfile), LOCK_EX | LOCK_NB);
if (ret == 0 || !isKillProcess) {
break;
}
--tryTime;
cm_sleep(1);
} while (tryTime > 0);
return ret;
}
/*
 * Release the om_monitor lock and close the lock file stream.
 *
 * Returns the result of flock(LOCK_UN), or -1 when no lock file is open
 * (g_lockfile is NULL), in which case nothing is done.
 */
static int MonitorUnlock(void)
{
    /* fileno() must not be called on a NULL stream, so check before unlocking. */
    if (g_lockfile == NULL) {
        return -1;
    }
    int ret = flock(fileno(g_lockfile), LOCK_UN);
    (void)fclose(g_lockfile);
    g_lockfile = NULL;
    return ret;
}