/*
* Copyright (c) 2020 Huawei Technologies Co.,Ltd.
*
* openGauss is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
* ---------------------------------------------------------------------------------------
*
* gscgroup.h
* header file to export the functions to use cgroup
*
* IDENTIFICATION
* src/include/workload/gscgroup.h
*
* ---------------------------------------------------------------------------------------
*/
#ifndef __GS_CGROUPS__
#define __GS_CGROUPS__
#ifndef gettid
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <unistd.h>
#include <sys/syscall.h>
#include "c.h"
#include "gs_threadlocal.h"
#include "datatype/timestamp.h"
/* Fallback used only when gettid is not already a macro: issues the gettid system call and casts the result to pid_t. */
#define gettid() (pid_t) syscall(__NR_gettid)
#endif
#include "securec.h"
#include "securec_check.h"
/*
 * securec_check_berrno
 *   Leave the calling function when a status code is not EOK.
 *
 * rc:      status to test; any value other than EOK counts as failure
 * express: statement executed before returning (may be empty)
 * retval:  operand of the `return` statement (may be empty in a void function)
 *
 * The expansion contains a `return`, so it is only usable inside a function body.
 * It expands to a bare brace block, so an unbraced
 * `if (c) securec_check_berrno(...); else ...` does not compile; brace the call.
 * `rc` is parenthesized so that low-precedence arguments such as `a & b`
 * are compared as a whole.
 */
#define securec_check_berrno(rc, express, retval) \
    {                                             \
        if ((rc) != EOK) {                        \
            express;                              \
            return retval;                        \
        }                                         \
    }
/*
 * check_errno
 *   When `rc` is not EOK: print "<file>:<line> failed on calling security function."
 *   to stderr, run `express`, then `return retval` from the calling function.
 *
 * rc:      status to test; any value other than EOK counts as failure
 * express: statement executed before returning (may be empty)
 * retval:  operand of the `return` statement (may be empty in a void function)
 * file:    string printed with %s
 * line:    integer printed with %d
 *
 * Expands to a bare brace block containing a `return`; use it only inside a
 * function body and brace the call when it is the body of an if/else.
 */
#define check_errno(rc, express, retval, file, line)      \
    {                                                     \
        if ((rc) != EOK) {                                \
            fprintf(stderr,                               \
                "%s:%d failed on calling "                \
                "security function.\n",                   \
                (file),                                   \
                (line));                                  \
            express;                                      \
            return retval;                                \
        }                                                 \
    }
/* securec_check_errno: check_errno reporting the location of the call site (__FILE__, __LINE__). */
#define securec_check_errno(rc, express, retval) check_errno(rc, express, retval, __FILE__, __LINE__)
/*
 * check_intval
 *   When `val` equals -1: print "<file>:<line> failed on calling security function."
 *   to stderr, run `express`, then `return retval` from the calling function.
 *
 * val:     integer result to test; only -1 counts as failure
 * express: statement executed before returning (may be empty)
 * retval:  operand of the `return` statement (may be empty in a void function)
 * file:    string printed with %s
 * line:    integer printed with %d
 *
 * Expands to a bare brace block containing a `return`; use it only inside a
 * function body and brace the call when it is the body of an if/else.
 */
#define check_intval(val, express, retval, file, line)    \
    {                                                     \
        if ((val) == -1) {                                \
            fprintf(stderr,                               \
                "%s:%d failed on calling "                \
                "security function.\n",                   \
                (file),                                   \
                (line));                                  \
            express;                                      \
            return retval;                                \
        }                                                 \
    }
/* securec_check_intval: check_intval reporting the location of the call site (__FILE__, __LINE__). */
#define securec_check_intval(val, express, retval) check_intval(val, express, retval, __FILE__, __LINE__)
/*
* Cgroups are divided into 4 kinds by purpose:
* Backend Cgroup: used to control the backend threads
* Class Cgroup: used to specify the resource percentage for classes
* DefaultWD Cgroup: used to control the default workload threads
* Timeshare Cgroup: used to control the timeshare threads
*
* Usually, the Gaussdb cgroup has 80% of the dynamic resource of the whole system,
* but it cannot go beyond the 95% hard limit.
*
* The resource Ratio for Class and Backend will be 1:2 as default.
*
* Each class can be assigned the specified resource percentage from
* Class cgroup based on requirement.
*
* Under each class cgroup, the default workload cgroup can be assigned the
* specified resource percentage; and its maximum level will be 5 as default.
*
* Each Class cgroup has its own timeshare cgroups.
* The resource for timeshare is the resource remaining in the class,
* so the resource assigned to all default workload cgroups must stay below the user's total.
*
* The logic graph of cgroup is deployed.
* a. /dev/cgroups is the mount point of cgroups
* b. Gaussdb is the top cgroup of Gauss Database
* c. Backend is the cgroup of backend threads
* e. Class is the top cgroup of users' query threads
* f. The name rule for default workload cgroups should be made up with "name:level"
* h. The default Class and default workload cgroup are provided
* i. At most, there are 6 levels for each user (4 levels for defaultwd and 2 levels for timeshare)
* j. There are only 2 cgroups for default workload in each level
* k. Rush/High/Medium/Low are the timeshare cgroups at the bottom of each tree path
/dev/cgroups/ ------- mount point
|
tasks (non-gaussdb threads) Gaussdb/ ------- database topdir
|
Backend/ tasks(no threads) Class/ ------- top class dir
| |
Vacuum/ ... tasks(backend threads) DefaultClass/ Class1/ ... --- class dir
| |
tasks(vacuum thread) DefaultWD:1/ RemainWD:1/ tasks(no threads) --- defaultwd dir (Level 1)
| |
tasks (query) DefaultWD:2/ RemainWD:2/ tasks(no threads) --- defaultwd dir (Level 2)
|
Timeshare:3/ tasks(no threads) --- top timeshare dir (Level 3)
|
Rush:4/ High:4/ Medium:4/ Low:4/ tasks(no threads) --- timeshare dir (Level 4)
|
tasks(query threads)
* Default values:
* 0. The top level cgroup number is 4: mountpoint, Gaussdb, Backend and Class;
* 1. The maximum number for backend cgroup is 16;
* 2. The maximum number for class cgroup is 64;
* 3. The maximum number for defaultwd and timeshare cgroup is 256 (64 * 4 = all timeshare cgroups);
* If a user wants to create a defaultwd cgroup but there is no free slot, the creation fails.
* 4.
*/
/* Number of cgroup subsystems handled, followed by the index (0..4) assigned to each one. */
#define MOUNT_SUBSYS_KINDS 5
#define MOUNT_CPU_ID 0
#define MOUNT_CPUACCT_ID 1
#define MOUNT_BLKIO_ID 2
#define MOUNT_CPUSET_ID 3
#define MOUNT_MEMORY_ID 4
/* Name string of each subsystem; there is one name per MOUNT_*_ID. */
#define MOUNT_CPU_NAME "cpu"
#define MOUNT_BLKIO_NAME "blkio"
#define MOUNT_CPUSET_NAME "cpuset"
#define MOUNT_CPUACCT_NAME "cpuacct"
#define MOUNT_MEMORY_NAME "memory"
#define GSCGROUP_CONF_DIR "etc"
#define GSCFG_BACKUP ".bak"
#define GSCGROUP_MOUNT_POINT "/sys/fs/cgroup"
#define GSCGROUP_MOUNT_POINT_OLD "/dev/cgroups"
#define GSCGROUP_ROOT "Root"
#define GSCGROUP_TOP_DATABASE "Gaussdb"
#define GSCGROUP_CM "CM"
#define GSCGROUP_TOP_BACKEND "Backend"
#define GSCGROUP_TOP_CLASS "Class"
#define GSCGROUP_DEFAULT_CLASS "DefaultClass"
#define GSCGROUP_REMAIN_WORKLOAD "RemainWD"
#define GSCGROUP_TOP_TIMESHARE "Timeshare"
#define GSCGROUP_RUSH_TIMESHARE "Rush"
#define GSCGROUP_HIGH_TIMESHARE "High"
#define GSCGROUP_MEDIUM_TIMESHARE "Medium"
#define GSCGROUP_LOW_TIMESHARE "Low"
#define GSCGROUP_DEFAULT_BACKEND "DefaultBackend"
#define GSCGROUP_VACUUM "Vacuum"
#define GSCGROUP_TOP_WORKLOAD "TopWD"
#define GSCGROUP_INVALID_GROUP "InvalidGroup"
#define GSCGROUP_DEFAULT_CGNAME GSCGROUP_MEDIUM_TIMESHARE
/*
 * Slot counts per cgroup kind: top (4), backend (16), class (64), workload (640, or 256 in
 * the _OLD variant) and timeshare (4).
 * NOTE(review): the _OLD variants presumably describe an earlier configuration layout -- confirm.
 */
#define GSCGROUP_TOPNUM 4
#define GSCGROUP_BAKNUM 16
#define GSCGROUP_CLASSNUM 64
#define GSCGROUP_WDNUM_OLD 256
#define GSCGROUP_WDNUM 640
#define GSCGROUP_TSNUM 4
/* Total slot count with the old workload count: 4 + 16 + 64 + 256 + 4 = 344. */
#define GSCGROUP_ALLNUM_OLD \
(GSCGROUP_TOPNUM + GSCGROUP_BAKNUM + GSCGROUP_CLASSNUM + GSCGROUP_WDNUM_OLD + GSCGROUP_TSNUM)
/* Total slot count: 4 + 16 + 64 + 640 + 4 = 728. */
#define GSCGROUP_ALLNUM (GSCGROUP_TOPNUM + GSCGROUP_BAKNUM + GSCGROUP_CLASSNUM + GSCGROUP_WDNUM + GSCGROUP_TSNUM)
/* Rates of the Rush/High/Medium/Low timeshare groups (8:4:2:1); TS_ALL_RATE is their sum, 15. */
#define TS_RUSH_RATE 8
#define TS_HIGH_RATE 4
#define TS_MEDIUM_RATE 2
#define TS_LOW_RATE 1
#define TS_ALL_RATE (TS_RUSH_RATE + TS_HIGH_RATE + TS_MEDIUM_RATE + TS_LOW_RATE)
/*
 * Contiguous id ranges, one per cgroup kind, in the order top, backend, class, workload, timeshare.
 * Each range starts right after the previous one ends.
 * NOTE(review): presumably these ids index the gscgroup_grp_t* vaddr[GSCGROUP_ALLNUM] arrays -- confirm.
 */
/* Top cgroups: ids 0..3 (Root, Gaussdb, Backend, Class). */
#define TOPCG_START_ID 0
#define TOPCG_ROOT 0
#define TOPCG_GAUSSDB 1
#define TOPCG_BACKEND 2
#define TOPCG_CLASS 3
#define TOPCG_END_ID (GSCGROUP_TOPNUM - 1)
/* Backend cgroups: ids 4..19. */
#define BACKENDCG_START_ID (TOPCG_END_ID + 1)
#define BACKENDCG_END_ID (BACKENDCG_START_ID + GSCGROUP_BAKNUM - 1)
/* Class cgroups: ids 20..83. */
#define CLASSCG_START_ID (BACKENDCG_END_ID + 1)
#define CLASSCG_END_ID (CLASSCG_START_ID + GSCGROUP_CLASSNUM - 1)
/* Workload cgroups: ids 84..723 (84..339 with the old workload count). */
#define WDCG_START_ID (CLASSCG_END_ID + 1)
#define WDCG_END_ID_OLD (WDCG_START_ID + GSCGROUP_WDNUM_OLD - 1)
#define WDCG_END_ID (WDCG_START_ID + GSCGROUP_WDNUM - 1)
/* Timeshare cgroups: ids 724..727 (starting at 340 with the old workload count). */
#define TSCG_START_ID_OLD (WDCG_END_ID_OLD + 1)
#define TSCG_START_ID (WDCG_END_ID + 1)
#define TSCG_END_ID (TSCG_START_ID + GSCGROUP_TSNUM - 1)
/* CPU share defaults. NOTE(review): presumably values written to the cpu.shares control file -- confirm. */
#define DEFAULT_CPU_SHARES 1024
#define DEFAULT_GAUSS_CPUSHARES 5120
#define DEFAULT_CM_CPUSHARES 8192
#define MAX_CLASS_CPUSHARES 10000
/* Percentages; TOP_CLASS_PERCENT and OTHER_CLASS_PERCENT are the remainders out of 100. */
#define TOP_BACKEND_PERCENT 40
#define TOP_CLASS_PERCENT (100 - TOP_BACKEND_PERCENT)
#define DEFAULT_BACKEND_PERCENT 80
#define VACUUM_PERCENT 20
#define DEFAULT_CLASS_PERCENT 20
#define OTHER_CLASS_PERCENT (100 - DEFAULT_CLASS_PERCENT)
#define DEFAULT_WORKLOAD_PERCENT 20
/* NOTE(review): presumably the default for the cpu.cfs_period_us control file -- confirm. */
#define DEFAULT_CPU_PERIOD 100000
/* IO weight default and bounds. */
#define DEFAULT_IO_WEIGHT 500
#define MIN_IO_WEIGHT 100
#define MAX_IO_WEIGHT 1000
#define GROUP_ALL_PERCENT 100
/* TOPWD_PERCENT is defined exactly once, next to NORMALWD_PERCENT; the two add up to 100. */
#define TOPWD_PERCENT 90
#define NORMALWD_PERCENT 10
/* NOTE(review): presumably defaults for the skewpercent / qualitime exception thresholds -- confirm. */
#define DEFAULT_CPUSKEWPCT 30
#define DEFAULT_QUALITIME 1800
/* Prefix and suffix strings of the configuration file name. */
#define GSCFG_PREFIX "gscgroup"
#define GSCFG_SUFFIX ".cfg"
/* Workload level bounds: the top level is 1, the maximum is 10. */
#define WD_TOP_LEVEL 1
#define MAX_WD_LEVEL 10
/* Buffer sizes used by the fixed-size character arrays in this header. */
#define GPNAME_LEN 64
#define IODATA_LEN 96
#define EXCEPT_LEN 256
#define USERNAME_LEN 56
/* Room for MAX_WD_LEVEL + 1 group names: 64 * 11 = 704. */
#define GPNAME_PATH_LEN (GPNAME_LEN * (MAX_WD_LEVEL + 1))
#define PROCLINE_LEN 4096
/* cpuset buffer sizes used by gscgroup_old_grp_t (8) and gscgroup_grp_t (64). */
#define CPUSET_OLD_LEN 8
#define CPUSET_LEN 64
#define SUBSYS_LEN 8
/* Nanoseconds in one second, typed as int64. */
#define NANOSECS_PER_SEC ((int64)(1000000000))
/*
 * IO_WEIGHT_CALC
 *   Scale `weight` by `percent` percent using integer arithmetic
 *   ((weight) * (percent) / 100) and clamp the result from below to MIN_IO_WEIGHT.
 *
 * Both arguments are parenthesized so compound expressions such as `a + b`
 * are scaled as a whole. Each argument is evaluated more than once, so do
 * not pass expressions with side effects.
 */
#define IO_WEIGHT_CALC(weight, percent) \
    ((((weight) * (percent) / 100) > MIN_IO_WEIGHT) ? ((weight) * (percent) / 100) : MIN_IO_WEIGHT)
/* Names of the cgroup control files for the cpu, blkio, cpuset and cpuacct subsystems. */
#define CPU_SHARES "cpu.shares"
#define CPU_QUOTA "cpu.cfs_quota_us"
#define CPU_PERIOD "cpu.cfs_period_us"
#define BLKIO_WEIGHT "blkio.weight"
#define BLKIO_BPSREAD "blkio.throttle.read_bps_device"
#define BLKIO_IOPSREAD "blkio.throttle.read_iops_device"
#define BLKIO_BPSWRITE "blkio.throttle.write_bps_device"
#define BLKIO_IOPSWRITE "blkio.throttle.write_iops_device"
#define CPUSET_CPUS "cpuset.cpus"
#define CPUSET_MEMS "cpuset.mems"
#define CPUACCT_USAGE "cpuacct.usage"
/*
 * Exception kinds.
 *   EXCEPT_ALL_KINDS  number of real kinds (EXCEPT_ABORT and EXCEPT_PENALTY); it sizes the
 *                     except[] arrays in the group structures
 *   EXCEPT_ERROR / EXCEPT_NONE  negative values outside the 0..EXCEPT_ALL_KINDS-1 range
 */
#define EXCEPT_ALL_KINDS 2
#define EXCEPT_ERROR (-2)
#define EXCEPT_NONE (-1)
#define EXCEPT_ABORT 0
#define EXCEPT_PENALTY 1
/* Flag value of a kind: the kind plus one, so EXCEPT_NONE maps to 0 and EXCEPT_ABORT to 1. */
#define EXCEPT_FLAG(eflag) (1 + (eflag))
/* Non-zero when `eflag` is the flag value that corresponds to kind `except`. */
#define IS_EXCEPT_FLAG(eflag, except) (EXCEPT_FLAG((except)) == (eflag))
/*
 * CGroupIsValid
 *   True when `group` is a non-NULL, non-empty string that differs from GSCGROUP_INVALID_GROUP.
 * CGroupIsDefault
 *   True when `group` is a non-NULL, non-empty string equal to "DefaultClass:Medium".
 *
 * `group` is parenthesized at every use so pointer expressions such as `name + 1`
 * are cast, dereferenced and compared as a whole. It is evaluated up to three
 * times, so do not pass expressions with side effects.
 */
#define CGroupIsValid(group) \
    (NULL != (void*)(group) && *(group) && strcmp((group), GSCGROUP_INVALID_GROUP) != 0)
#define CGroupIsDefault(group) \
    (NULL != (void*)(group) && *(group) && strcmp((group), "DefaultClass:Medium") == 0)
/* Version identifiers. NOTE(review): presumably they tag the old and current group-record layouts -- confirm. */
#define V1R5_VERSION 1
#define V1R6_VERSION 2
/*
 * GET_CPUSET_START_VALUE
 *   get the cpuset start value
 *
 * astart: the upper level cpu core start value
 * aend: the upper level cpu core end value
 * bsum: the sum of cpu cores except the current updating group
 * bmax: the max cpu core of the same level other groups
 * blen: the length of the cpu cores of the current updating group
 *
 * Result:
 *   - when bsum + blen fits into the aend - astart + 1 cores of the upper level:
 *       - if aend - bmax >= blen: bmax + 1 when bsum > 0, otherwise astart
 *       - otherwise: astart + bsum
 *   - when it does not fit: aend - blen + 1
 *
 * Arguments are evaluated more than once; do not pass expressions with side effects.
 */
#define GET_CPUSET_START_VALUE(astart, aend, bsum, bmax, blen) \
    (((bsum) + (blen) <= (aend) - (astart) + 1) \
    ? (((aend) - (bmax) >= (blen)) ? (((bsum) > 0) ? ((bmax) + 1) : (astart)) : ((astart) + (bsum))) \
    : ((aend) - (blen) + 1))
/*
 * Kind of a cgroup record. Enumerator values are implicit (GROUP_NONE = 0 ... GROUP_TSWD = 5)
 * and must keep this order.
 */
typedef enum {
    GROUP_NONE,
    GROUP_TOP,
    GROUP_CLASS, /* not to control any threads */
    GROUP_BAKWD, /* NOTE(review): presumably the backend cgroups -- confirm */
    GROUP_DEFWD, /* to control the query threads in the given level */
    GROUP_TSWD   /* to control the query threads in the bottom level */
} group_type;
/* Allocation mode tag with two values; how each mode is applied is not visible in this header. */
typedef enum {
    ALLOC_DYNAMIC = 0,
    ALLOC_FIXED = 1
} alloc_type;
/*
 * Resource allocation values of one group, older layout (alloc_info_t has the same members plus `spare`).
 * NOTE(review): field meanings are inferred from the names and the CPU_xxx / BLKIO_xxx macros -- confirm.
 */
typedef struct {
int shares; /* presumably the cpu.shares value */
int weight; /* presumably the blkio.weight value */
int quota; /* presumably a cpu quota value */
char iopsread[IODATA_LEN]; /* IODATA_LEN-byte buffers; presumably text for the blkio.throttle.* files */
char iopswrite[IODATA_LEN];
char bpsread[IODATA_LEN];
char bpswrite[IODATA_LEN];
} alloc_old_info_t;
/*
 * Resource allocation values of one group (alloc_old_info_t plus a `spare` int after `quota`).
 * NOTE(review): field meanings are inferred from the names and the CPU_xxx / BLKIO_xxx macros -- confirm.
 */
typedef struct {
int shares; /* presumably the cpu.shares value */
int weight; /* presumably the blkio.weight value */
int quota; /* presumably a cpu quota value */
int spare; /* not present in alloc_old_info_t */
char iopsread[IODATA_LEN]; /* IODATA_LEN-byte buffers; presumably text for the blkio.throttle.* files */
char iopswrite[IODATA_LEN];
char bpsread[IODATA_LEN];
char bpswrite[IODATA_LEN];
} alloc_info_t;
/* `top` member of group_info_t: a single percentage (what it is a percentage of is not visible here). */
typedef struct {
int percent;
} gscgroup_top_t;
/* `cls` member of group_info_t. NOTE(review): field meanings are inferred from names -- confirm. */
typedef struct {
int tgid; /* presumably the id of the owning top group */
int maxlevel; /* presumably the deepest workload level in use under this class */
int percent;
int rempct; /* presumably the percentage still unassigned */
} gscgroup_class_t;
/* `wd` member of group_info_t. NOTE(review): field meanings are inferred from names -- confirm. */
typedef struct {
int cgid; /* presumably the id of the owning class group */
int wdlevel; /* presumably the workload level (WD_TOP_LEVEL..MAX_WD_LEVEL) */
int percent;
} gscgroup_wd_t;
/* `ts` member of group_info_t. NOTE(review): field meanings are inferred from names -- confirm. */
typedef struct {
int cgid; /* presumably the id of the owning class group */
int rate; /* presumably one of the TS_*_RATE values */
} gscgroup_ts_t;
/*
 * Kind-specific data of a group record; the members share storage.
 * NOTE(review): presumably the valid member is selected by the `gtype` field stored
 * next to it in gscgroup_grp_t / gscgroup_old_grp_t -- confirm.
 */
typedef union {
gscgroup_top_t top;
gscgroup_class_t cls;
gscgroup_wd_t wd;
gscgroup_ts_t ts;
} group_info_t;
/*
 * Exception settings, older layout: five unsigned values followed by five spare slots.
 * NOTE(review): presumably thresholds; units are not visible in this header -- confirm.
 */
typedef struct {
unsigned int blocktime;
unsigned int elapsedtime;
unsigned int allcputime;
unsigned int qualitime;
unsigned int skewpercent;
unsigned int spare[5];
} except_old_data_t;
/*
 * Exception settings: the five values of except_old_data_t, then `reserved`, two int64 sizes
 * and 14 spare slots.
 * NOTE(review): presumably thresholds; units are not visible in this header -- confirm.
 */
typedef struct {
unsigned int blocktime;
unsigned int elapsedtime;
unsigned int allcputime;
unsigned int qualitime;
unsigned int skewpercent;
unsigned int reserved;
int64 spoolsize;
int64 broadcastsize;
unsigned int spare[14];
} except_data_t;
/*
 * One cgroup record, older layout (see gscgroup_grp_t for the current one).
 * NOTE(review): the spare fields and the _old/_new pairing suggest a persisted layout; do not reorder members -- confirm.
 */
typedef struct {
unsigned short used; /* presumably non-zero when the slot is occupied */
unsigned short gid; /* presumably the group's id (slot index) */
group_type gtype; /* kind of group */
group_info_t ginfo; /* kind-specific data */
unsigned short percent;
char grpname[GPNAME_LEN]; /* group name buffer */
alloc_old_info_t ainfo; /* resource allocation values */
except_old_data_t except[EXCEPT_ALL_KINDS]; /* one entry per exception kind */
char cpuset[CPUSET_OLD_LEN]; /* cpuset buffer (8 bytes in this layout) */
int spare[3];
} gscgroup_old_grp_t;
/*
 * One cgroup record, current layout. Compared with gscgroup_old_grp_t it uses alloc_info_t and
 * except_data_t, a CPUSET_LEN cpuset buffer, an unsigned int `percent` placed after `cpuset`,
 * and 63 spare ints.
 * NOTE(review): the spare fields suggest a persisted layout; do not reorder members -- confirm.
 */
typedef struct {
unsigned short used; /* presumably non-zero when the slot is occupied */
unsigned short gid; /* presumably the group's id (slot index) */
group_type gtype; /* kind of group */
group_info_t ginfo; /* kind-specific data */
char grpname[GPNAME_LEN]; /* group name buffer */
alloc_info_t ainfo; /* resource allocation values */
except_data_t except[EXCEPT_ALL_KINDS]; /* one entry per exception kind */
char cpuset[CPUSET_LEN]; /* cpuset buffer */
unsigned int percent;
int spare[63];
} gscgroup_grp_t;
/*
 * Per-cgroup entry holding a name, CPU counters and two `struct cgroup` pointers.
 * NOTE(review): field meanings are inferred from names; how and when they are updated is not visible here -- confirm.
 */
typedef struct {
char name[GPNAME_LEN]; /* group name buffer */
int percent;
int cpuUtil; /* presumably CPU utilization */
int cpuCount; /* presumably number of CPU cores assigned */
int usedCpuCount;
int64 cpuUsedAcct; /* presumably a cpuacct.usage reading */
int64 cpuLastAcct; /* presumably the previous cpuacct.usage reading */
TimestampTz lastTime; /* presumably when cpuLastAcct was taken */
struct cgroup* cg;
struct cgroup* oldcg;
} gscgroup_entry_t;
/*
 * A gscgroup_entry_t together with path, cpuset and node group buffers.
 * NOTE(review): field meanings are inferred from names -- confirm.
 */
typedef struct {
gscgroup_entry_t entry;
char relpath[GPNAME_PATH_LEN]; /* presumably the cgroup path relative to the mount point */
char cpuset[CPUSET_LEN];
char nodegroup[GPNAME_LEN];
int shares; /* presumably the cpu.shares value */
bool valid;
} gscgroup_info_t;
/* Statement category with four values, numbered from 0 in declaration order. */
typedef enum {
    GSCGROUP_NONE_STMT = 0,
    GSCGROUP_NORMAL_STMT = 1,
    GSCGROUP_TOP_STMT = 2,
    GSCGROUP_VACUUM_STMT = 3
} gscgroup_stmt_t;
/*
 * Functions exported for cgroup handling; their definitions are not in this header.
 * NOTE(review): the one-line summaries below are inferred from names and signatures -- confirm against the definitions.
 * The `= NULL` default arguments require this header to be compiled as C++.
 */
/* Presumably returns the size of file `fname`. */
extern long gsutil_filesize(const char* fname);
/* Presumably maps `nbytes` of file `fname` with the given protection and flags. */
extern void* gsutil_filemap(const char* fname, size_t nbytes, int prot, int flags, struct passwd* passwd_user);
/* Path helpers over an array of GSCGROUP_ALLNUM group-record pointers; `nodegroup` defaults to NULL. */
extern char* gscgroup_get_parent_wdcg_path(int cnt, gscgroup_grp_t* vaddr[GSCGROUP_ALLNUM], char* nodegroup = NULL);
extern char* gscgroup_get_topts_path(int cnt, gscgroup_grp_t* vaddr[GSCGROUP_ALLNUM], char* nodegroup = NULL);
extern char* gscgroup_get_relative_path(int cnt, gscgroup_grp_t* vaddr[GSCGROUP_ALLNUM], char* nodegroup = NULL);
/* Presumably returns a printable name for exception flag `eflag`. */
extern char* gsutil_print_exception_flag(int eflag);
/* Presumably check the exception data of `grp` for several kinds / one kind. */
extern int gsutil_exception_is_valid(gscgroup_grp_t* grp, int kinds);
extern int gsutil_exception_kind_is_valid(gscgroup_grp_t* grp, int kind);
/* Presumably returns the number of CPUs. */
extern int gsutil_get_cpu_count();
#endif