Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package main
import (
"context"
"flag"
"fmt"
"syscall"
"ascend-common/api"
"ascend-common/common-utils/agreement"
"ascend-common/common-utils/healthz"
"ascend-common/common-utils/hwlog"
fdol "ascend-faultdiag-online"
"nodeD/pkg/common"
snapshot "nodeD/pkg/containersnapshot"
"nodeD/pkg/control"
"nodeD/pkg/device"
"nodeD/pkg/kubeclient"
"nodeD/pkg/monitoring"
"nodeD/pkg/monitoring/config"
"nodeD/pkg/pingmesh"
"nodeD/pkg/processmanager"
"nodeD/pkg/releaser"
"nodeD/pkg/reporter"
"nodeD/pkg/watcher/configmap"
)
// Logging and fault-diagnosis settings.
const (
	// defaultLogFile is the run log path used unless -logFile overrides it.
	defaultLogFile = "/var/log/mindx-dl/noded/noded.log"
	// maxLineLength is handed to hwlog as LogConfig.MaxLineLength.
	maxLineLength = 512
	// fdConfigPath is the configuration path passed to fdol.StartFDOnline.
	fdConfigPath = "/usr/local/fdConfig.yaml"
)

// Bounds for the -reportInterval flag; checkParameters accepts (min, max].
const (
	defaultReportInterval = 5
	minReportInterval     = 0 // exclusive lower bound
	maxReportInterval     = 300
)

// Bounds for the -monitorPeriod flag in seconds; checkParameters accepts [min, max].
const (
	defaultMonitorPeriod = 60
	minMonitorPeriod     = 60
	maxMonitorPeriod     = 600
)
// hwLogConfig is the run-log configuration; several of its fields are bound
// to command-line flags in init.
var hwLogConfig = &hwlog.LogConfig{
	LogFileName:   defaultLogFile,
	MaxLineLength: maxLineLength,
}

// Worker instances. controller, monitorManager, reportManager and
// pingmeshManager are (re)assigned in createWorkers; podMonitor is replaced in
// initPodInformer. The zero-value placeholders keep the pointers non-nil
// before that happens.
var (
	controller      = &control.ControllerManager{}
	configManager   = &config.FaultConfigurator{}
	monitorManager  = &monitoring.MonitorManager{}
	reportManager   = &reporter.ReportManager{}
	pingmeshManager *pingmesh.Manager
	podMonitor      = &snapshot.PodMonitor{}
)

// Build metadata printed by -version and at start-up.
// NOTE(review): presumably injected at link time; confirm against the build script.
var (
	// BuildVersion is the version string reported for this binary.
	BuildVersion string
	// BuildName is the program name reported for this binary.
	BuildName string
)

// Values populated from command-line flags registered in init.
var (
	version            bool
	reportInterval     int
	monitorPeriod      int
	resultMaxAge       int
	deviceResetTimeout int
)

// hzFlags holds the result of healthz.RegisterFlags; main calls its Serve method.
var hzFlags = healthz.RegisterFlags()
// main is the NodeD entry point. It parses flags, initialises the run logger
// and the healthz server, validates and publishes the parameters, builds the
// workers, starts the background goroutines and finally blocks in signalCatch
// until the process is asked to stop.
func main() {
	flag.Parse()
	if version {
		fmt.Printf("%s version: %s \n", BuildName, BuildVersion)
		return
	}

	ctx, cancel := context.WithCancel(context.Background())
	// Make sure the context is released on every exit path, including the
	// early returns below. Calling cancel more than once is harmless, so the
	// explicit call made by signalCatch on shutdown is unaffected.
	defer cancel()

	if err := hwlog.InitRunLogger(hwLogConfig, ctx); err != nil {
		// The run logger is unavailable here, so report on stdout instead.
		fmt.Printf("hwlog init failed, error is %v\n", err)
		return
	}
	if err := hzFlags.Serve(ctx); err != nil {
		hwlog.RunLog.Errorf("failed to start healthz server: %v", err)
		return
	}
	if !checkParameters() {
		return
	}
	hwlog.RunLog.Infof("%s starting and the version is %s", BuildName, BuildVersion)
	setParameters()

	if err := createWorkers(); err != nil {
		hwlog.RunLog.Errorf("create workers failed, err is %v", err)
		return
	}
	if err := initFunction(ctx); err != nil {
		hwlog.RunLog.Errorf("init function failed, err is %v", err)
		return
	}

	go monitorManager.Run(ctx)
	go configmap.GetCmWatcher().Watch(ctx.Done())
	initPodInformer()
	// pingmeshManager is only run when createWorkers produced a non-nil manager.
	if pingmeshManager != nil {
		go pingmeshManager.Run(ctx)
	}
	fdol.StartFDOnline(fdConfigPath, []string{"slowNode"}, "node")

	// Blocks until a stop signal arrives (or the signal channel is unusable).
	signalCatch(cancel)
}
// init calls agreement.PrintAgreement and then registers every command-line
// flag of the program on the default flag set. Parsing happens later in main.
func init() {
	agreement.PrintAgreement()

	flag.BoolVar(&version, "version", false, "the version of the program")
	flag.StringVar(&hwLogConfig.LogFileName, "logFile", defaultLogFile,
		"Run log file path. if the file size exceeds 20MB, will be rotated")

	// All remaining flags are integers, so they are registered from a table.
	intFlags := []struct {
		target *int
		name   string
		value  int
		usage  string
	}{
		{&reportInterval, "reportInterval", defaultReportInterval,
			"Min interval of report node status"},
		{&monitorPeriod, "monitorPeriod", defaultMonitorPeriod,
			"Monitoring period of monitor ," +
				"range [60,600] seconds"},
		{&hwLogConfig.LogLevel, "logLevel", 0,
			"Log level, -1-debug, 0-info, 1-warning, 2-error, 3-critical(default 0)"},
		{&hwLogConfig.MaxAge, "maxAge", hwlog.DefaultMinSaveAge,
			"Maximum number of days for backup run log files, range [7, 700] days"},
		{&hwLogConfig.MaxBackups, "maxBackups", hwlog.DefaultBackups,
			"Maximum number of backup operation logs, range is (0, 180]"},
		{&resultMaxAge, "resultMaxAge", pingmesh.DefaultResultMaxAge,
			"Maximum number of days for backup run pingmesh result files, range [7, 700] days"},
		{&deviceResetTimeout, api.DeviceResetTimeout, api.DefaultDeviceResetTimeout,
			"when noded starts, if the number of chips is insufficient, the maximum duration to wait for " +
				"the driver to report all chips, unit second, range [10, 600]"},
	}
	for _, f := range intFlags {
		flag.IntVar(f.target, f.name, f.value, f.usage)
	}
}
// checkParameters validates the flag values against their allowed ranges.
// The first violation found is logged and false is returned; true means every
// parameter is within range.
func checkParameters() bool {
	switch {
	case reportInterval <= minReportInterval || reportInterval > maxReportInterval:
		hwlog.RunLog.Errorf("report interval %d out of range (0,300]", reportInterval)
	case monitorPeriod < minMonitorPeriod || monitorPeriod > maxMonitorPeriod:
		hwlog.RunLog.Errorf("monitor period %d out of range [60,600]", monitorPeriod)
	case resultMaxAge < pingmesh.MinResultMaxAge || resultMaxAge > pingmesh.MaxResultMaxAge:
		hwlog.RunLog.Errorf("resultMaxAge %d out of range [%d,%d]", resultMaxAge, pingmesh.MinResultMaxAge,
			pingmesh.MaxResultMaxAge)
	case deviceResetTimeout < api.MinDeviceResetTimeout || deviceResetTimeout > api.MaxDeviceResetTimeout:
		hwlog.RunLog.Errorf("deviceResetTimeout %d out of range [%d,%d]", deviceResetTimeout,
			api.MinDeviceResetTimeout, api.MaxDeviceResetTimeout)
	default:
		return true
	}
	return false
}
// setParameters publishes the validated flag values through common.ParamOption.
// Fields of common.Option that are not listed here are left at their zero value.
func setParameters() {
	var opt common.Option
	opt.ReportInterval = reportInterval
	opt.MonitorPeriod = monitorPeriod
	opt.DeviceResetTimeout = deviceResetTimeout
	common.ParamOption = opt
}
// createWorkers builds the Kubernetes client and every manager that depends on
// it, then wires the fault-processing chain
// monitorManager -> controller -> reportManager.
// It returns the first error raised by the client or device-manager set-up.
func createWorkers() error {
	k8sClient, clientErr := kubeclient.NewClientK8s()
	if clientErr != nil {
		hwlog.RunLog.Errorf("init k8s client failed when start, error: %v", clientErr)
		return clientErr
	}
	configmap.InitCmWatcher(k8sClient)

	if devErr := device.InitDeviceManager(); devErr != nil {
		hwlog.RunLog.Errorf("init device manager failed when start, error: %v", devErr)
		return devErr
	}

	// Replace the package-level placeholders with fully constructed managers.
	controller = control.NewControlManager(k8sClient)
	monitorManager = monitoring.NewMonitorManager(k8sClient)
	reportManager = reporter.NewReporterManager(k8sClient)

	pingCfg := &pingmesh.Config{
		ResultMaxAge: resultMaxAge,
		KubeClient:   k8sClient,
	}
	pingmeshManager = pingmesh.NewManager(pingCfg)

	releaser.InitReleaser()
	configmap.GetCmWatcher().Init()

	// Chain the processors: monitor hands over to controller, controller to reporter.
	monitorManager.SetNextFaultProcessor(controller)
	controller.SetNextFaultProcessor(reportManager)
	return nil
}
// initFunction initialises the process-manager plugin with ctx and, when that
// succeeds, starts a goroutine that initialises the node annotation.
// A plugin failure is logged and returned; an annotation failure is only
// logged as a warning and never reaches the caller.
func initFunction(ctx context.Context) error {
	pluginErr := processmanager.InitPlugin(ctx)
	if pluginErr != nil {
		hwlog.RunLog.Errorf("init controller failed when start, err is %v", pluginErr)
		return pluginErr
	}

	annotateNode := func() {
		annotateErr := controller.InitNodeAnnotation()
		if annotateErr == nil {
			return
		}
		hwlog.RunLog.Warnf("init node annotation failed when start, err is %v", annotateErr)
	}
	go annotateNode()
	return nil
}
// initPodInformer creates the pod monitor from the shared Kubernetes client
// and starts its Monitoring loop in a goroutine. When no client is available
// it logs an error and leaves podMonitor untouched.
func initPodInformer() {
	// Fetch the client once so the value that is nil-checked is exactly the
	// value handed to the monitor.
	client := kubeclient.GetK8sClient()
	if client == nil {
		hwlog.RunLog.Errorf("k8s client is nil")
		return
	}
	podMonitor = snapshot.NewPodMonitor(client)
	go podMonitor.Monitoring()
}
// signalCatch blocks until a stop signal (SIGINT, SIGTERM or SIGQUIT) is
// received, then cancels the root context through cancel and stops the config
// manager, the monitor manager and the pod monitor.
// It returns without cancelling when the signal channel could not be created
// or has been closed.
func signalCatch(cancel context.CancelFunc) {
	// SIGKILL is deliberately not listed: the operating system never delivers
	// it to the process, so asking to be notified about it has no effect.
	osSignalChan := common.NewSignalWatcher(syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
	if osSignalChan == nil {
		hwlog.RunLog.Error("create stop signal channel failed")
		return
	}

	// ok is false only when the channel was closed without delivering a signal.
	sig, ok := <-osSignalChan
	if !ok {
		hwlog.RunLog.Info("catch system stop signal channel is closed")
		return
	}

	hwlog.RunLog.Infof("receive system signal: %s, NodeD shutting down", sig.String())
	cancel()
	configManager.Stop()
	monitorManager.Stop()
	podMonitor.Stop()
}