// Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved.
// Package constant is grpc common types and functions
package constant
import "time"
// Process signal types.
const (
	// KillMasterSignalType is the signal type for killing the master agent.
	KillMasterSignalType = "killMaster"
	// StopTrainSignalType is the signal type for stopping training.
	StopTrainSignalType = "stopTrain"
	// GlobalFaultSignalType is the signal type for global fault ranks.
	GlobalFaultSignalType = "globalFault"
	// ChangeStrategySignalType is the signal type for changing the strategy.
	ChangeStrategySignalType = "changeStrategy"
	// SaveAndExitSignalType is the signal type for save-and-exit.
	SaveAndExitSignalType = "saveAndExit"
	// KeepAliveSignalType is the keep-alive signal type.
	KeepAliveSignalType = "keep-alive"
	// FaultNodesExitSignalType is the signal type for fault nodes exiting.
	FaultNodesExitSignalType = "faultNodesExit"
	// HotSwitchSignalType is the hot-switch signal type.
	HotSwitchSignalType = "hot-switch"
	// WaitStartAgentSignalType is the signal type for pausing the start of
	// training. Its value is "pauseStartAgent", which does not mirror the
	// identifier.
	WaitStartAgentSignalType = "pauseStartAgent"
	// ContinueStartAgentSignalType is the signal type for continuing the start
	// of training.
	ContinueStartAgentSignalType = "continueStartAgent"
	// PreExitProcessSignalType is the signal type for the pre-exit process step.
	PreExitProcessSignalType = "preExitProcess"
)
// Recover strategy names and the label keys/values that select them.
const (
	// RecoverStrategies is the pod group label that configures the supported
	// strategies.
	RecoverStrategies = "recover-strategy"
	// ProcessRetryStrategyName is the strategy name for HBM fault step retry.
	ProcessRetryStrategyName = "retry"
	// ProcessRecoverStrategyName is the strategy name for online process
	// recovery.
	ProcessRecoverStrategyName = "recover"
	// ProcessRecoverInPlaceStrategyName is the strategy name for in-place
	// recovery, where only the fault processes are restarted.
	ProcessRecoverInPlaceStrategyName = "recover-in-place"
	// ProcessDumpStrategyName is the strategy name for saving a checkpoint.
	ProcessDumpStrategyName = "dump"
	// ProcessExitStrategyName is the strategy name for exiting directly.
	ProcessExitStrategyName = "exit"
	// ProcessContinueTrain is the strategy name for continuing training.
	ProcessContinueTrain = "continue"
	// ElasticTrainingStrategyName is the strategy name for elastic training.
	ElasticTrainingStrategyName = "elastic-training"
	// ScaleInStrategyName is the strategy name for DP-level scale-in training.
	ScaleInStrategyName = "downgrade"
	// ScaleOutStrategyName is the strategy name for DP-level scale-out recover
	// training.
	ScaleOutStrategyName = "upgrade"
	// JobReschedulingStrategyName is the name of job-level rescheduling.
	JobReschedulingStrategyName = "job-rescheduling"
	// JobReschedulingStrategyKey is the key of the job rescheduling strategy.
	JobReschedulingStrategyKey = "fault-scheduling"
	// JobReschedulingStrategyGraceValue is the "grace" value of the job
	// rescheduling strategy.
	JobReschedulingStrategyGraceValue = "grace"
	// JobReschedulingStrategyForceValue is the "force" value of the job
	// rescheduling strategy.
	JobReschedulingStrategyForceValue = "force"
	// PodReschedulingStrategyName is the name of pod-level rescheduling.
	PodReschedulingStrategyName = "pod-rescheduling"
	// PodReschedulingStrategyKey is the key of the pod-level rescheduling
	// label. It has the same value as PodReschedulingStrategyName.
	PodReschedulingStrategyKey = "pod-rescheduling"
	// PodReschedulingStrategyOpenValue is the pod-level rescheduling label
	// value that means the feature is open.
	PodReschedulingStrategyOpenValue = "on"
	// ProcessMigration is the migration strategy used in the hot-switch flow.
	ProcessMigration = "migration"
)
// Sub-healthy fault strategies and health state names.
const (
	// SubHealthyStrategy is the pod group label that configures the strategy
	// for sub-healthy faults.
	SubHealthyStrategy = "subHealthyStrategy"
	// SubHealthyGraceExit is the strategy name for grace exit.
	SubHealthyGraceExit = "graceExit"
	// SubHealthyIgnore is the strategy name for ignoring the fault.
	SubHealthyIgnore = "ignore"
	// SubHealthyIngore is the misspelled legacy name of SubHealthyIgnore. It is
	// kept with the same value so existing callers continue to compile.
	//
	// Deprecated: use SubHealthyIgnore instead.
	SubHealthyIngore = SubHealthyIgnore
	// SubHealthyHotSwitch is the strategy name for hot switch.
	SubHealthyHotSwitch = "hotSwitch"
	// HealthyState is the name of the healthy state.
	HealthyState = "Healthy"
	// UnHealthyState is the name of the unhealthy state.
	UnHealthyState = "UnHealthy"
	// SubHealthyState is the name of the sub-healthy state.
	SubHealthyState = "SubHealthy"
	// PreSeparateState is the name of the pre-separate state.
	PreSeparateState = "PreSeparate"
)
// Reset info locations, configmap naming and job-name lookup keys.
const (
	// ResetInfoDir is the directory holding reset info.
	ResetInfoDir = "/user/restore/reset/"
	// ResetInfoCMNamePrefix is the name prefix of the reset configmap.
	ResetInfoCMNamePrefix = "reset-config-"
	// ResetInfoCMDataKey is the data key inside the reset configmap.
	ResetInfoCMDataKey = "reset.json"
	// ResetInfoCMCheckCodeKey is the check code key inside the reset configmap.
	ResetInfoCMCheckCodeKey = "checkCode"
	// ResetTaskNameKey is the key used to obtain the reset task name.
	ResetTaskNameKey = "volcano.sh/job-name"
	// ResetTaskNameKeyInLabel is the label key used to obtain the reset task
	// name when an operator is in use.
	ResetTaskNameKeyInLabel = "training.kubeflow.org/job-name"
)
// Rank status and the process-recover label with its values.
const (
	// FaultRankStatus is the rank status meaning the rank is at fault.
	FaultRankStatus = "fault"
	// ProcessRecoverEnableLabel is the process recover label on the pod group.
	ProcessRecoverEnableLabel = "process-recover-enable"
	// ProcessRecoverEnable is the label value that opens process recover.
	ProcessRecoverEnable = "on"
	// ProcessRecoverPause is the label value that temporarily closes process
	// recover.
	ProcessRecoverPause = "pause"
	// ProcessRecoverInit is the initial (empty) state before
	// process-recover-enable is really opened.
	ProcessRecoverInit = ""
)
// Operations used when writing the reset configmap.
const (
	// RestartAllProcessOperation adds retryTimes to reset.json, which triggers
	// the agent to restart all processes.
	RestartAllProcessOperation = "restartAllProcess"
	// ClearOperation resets the reset configmap.
	ClearOperation = "clear"
	// NotifyFaultListOperation writes the fault list to reset.json.
	NotifyFaultListOperation = "fault"
	// NotifyFaultFlushingOperation notifies the agent that a fault occurred and
	// that it should wait until fault flushing has finished.
	NotifyFaultFlushingOperation = "notifyFaultFlushing"
)
// Retry counts, wait intervals, pod group annotation keys and recover result
// strings used by process recovery.
const (
	// MaxUuidRandomLength is the maximum random length of a uuid.
	MaxUuidRandomLength = 32
	// CheckPGRunningRetryTimes is the number of retries when checking that the
	// pod group has changed to the running state.
	CheckPGRunningRetryTimes = 54
	// CheckPodReschedulingTimes is the number of checks that the pod group has
	// changed to the running state for the exit strategy.
	CheckPodReschedulingTimes = 4
	// SleepSecondBeforeCheckPGRunning is the interval between pod group state
	// checks (seconds, per the identifier).
	SleepSecondBeforeCheckPGRunning = 5
	// WaitAgentGetFaultRank is the interval to wait for the agent to get the
	// fault rank.
	WaitAgentGetFaultRank = 5
	// WriteResetInfoRetryTimes is the number of retries when setting the reset
	// configmap.
	WriteResetInfoRetryTimes = 3
	// WaitProcessRestart is the 60 second sleep while processes restart.
	WaitProcessRestart = 60
	// ProcessRecoverStrategy is the pod group label that controls whether
	// process recover continues.
	ProcessRecoverStrategy = "ProcessRecoverStrategy"
	// ProcessConfirmFaultKey is the pod group annotation key that stores the
	// fault rank.
	ProcessConfirmFaultKey = "ProcessConfirmFault"
	// ProcessResultFaultKey is the pod group annotation key that stores the
	// final fault rank.
	ProcessResultFaultKey = "ProcessResultFault"
	// ProcessRecoverStatusKey is the key of the process recover status.
	ProcessRecoverStatusKey = "ProcessRecoverStatus"
	// RankTableReadyKey is the pod group annotation key that stores whether the
	// rank table is ready.
	RankTableReadyKey = "RankTableReady"
	// CheckPeriod is the sleep period used while the process is not ready.
	CheckPeriod = 3
	// ProcessControlTimeout is the timeout for waiting on the process
	// annotation.
	ProcessControlTimeout = 300
	// JobFaultDisappearRetryTimes is the number of retries while waiting for a
	// fault to disappear.
	JobFaultDisappearRetryTimes = 5
	// JobFaultCheckPeriod is the period between job fault checks.
	JobFaultCheckPeriod = 3
	// RetrySuccess reports that the retry succeeded.
	RetrySuccess = "retry-success"
	// RetryFailed reports that the retry failed.
	RetryFailed = "retry-failed"
	// RecoverSuccess reports that process recover succeeded.
	RecoverSuccess = "recover-success"
	// RecoverFailed reports that process recover failed.
	RecoverFailed = "recover-failed"
	// DumpSuccess reports that saving the checkpoint succeeded.
	DumpSuccess = "dump-success"
	// DumpFailed reports that saving the checkpoint failed.
	DumpFailed = "dump-failed"
	// ExitCompleted reports that the exit strategy finished.
	ExitCompleted = "exit-completed"
	// MaxEventChanLen is the maximum length of the event channel.
	MaxEventChanLen = 100
	// DumpExit is the dump exit value.
	DumpExit = "dump_exit"
	// TpBlock holds "ra-block"; the value was changed from "tp-block" while the
	// identifier kept its old name.
	TpBlock = "ra-block"
)
// Pod group access retry counts and gRPC service limits.
const (
	// GetPodGroupTimes is the number of times to get a pod group.
	GetPodGroupTimes = 3
	// UpdatePodGroupTimes is the number of times to update a pod group (per
	// the identifier).
	UpdatePodGroupTimes = 3
	// MaxServeJobs is the maximum number of jobs served for fault recover.
	MaxServeJobs = 10000
	// QpsLimit is the maximum QPS of the grpc service.
	QpsLimit = 1000
	// RecoverGrpcProbe is the grpc probe of the Recover service.
	RecoverGrpcProbe = "/Recover/HealthCheck"
	// BusinessGrpcReq identifies a business grpc request, for all services.
	BusinessGrpcReq = "BusinessGrpcReq"
)
// Fault type codes and process handling policies.
const (
	// UceFaultType is the code of the uce fault type.
	UceFaultType = "0"
	// HcclFaultType is the code of the hccl fault type (per the identifier).
	HcclFaultType = "2"
	// NormalFaultType is the code of the normal fault type, i.e. the other,
	// non-uce kind.
	NormalFaultType = "1"
	// HotResetPolicy is the hot reset policy.
	HotResetPolicy = "reset"
	// RestartPolicy is the restart process policy.
	RestartPolicy = "restart"
)
// String descriptions of fault levels.
const (
	// NotHandleFault means the fault is not handled.
	NotHandleFault = "NotHandleFault"
	// RestartRequest means restart the request.
	RestartRequest = "RestartRequest"
	// RestartBusiness means restart the business.
	RestartBusiness = "RestartBusiness"
	// RestartNPU means restart the NPU.
	RestartNPU = "RestartNPU"
	// FreeRestartNPU means wait until the NPU is free, then restart it.
	FreeRestartNPU = "FreeRestartNPU"
	// SeparateNPU means separate the NPU.
	SeparateNPU = "SeparateNPU"
	// NormalNPU means the NPU is normal.
	NormalNPU = "NormalNPU"
	// NormalNetwork means the network is normal.
	NormalNetwork = "NormalNetwork"
	// PreSeparateNPU means pre-separate the NPU.
	PreSeparateNPU = "PreSeparateNPU"
	// ManuallySeparateNPU means the NPU is manually separated.
	ManuallySeparateNPU = "ManuallySeparateNPU"
	// CardUnhealthy means the fault is caused by an unhealthy card.
	CardUnhealthy = "CardUnhealthy"
	// CardNetworkUnhealthy means the fault is caused by an unhealthy card
	// network.
	CardNetworkUnhealthy = "CardNetworkUnhealthy"
	// SubHealthFault is a sub-healthy fault.
	SubHealthFault = "SubHealthFault"
	// PreSeparateFault is a pre-separate fault.
	PreSeparateFault = "PreSeparateFault"
	// SeparateFault is a separate fault.
	SeparateFault = "SeparateFault"
)
// Switch fault levels, as numeric values and as strings.
const (
	// NotHandleFaultLevel is the numeric NotHandle fault level.
	NotHandleFaultLevel = 0
	// SubHealthFaultLevel is the numeric SubHealth fault level.
	SubHealthFaultLevel = 1
	// RestartRequestFaultLevel is the numeric RestartRequest fault level.
	RestartRequestFaultLevel = 2
	// PreSeparateFaultLevel is the numeric PreSeparate fault level.
	PreSeparateFaultLevel = 3
	// SeparateFaultLevel is the numeric Separate fault level.
	SeparateFaultLevel = 4
	// NotHandleFaultLevelStr is the string form of the NotHandle fault level.
	NotHandleFaultLevelStr = "NotHandle"
	// SubHealthFaultLevelStr is the string form of the SubHealth fault level.
	SubHealthFaultLevelStr = "SubHealthFault"
	// RestartRequestFaultLevelStr is the string form of the RestartRequest
	// fault level.
	RestartRequestFaultLevelStr = "RestartRequest"
	// PreSeparateFaultLevelStr is the string form of the PreSeparate fault
	// level.
	PreSeparateFaultLevelStr = "PreSeparate"
	// SeparateFaultLevelStr is the string form of the Separate fault level.
	SeparateFaultLevelStr = "Separate"
	// ResetFaultStr is the string for ResetFault.
	ResetFaultStr = "ResetFault"
	// RestartRequestFaultStr is the string for RestartRequestFault.
	RestartRequestFaultStr = "RestartRequestFault"
)
// Supported device types.
const (
	// UnknownResourceType is the device type name for an unknown resource.
	UnknownResourceType = "unknown"
)
// Patch retry counts, fault type identifiers and fault config file paths.
const (
	// InvalidSuperPodIndex is the value of an invalid super pod index.
	InvalidSuperPodIndex = -2
	// PatchPodTimes is the number of retries when patching a pod.
	PatchPodTimes = 3
	// PatchPodGroupTimes is the number of retries when patching a pod group.
	PatchPodGroupTimes = 3
	// PatchNodeTimes is the number of retries when patching a node.
	PatchNodeTimes = 3
	// AllCardId is the card id that stands for all cards.
	AllCardId = "FF"
	// SwitchFaultType is the switchFault fault type.
	SwitchFaultType = "switchFault"
	// DeviceFaultType is the deviceFault fault type.
	DeviceFaultType = "deviceFault"
	// TaskFaultKey is the fault-type key.
	TaskFaultKey = "fault-type"
	// Kilo is 1000.
	Kilo = 1000
	// FaultCustomizationPath is the path of the fault customization file.
	FaultCustomizationPath = "/home/hwMindX/relationFaultCustomization.json"
	// FaultDurationPath is the path of the fault duration file.
	FaultDurationPath = "/home/hwMindX/faultDuration.json"
)
// Framework names.
const (
	// PtFramework is the framework name "pytorch".
	PtFramework = "pytorch"
	// MsFramework is the framework name "mindspore".
	MsFramework = "mindspore"
)
// Generic status strings.
const (
	// Success is the status string "success".
	Success = "success"
	// Failed is the status string "failed".
	Failed = "failed"
	// Start is the status string "start".
	Start = "start"
)
// Well-known fault codes.
const (
	// CardDropFault is the fault code of the card drop fault.
	CardDropFault = "40F84E00"
)
// Node health status and stress test result codes.
const (
	// NodeHealthyStatusKey is the key of the node healthy status.
	NodeHealthyStatusKey = "NodeHealthyStatus"
	// NodeUnHealthy is the unhealthy node status; in this case the pod will be
	// rescheduled.
	NodeUnHealthy = "UnHealthy"
	// StressTestOK is the code for a stress test that passed.
	StressTestOK = "0"
	// StressTestExecFail is the code for a stress test that failed to execute.
	StressTestExecFail = "1"
	// StressTestFindFault is the code for a stress test that found a fault.
	StressTestFindFault = "2"
	// StressTestTimeout is the code for a stress test that timed out.
	StressTestTimeout = "3"
	// StressTestVolRecoverFail is the code for a failed voltage recovery.
	StressTestVolRecoverFail = "4"
)
// Action names plus reschedule and MindIO wait settings.
const (
	// FaultNodesExitAction is the action that notifies fault nodes to exit.
	FaultNodesExitAction = "fault_nodes_exit"
	// FaultNodesRestartAction is the action that notifies fault nodes to
	// restart.
	FaultNodesRestartAction = "fault_nodes_restart"
	// OnGlobalRankAction is the on_global_rank action.
	OnGlobalRankAction = "on_global_rank"
	// ContinueStartAgent is the action that continues starting the agent.
	ContinueStartAgent = "continue_start_agent"
	// PauseStartAgent is the action that pauses starting the agent.
	PauseStartAgent = "pause_start_agent"
	// StopAction is the stop_train action.
	StopAction = "stop_train"
	// PauseTrainAction is the pause_train action.
	PauseTrainAction = "pause_train"
	// ChangeStrategyAction is the change_strategy action.
	ChangeStrategyAction = "change_strategy"
	// SaveAndExitAction is the save_and_exit action.
	SaveAndExitAction = "save_and_exit"
	// HotSwitchAction is the hot switch action. Its value is space-separated,
	// unlike the underscore-separated actions above.
	HotSwitchAction = "hot switch"
	// StopSwitchAction is the stop switch action.
	StopSwitchAction = "stop switch"
	// NewPodRunningAction is the new pod running action.
	NewPodRunningAction = "new pod running"
	// PreExitProcessAction is the pre exit process action.
	PreExitProcessAction = "pre exit process"
	// DefaultWaitRescheduleTimeout is the default reschedule timeout, in
	// seconds, before executing the arf or dp scale-in strategy.
	DefaultWaitRescheduleTimeout = 270
	// MinWaitRescheduleTimeout is the minimum reschedule timeout, in seconds,
	// before executing the arf or dp scale-in strategy.
	MinWaitRescheduleTimeout = 30
	// WaitRescheduleTimeoutKey is the key of WaitRescheduleTimeout.
	WaitRescheduleTimeoutKey = "wait-reschedule-timeout"
	// DefaultWaitRescheduleTimeoutBeforeDeployStrategy is the timeout for
	// waiting on pod reschedule when ARF is closed but the scale-train and
	// pod/job reschedule strategies are open.
	DefaultWaitRescheduleTimeoutBeforeDeployStrategy = 20
	// MindIOWaitTimeKey is the key of the wait time before deploying a
	// strategy.
	MindIOWaitTimeKey = "MINDIO_WAIT_MINDX_TIME"
	// MindIOWaitTimeMax is the maximum wait time, in seconds.
	MindIOWaitTimeMax = 3600
	// DifferenceTime is the difference from the timeout env.
	DifferenceTime = 10
	// RankZeroNodeId is the node id "0".
	RankZeroNodeId = "0"
	// PodRankIndexAnno is the pod annotation whose value is the pod's rank
	// index.
	PodRankIndexAnno = "hccl/rankIndex"
	// CheckCtlStateTimes is the number of times to check the control state.
	CheckCtlStateTimes = 10
)
// gRPC role name and per-role / per-client limits.
const (
	// RoleFdAgent is the grpc role name of the online Fault-Diag.
	RoleFdAgent = "FdAgent"
	// MsgCacheNumPerClient is the maximum number of cached messages per client.
	MsgCacheNumPerClient = 10
	// MaxClientPerRole is the maximum number of clients per role.
	MaxClientPerRole = 20
	// MaxNPUsPerBatch is the maximum number of NPUs per batch.
	MaxNPUsPerBatch = 40000
	// RequestNumPerSecondLimit is the maximum number of requests per second.
	RequestNumPerSecondLimit = 20
)
// Fault status bit flags.
//
// iota counts every ConstSpec in the group, including StatusNone, so the shift
// starts at 1: the flags are 2 and 4, and bit 0 is unused.
const (
	// StatusNone is the status with no fault tag set (0).
	StatusNone = 0
	// StatusHasSoftFault tags a fault status as having a soft fault. iota is 1
	// on this line, so the value is 1<<1 == 2.
	StatusHasSoftFault = 1 << iota
	// StatusHasHardwareFault tags a fault status as having a hardware fault.
	// It repeats the expression above with iota == 2, so the value is 4.
	StatusHasHardwareFault
)
// Failure reason strings.
const (
	// FailedReasonJobNoFault means the job has no fault.
	FailedReasonJobNoFault = "job no fault"
	// FailedReasonParseRankError means parsing the rank failed.
	FailedReasonParseRankError = "parse rank error"
	// FailedReasonPodRankNoFault means the pod rank has no fault.
	FailedReasonPodRankNoFault = "pod rank no fault"
	// FailedReasonHasOtherFault means the pod has another fault.
	FailedReasonHasOtherFault = "has other fault"
	// FailedReasonShouldReport means the fault should be reported.
	FailedReasonShouldReport = "should report fault"
	// FailedReasonFaultTimeOut means the fault timed out.
	FailedReasonFaultTimeOut = "fault time out"
)
// Custom fault filter annotation keys and limits.
const (
	// CustomFilterFaultCodeAnnoKey is the annotation key of the custom filter
	// fault code.
	CustomFilterFaultCodeAnnoKey = "huawei.com/schedule.filter.faultCode"
	// CustomFilterFaultLevelAnnoKey is the annotation key of the custom filter
	// fault level.
	CustomFilterFaultLevelAnnoKey = "huawei.com/schedule.filter.faultLevel"
	// CustomFilterFaultDefaultTimeout is the default timeout of the custom
	// fault filter: a time.Duration of 60 seconds.
	CustomFilterFaultDefaultTimeout = 60 * time.Second
	// EachFaultFilterConfigMaxLen is the maximum length of each fault filter
	// config.
	EachFaultFilterConfigMaxLen = 2
)
// Configmap names, time unit conversions and fault report settings.
const (
	// ConfigCmName is the name of the config configmap.
	ConfigCmName = "clusterd-config-cm"
	// ManualDevInfoCmName is the name of the manual device info configmap.
	ManualDevInfoCmName = "clusterd-manual-info-cm"
	// ManuallySeparateNPUConfigKey is the configmap key of the manually
	// separate NPU config.
	ManuallySeparateNPUConfigKey = "manually_separate_policy.conf"
	// HoursToMilliseconds is the number of milliseconds in one hour.
	HoursToMilliseconds = 60 * 60 * 1000
	// SecondsToMilliseconds is the number of milliseconds in one second.
	SecondsToMilliseconds = 1000
	// DefaultSlidingWindow is the default sliding window size.
	DefaultSlidingWindow = 30
	// TimeIntervalForFaultReport is the time interval for reporting a fault: a
	// time.Duration of 10 seconds. After detecting faults, MindIO requires a
	// delay of three heartbeat intervals (2 seconds each) before reporting the
	// fault.
	TimeIntervalForFaultReport = 10 * time.Second
	// MaxTimestampRecords is the maximum number of records kept in the
	// timestamp slices.
	MaxTimestampRecords = 1000
)