Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package common
import (
"encoding/json"
"fmt"
"sort"
"strconv"
"strings"
"sync"
"time"
"golang.org/x/time/rate"
"k8s.io/apimachinery/pkg/util/sets"
"ascend-common/common-utils/hwlog"
"ascend-common/common-utils/utils"
"ascend-common/devmanager/common"
"ascend-common/devmanager/hccn"
)
// Constants shared by the fault-code loading and classification logic in this file.
const (
// Fault handling level names. Together with SubHealthFault below, these are the
// members of FaultTypeSet (NormalNPU and NormalNetwork are not members).
NotHandleFault = "NotHandleFault"
RestartRequest = "RestartRequest"
RestartBusiness = "RestartBusiness"
RestartNPU = "RestartNPU"
FreeRestartNPU = "FreeRestartNPU"
SeparateNPU = "SeparateNPU"
NormalNPU = "NormalNPU"
// NormalNetwork is what GetNetworkFaultTypeByCode returns for an empty code list.
NormalNetwork = "NormalNetwork"
PreSeparateNPU = "PreSeparateNPU"
ManuallySeparateNPU = "ManuallySeparateNPU"
// NOTE(review): CardUnhealthy / CardNetworkUnhealthy are not referenced in this part of the file.
CardUnhealthy = "CardUnhealthy"
CardNetworkUnhealthy = "CardNetworkUnhealthy"
// Event codes. LinkDownFaultCode and the UBOE* codes make up NetworkFaultCodes;
// UBPortDownCode, UBSeparateFaultCode and UBSubHealFaultCode make up HyperPlaneFaultCodes.
LinkDownFaultCode int64 = 0x81078603
UBPortDownCode int64 = 0x81B18603
UBOEPortDownCode int64 = 0x81078607
UBSeparateFaultCode int64 = 0x020001002
UBSubHealFaultCode int64 = 0x020000002
UBOEPreSeparateFaultCode int64 = 0x110001024
UBOESubHealFaultCode int64 = 0x110000002
// NOTE(review): ResetFinishFaultCode / CardDropFaultCode are consumed outside this section.
ResetFinishFaultCode int64 = 0x8C2FA009
CardDropFaultCode int64 = 0x40F84E00
// Paths of the JSON files read by LoadFaultCodeFromFile, LoadFaultCustomizationFromFile
// and LoadSwitchFaultCodeFromFile respectively.
faultCodeFilePath = "/usr/local/faultCode.json"
faultCustomizationFilePath = "/usr/local/faultCustomization.json"
switchFaultCodeFilePath = "/usr/local/SwitchFaultCode.json"
halfDivisor = 2
// Wait/interval settings typed as time.Duration but holding plain counts.
// NOTE(review): the unit is applied by callers outside this section — confirm (presumably seconds).
WaitNpuReadyTime time.Duration = 30
WaitErrorCodeCleanTime time.Duration = 30
WaitProcessesToZeroTime time.Duration = 60
ResetInterVal time.Duration = 5
PollingInterval time.Duration = DefaultPollingInterval
// SubHealthFault is a fault handling level name and a member of FaultTypeSet.
SubHealthFault = "SubHealthFault"
// Group names handed to loadValidSwitchFaultCode by LoadSwitchFaultCode; they only
// appear in the warning logged for a malformed switch fault code.
NotHandleFaultCodesStr = "NotHandleFaultCodes"
SubHealthFaultCodesStr = "SubHealthFaultCodes"
RestartRequestFaultCodesStr = "RestartRequestFaultCodes"
PreSeparateFaultCodesStr = "PreSeparateFaultCodes"
SeparateFaultCodesStr = "SeparateFaultCodes"
)
// Package-level fault state: parsed fault codes, per-device caches and the sets used for validation.
var (
// faultTypeCode holds the chip/network fault codes per handling level; it is replaced by LoadFaultCode.
faultTypeCode = FaultTypeCode{}
// Switch fault codes per handling level; all five lists are rebuilt by LoadSwitchFaultCode.
NotHandleFaultCodes = make([]string, 0, GeneralMapSize)
SubHealthFaultCodes = make([]string, 0, GeneralMapSize)
RestartRequestFaultCodes = make([]string, 0, GeneralMapSize)
PreSeparateFaultCodes = make([]string, 0, GeneralMapSize)
SeparateFaultCodes = make([]string, 0, GeneralMapSize)
// NOTE(review): the variables from initLogicIDs to manuallySeparateNpuMap are used outside
// this section; the int32 map keys are presumably device logic IDs — confirm.
initLogicIDs []int32
logicIDLock sync.Mutex
recoverFaultMap = make(map[int32][]int64, GeneralMapSize)
recoverNetworkFaultMap = make(map[int32][]int64, GeneralMapSize)
recoverFaultFrequencyMap = make(map[int32]string, GeneralMapSize)
devFaultInfoMap = make(map[int32][]common.DevFaultInfo, GeneralMapSize)
devFaultInfoMapLock sync.Mutex
SubscribeFailed bool
SwitchSubscribeFailed bool
Synchronize bool
manuallySeparateNpuMapLock sync.Mutex
manuallySeparateNpuMap = make(map[int32]ManuallyFaultInfo, GeneralMapSize)
// FaultTypeSet lists the FaultHandling values accepted by validateFaultFrequencyCustomization.
FaultTypeSet = sets.NewString(NotHandleFault, RestartRequest, RestartBusiness, FreeRestartNPU,
RestartNPU, PreSeparateNPU, SeparateNPU, ManuallySeparateNPU, SubHealthFault)
// FaultDurationTypeSet lists the FaultHandling values accepted by validateFaultDurationCustomization;
// it is FaultTypeSet without ManuallySeparateNPU.
FaultDurationTypeSet = sets.NewString(NotHandleFault, RestartRequest, RestartBusiness, FreeRestartNPU,
RestartNPU, PreSeparateNPU, SeparateNPU, SubHealthFault)
// NetworkFaultCodes are the event codes that faultCategoryFilters assigns to ParameterPlaneFaultKey.
NetworkFaultCodes = sets.NewInt64(LinkDownFaultCode, UBOEPortDownCode, UBOESubHealFaultCode, UBOEPreSeparateFaultCode)
// HyperPlaneFaultCodes are the event codes that faultCategoryFilters assigns to HyperPlaneFaultKey.
HyperPlaneFaultCodes = sets.NewInt64(UBPortDownCode, UBSeparateFaultCode, UBSubHealFaultCode)
// limiter allows FaultCallBackRateLimit events per minute with a burst of the same size.
// NOTE(review): its consumer is outside this section.
limiter = rate.NewLimiter(rate.Every(1*time.Minute/FaultCallBackRateLimit), FaultCallBackRateLimit)
)
// Package-level state driven by the fault customization file.
var (
// Grace-tolerance wait times; each one is overwritten by loadGraceToleranceCustomization,
// which falls back to the Default* value when the configured value is out of range.
WaitProcessReadCMTime time.Duration = DefaultProcessReadCMTime
WaitFaultSelfHealingTime time.Duration = DefaultWaitFaultSelfHealingTime
WaitDeviceResetTime time.Duration = DefaultWaitDeviceResetTime
// faultFrequencyMap is keyed by the lower-cased event id string; guarded by faultFrequencyMapLock.
faultFrequencyMap = make(map[string]*FaultFrequencyCache, common.MaxErrorCodeCount)
faultFrequencyMapLock sync.RWMutex
// faultDurationMap is keyed by the lower-cased event id string; faultDurationMapLock is the
// lock taken by copyFaultDurationConfig and ResetFaultCustomizationCache around it.
faultDurationMap = make(map[string]*FaultDurationCache, common.MaxErrorCodeCount)
faultDurationMapLock sync.RWMutex
// NOTE(review): faultSeverityMap is populated and read outside this section.
faultSeverityMap = make(map[int64]int8, common.MaxErrorCodeCount)
parseHexFailedMsg = "parse hex int failed and skip it, string: %s"
// Warning format used by mappingChipFaultToNetworkFaultCodesNotSupport (fault code, handling level).
networkFaultConfigureFailedMsg = "%x is a network fault and cannot be configured to %s now, " +
"fault handling policy is set to NotHandleFault"
hbmTool = NewHbmFaultManager()
// autoFillReasonReleaseTimeWindow is computed by setAutofillReasonReleaseTime; zero means "not set yet".
autoFillReasonReleaseTimeWindow int64 = 0
// UBOEPreciseFaultCodesMap maps UBOEPortDownCode to the set of UBOE event codes listed below.
// NOTE(review): the lookup semantics are defined by callers outside this section.
UBOEPreciseFaultCodesMap = map[int64]sets.Int64{
UBOEPortDownCode: sets.NewInt64(UBOEPortDownCode, UBOEPreSeparateFaultCode, UBOESubHealFaultCode),
}
// UBPreciseFaultCodesMap maps UBPortDownCode to the set of UB event codes listed below.
UBPreciseFaultCodesMap = map[int64]sets.Int64{
UBPortDownCode: sets.NewInt64(UBPortDownCode, UBSeparateFaultCode, UBSubHealFaultCode),
}
)
// copyFaultFrequencyConfig returns a snapshot of the FaultFrequency settings of every
// entry in faultFrequencyMap, keyed by event id. The map's read lock is held while copying.
func copyFaultFrequencyConfig() map[string]FaultFrequency {
	faultFrequencyMapLock.RLock()
	defer faultFrequencyMapLock.RUnlock()

	snapshot := make(map[string]FaultFrequency, len(faultFrequencyMap))
	for eventID, cache := range faultFrequencyMap {
		snapshot[eventID] = cache.FaultFrequency
	}
	return snapshot
}
// copyFaultDurationConfig returns a snapshot of the FaultDuration settings of every
// entry in faultDurationMap, keyed by event id. The map's read lock is held while copying.
func copyFaultDurationConfig() map[string]FaultDuration {
	faultDurationMapLock.RLock()
	defer faultDurationMapLock.RUnlock()

	snapshot := make(map[string]FaultDuration, len(faultDurationMap))
	for eventID, cache := range faultDurationMap {
		snapshot[eventID] = cache.FaultDuration
	}
	return snapshot
}
// ManuallyFaultInfo is the value type stored in manuallySeparateNpuMap.
// NOTE(review): field meanings are inferred from their names; the code that fills them is
// outside this section — confirm.
type ManuallyFaultInfo struct {
LogicID int32
FirstHandle bool
RecordTime int64
}
// FaultTypeCode groups parsed fault codes by handling level. LoadFaultCode fills every
// field from the fault code file; the three *NetworkCodes lists are then extended by
// mappingChipFaultToNetworkFaultCodesSupport/NotSupport with network codes found in the chip lists.
type FaultTypeCode struct {
NotHandleFaultCodes []int64
RestartRequestCodes []int64
RestartBusinessCodes []int64
RestartNPUCodes []int64
FreeRestartNPUCodes []int64
PreSeparateNPUCodes []int64
SeparateNPUCodes []int64
NotHandleFaultNetworkCodes []int64
PreSeparateNPUNetworkCodes []int64
SeparateNPUNetworkCodes []int64
SubHealthFaultCodes []int64
}
// faultFileInfo is the JSON shape LoadFaultCode decodes the fault code file into.
// Each list holds fault codes as strings that LoadFaultCode converts with
// StringTool.HexStringToInt. No JSON tags are declared, so keys match the field names.
type faultFileInfo struct {
NotHandleFaultCodes []string
RestartRequestCodes []string
RestartBusinessCodes []string
RestartNPUCodes []string
FreeRestartNPUCodes []string
SeparateNPUCodes []string
PreSeparateNPUCodes []string
NotHandleFaultNetworkCodes []string
PreSeparateNPUNetworkCodes []string
SeparateNPUNetworkCodes []string
SubHealthFaultCodes []string
}
// SwitchFaultFileInfo is the JSON shape LoadSwitchFaultCode decodes the switch fault code
// file into. LoadSwitchFaultCode appends ResetFaultCodes to SeparateFaultCodes before
// validating, so reset codes end up in the SeparateFaultCodes list.
type SwitchFaultFileInfo struct {
NotHandleFaultCodes []string
SubHealthFaultCodes []string
RestartRequestFaultCodes []string
PreSeparateFaultCodes []string
ResetFaultCodes []string
SeparateFaultCodes []string
}
// FaultCustomization is the JSON shape LoadFaultCustomization decodes; each field is
// handed to its own loader (grace tolerance, fault frequency, fault duration).
type FaultCustomization struct {
GraceTolerance GraceToleranceCustomization
FaultFrequency []FaultFrequencyCustomization
FaultDuration []FaultDurationCustomization
}
// GraceToleranceCustomization carries the configurable wait times. loadGraceToleranceCustomization
// range-checks each value and stores it (or the default) into the package variable of the same name.
type GraceToleranceCustomization struct {
WaitProcessReadCMTime int64
WaitDeviceResetTime int64
WaitFaultSelfHealingTime int64
}
// FaultFrequencyCustomization applies one FaultFrequency setting to a list of event ids.
// loadFaultFrequencyCustomization lower-cases each id before using it as a map key.
type FaultFrequencyCustomization struct {
EventId []string
FaultFrequency
}
// FaultFrequencyCache is the per-event-id entry of faultFrequencyMap: the configured
// FaultFrequency plus per-device records keyed by logic id.
type FaultFrequencyCache struct {
// Frequency holds the fault times appended by insertFrequencyFaultOccur.
Frequency map[int32][]int64
// LastFaultTime is the most recent time recorded by insertFrequencyFaultOccur.
LastFaultTime map[int32]int64
// LastFaultRecoverTime is the most recent time recorded by insertFrequencyFaultRecover.
LastFaultRecoverTime map[int32]int64
FaultFrequency
}
// FaultFrequency is one fault-frequency setting. validateFaultFrequencyCustomization checks
// TimeWindow, Times and ReleaseTimeWindow against their Min/Max limits and FaultHandling
// against FaultTypeSet; a ReleaseTimeWindow of 0 is replaced by MaxReleaseTimeWindow there.
// NOTE(review): the units of the windows are not visible in this section.
type FaultFrequency struct {
TimeWindow int64
Times int64
FaultHandling string
ReleaseTimeWindow int64
}
// FaultDurationCustomization applies one FaultDuration setting to a list of event ids.
// loadFaultDurationCustomization lower-cases each id before using it as a map key.
type FaultDurationCustomization struct {
EventId []string
FaultDuration
}
// FaultDurationCache is the per-event-id entry of faultDurationMap: the configured
// FaultDuration plus per-device data.
type FaultDurationCache struct {
// Duration is keyed by an int32 — presumably the device logic id; confirm against callers.
Duration map[int32]FaultDurationData
FaultDuration
}
// FaultDurationData is the per-device value stored in FaultDurationCache.Duration.
// NOTE(review): the code that reads and writes these fields is outside this section;
// field meanings are only suggested by their names — confirm before relying on them.
type FaultDurationData struct {
TimeoutStatus bool
FaultEventQueue []common.DevFaultInfo
FaultDurationTime int64
FaultRecoverDurationTime int64
FaultAlarmTime int64
}
// FaultDuration is one fault-duration setting. validateFaultDurationCustomization checks
// FaultTimeout and RecoverTimeout against their Min/Max limits and FaultHandling against
// FaultDurationTypeSet. NOTE(review): the units of the timeouts are not visible in this section.
type FaultDuration struct {
FaultTimeout int64
RecoverTimeout int64
FaultHandling string
}
// handleDurationInputPara bundles the arguments of a fault-duration handling routine.
// NOTE(review): its consumer is outside this section; field meanings are only suggested
// by their names — confirm.
type handleDurationInputPara struct {
logicID int32
eventId string
index int
timeoutStatus bool
duration int64
faultAlarmTime int64
}
// isA950CardType reports whether ParamOption.RealCardType equals Ascend910A5.
func isA950CardType() bool {
	cardType := ParamOption.RealCardType
	return cardType == Ascend910A5
}
// FaultHandlingStep is one named unit of fault handling work; the get*Steps functions
// in this file build ordered slices of them.
type FaultHandlingStep struct {
// Name identifies the step.
Name string
// Do performs the step; it takes no arguments and captures its inputs by closure.
Do func()
}
// faultCategoryFilter pairs a category name with the predicate deciding whether an
// event id belongs to that category. It is used by ClassifyFaultInfos.
type faultCategoryFilter struct {
name string
matches func(eventID int64) bool
}
// faultCategoryFilters is evaluated in order by ClassifyFaultInfos and the first match wins:
// parameter-plane codes first, then hyper-plane codes, and the final chip entry matches
// every remaining event id.
var faultCategoryFilters = []faultCategoryFilter{
{name: ParameterPlaneFaultKey, matches: func(eventID int64) bool { return NetworkFaultCodes.Has(eventID) }},
{name: HyperPlaneFaultKey, matches: func(eventID int64) bool { return HyperPlaneFaultCodes.Has(eventID) }},
{name: ChipFaultKey, matches: func(eventID int64) bool { return true }},
}
// ClassifyFaultInfos splits faultInfos by category. The result has one key per entry of
// faultCategoryFilters (each mapped to a non-nil, possibly empty slice), and every fault
// info is placed in the first category whose predicate accepts its EventID.
func ClassifyFaultInfos(faultInfos []common.DevFaultInfo) map[string][]common.DevFaultInfo {
	classified := make(map[string][]common.DevFaultInfo)
	for idx := range faultCategoryFilters {
		classified[faultCategoryFilters[idx].name] = []common.DevFaultInfo{}
	}
	for _, info := range faultInfos {
		for _, filter := range faultCategoryFilters {
			if !filter.matches(info.EventID) {
				continue
			}
			// First matching category wins.
			classified[filter.name] = append(classified[filter.name], info)
			break
		}
	}
	return classified
}
// getChipFaultPreSteps returns the steps to run before chip fault handling.
// It currently returns an empty, non-nil slice and ignores its arguments.
func getChipFaultPreSteps(logicID int32, chipFaultInfos []common.DevFaultInfo) []FaultHandlingStep {
	return make([]FaultHandlingStep, 0)
}
// getBaseChipFaultSteps returns the base chip fault steps in execution order:
// recover handling first, then occur handling.
func getBaseChipFaultSteps(logicID int32, chipFaultInfos []common.DevFaultInfo,
	curFaultCodesMap sets.Int64, device *NpuDevice) []FaultHandlingStep {
	recoverStep := FaultHandlingStep{
		Name: "baseChipFaultRecover",
		Do:   func() { baseChipFaultRecover(logicID, chipFaultInfos, curFaultCodesMap, device) },
	}
	occurStep := FaultHandlingStep{
		Name: "baseChipFaultOccur",
		Do:   func() { baseChipFaultOccur(chipFaultInfos, device) },
	}
	return []FaultHandlingStep{recoverStep, occurStep}
}
// getA950ChipFaultSteps returns the A950 chip fault steps in execution order:
// recover handling first, then occur handling.
func getA950ChipFaultSteps(logicID int32, chipFaultInfos []common.DevFaultInfo,
	curFaultCodesMap sets.Int64, device *NpuDevice) []FaultHandlingStep {
	recoverStep := FaultHandlingStep{
		Name: "a950ChipFaultRecover",
		Do:   func() { a950ChipFaultRecover(logicID, chipFaultInfos, curFaultCodesMap, device) },
	}
	occurStep := FaultHandlingStep{
		Name: "a950ChipFaultOccur",
		Do:   func() { a950ChipFaultOccur(chipFaultInfos, device) },
	}
	return []FaultHandlingStep{recoverStep, occurStep}
}
// getChipFaultPostSteps returns the steps to run after chip fault handling:
// a single step that calls setAlarmRaisedTime on the device.
func getChipFaultPostSteps(device *NpuDevice) []FaultHandlingStep {
	updateAlarmTime := FaultHandlingStep{
		Name: "updateAlarmTime",
		Do:   func() { setAlarmRaisedTime(device) },
	}
	return []FaultHandlingStep{updateAlarmTime}
}
// getParameterPlaneFaultPreSteps returns the steps to run before parameter-plane fault
// handling. It currently returns an empty, non-nil slice and ignores its arguments.
func getParameterPlaneFaultPreSteps(logicID int32, chipFaultInfos []common.DevFaultInfo) []FaultHandlingStep {
	return make([]FaultHandlingStep, 0)
}
// getBaseParameterPlaneFaultSteps returns the base parameter-plane fault steps in
// execution order: recover handling first, then occur handling.
func getBaseParameterPlaneFaultSteps(logicID int32, networkFaultInfos []common.DevFaultInfo,
	device *NpuDevice) []FaultHandlingStep {
	recoverStep := FaultHandlingStep{
		Name: "baseParameterPlaneFaultRecover",
		Do:   func() { baseParameterPlaneFaultRecover(logicID, networkFaultInfos, device) },
	}
	occurStep := FaultHandlingStep{
		Name: "baseParameterPlaneFaultOccur",
		Do:   func() { baseParameterPlaneFaultOccur(networkFaultInfos, device) },
	}
	return []FaultHandlingStep{recoverStep, occurStep}
}
// getA950ParameterPlaneFaultSteps returns the A950 parameter-plane fault steps in
// execution order: recover handling first, then occur handling.
func getA950ParameterPlaneFaultSteps(logicID int32, networkFaultInfos []common.DevFaultInfo,
	device *NpuDevice) []FaultHandlingStep {
	recoverStep := FaultHandlingStep{
		Name: "a950ParameterPlaneFaultRecover",
		Do:   func() { a950ParameterPlaneFaultRecover(logicID, networkFaultInfos, device) },
	}
	occurStep := FaultHandlingStep{
		Name: "a950ParameterPlaneFaultOccur",
		Do:   func() { a950ParameterPlaneFaultOccur(logicID, networkFaultInfos, device) },
	}
	return []FaultHandlingStep{recoverStep, occurStep}
}
// getParameterPlaneFaultPostSteps returns the steps to run after parameter-plane fault
// handling: a single step that calls setNetworkAlarmRaisedTime on the device.
func getParameterPlaneFaultPostSteps(device *NpuDevice) []FaultHandlingStep {
	updateNetworkAlarmTime := FaultHandlingStep{
		Name: "updateNetworkAlarmTime",
		Do:   func() { setNetworkAlarmRaisedTime(device) },
	}
	return []FaultHandlingStep{updateNetworkAlarmTime}
}
// getHyperPlaneFaultPreSteps returns the steps to run before hyper-plane fault handling.
// It currently returns an empty, non-nil slice and ignores its arguments.
func getHyperPlaneFaultPreSteps(logicID int32, hyperPlaneFaultInfos []common.DevFaultInfo) []FaultHandlingStep {
	return make([]FaultHandlingStep, 0)
}
// getA950HyperPlaneFaultSteps returns the A950 hyper-plane fault steps in execution
// order: recover handling first, then occur handling.
func getA950HyperPlaneFaultSteps(logicID int32, hyperPlaneFaultInfos []common.DevFaultInfo,
	device *NpuDevice) []FaultHandlingStep {
	recoverStep := FaultHandlingStep{
		Name: "a950HyperPlaneFaultRecover",
		Do:   func() { a950HyperPlaneFaultRecover(logicID, hyperPlaneFaultInfos, device) },
	}
	occurStep := FaultHandlingStep{
		Name: "a950HyperPlaneFaultOccur",
		Do:   func() { a950HyperPlaneFaultOccur(logicID, hyperPlaneFaultInfos, device) },
	}
	return []FaultHandlingStep{recoverStep, occurStep}
}
// getHyperPlaneOverallFaultPreSteps returns the steps to run before overall hyper-plane
// fault handling. It currently returns an empty, non-nil slice and ignores its argument.
func getHyperPlaneOverallFaultPreSteps(devices []*NpuDevice) []FaultHandlingStep {
	return make([]FaultHandlingStep, 0)
}
// getA950HyperPlaneNewOverallFaultSteps returns the single A950 step that calls
// a950HyperPlaneNewOverallFaultModify on the given devices.
func getA950HyperPlaneNewOverallFaultSteps(devices []*NpuDevice) []FaultHandlingStep {
	modifyStep := FaultHandlingStep{
		Name: "a950HyperPlaneNewOverallFaultModify",
		Do:   func() { a950HyperPlaneNewOverallFaultModify(devices) },
	}
	return []FaultHandlingStep{modifyStep}
}
// DevFaultInfoBasedTimeAscend implements sort.Interface over fault infos, ordering
// them by AlarmRaisedTime ascending.
type DevFaultInfoBasedTimeAscend []common.DevFaultInfo

// Len returns the number of fault infos.
func (devFault DevFaultInfoBasedTimeAscend) Len() int {
	return len(devFault)
}

// Swap exchanges the elements at i and j. An out-of-range index (negative or past the
// end) is logged and the call is a no-op instead of panicking.
func (devFault DevFaultInfoBasedTimeAscend) Swap(i, j int) {
	if i < 0 || j < 0 || i >= len(devFault) || j >= len(devFault) {
		hwlog.RunLog.Errorf("index out of range, i: %d, j: %d, length: %d", i, j, len(devFault))
		return
	}
	devFault[i], devFault[j] = devFault[j], devFault[i]
}

// Less reports whether element i was raised strictly before element j. An out-of-range
// index (negative or past the end) is logged and false is returned instead of panicking.
func (devFault DevFaultInfoBasedTimeAscend) Less(i, j int) bool {
	if i < 0 || j < 0 || i >= len(devFault) || j >= len(devFault) {
		hwlog.RunLog.Errorf("index out of range, i: %d, j: %d, length: %d", i, j, len(devFault))
		return false
	}
	return devFault[i].AlarmRaisedTime < devFault[j].AlarmRaisedTime
}
// HbmFaultManager keeps per-device state used to correlate aic/aiv fault events with
// npu memory faults. Both maps are keyed by device logic id.
type HbmFaultManager struct {
// HbmOccurTimeCache stores the AlarmRaisedTime of the last memory fault passed to updateHbmOccurTime.
HbmOccurTimeCache map[int32]int64
// AicFaultEventQue stores queued fault events, kept sorted by AlarmRaisedTime by aicFaultEventInQue.
AicFaultEventQue map[int32][]common.DevFaultInfo
}
// NewHbmFaultManager returns a manager whose two caches are allocated and empty.
func NewHbmFaultManager() *HbmFaultManager {
	manager := new(HbmFaultManager)
	manager.HbmOccurTimeCache = make(map[int32]int64, GeneralMapSize)
	manager.AicFaultEventQue = make(map[int32][]common.DevFaultInfo, GeneralMapSize)
	return manager
}
// updateHbmOccurTime records faultInfo.AlarmRaisedTime as the latest memory fault time
// of the device identified by faultInfo.LogicID.
func (h *HbmFaultManager) updateHbmOccurTime(faultInfo common.DevFaultInfo) {
	logicID := faultInfo.LogicID
	h.HbmOccurTimeCache[logicID] = faultInfo.AlarmRaisedTime
	occurTime := h.HbmOccurTimeCache[logicID]
	hwlog.RunLog.Debugf("npu memory fault occur, device %d update occur time: %d", logicID, occurTime)
}
// aicFaultEventInQue appends faultInfo to its device's event queue and re-sorts the
// queue by AlarmRaisedTime ascending.
func (h *HbmFaultManager) aicFaultEventInQue(faultInfo common.DevFaultInfo) {
	logicID := faultInfo.LogicID
	queue, exists := h.AicFaultEventQue[logicID]
	if !exists {
		queue = []common.DevFaultInfo{}
	}
	queue = append(queue, faultInfo)
	sort.Sort(DevFaultInfoBasedTimeAscend(queue))
	h.AicFaultEventQue[logicID] = queue
	hwlog.RunLog.Debugf("aic/aiv fault event %d in que, device %d new event que:%#v",
		faultInfo.EventID, logicID, queue)
}
// aicFaultEventOutQue drains the queued fault events of device logicId and returns the
// ones that are ready to be reported. Each queued event falls into one of three cases:
//   - its AlarmRaisedTime is within AssociatedFaultDiagnosisTime*TimeMilliseconds of the
//     cached memory fault time (as measured by Int64Tool.Abs): dropped, not returned;
//   - it is older than that window relative to now: removed from the queue and returned;
//   - otherwise: kept in the queue for a later call.
//
// A missing memory fault time for the device is initialised to 0. An empty, non-nil
// slice is returned when the device has no queue.
func (h *HbmFaultManager) aicFaultEventOutQue(logicId int32) []common.DevFaultInfo {
	faultInfoList := make([]common.DevFaultInfo, 0)
	faultEventQue, ok := h.AicFaultEventQue[logicId]
	if !ok {
		return faultInfoList
	}
	if _, ok := h.HbmOccurTimeCache[logicId]; !ok {
		h.HbmOccurTimeCache[logicId] = 0
	}
	newFaultEventQue := make([]common.DevFaultInfo, 0)
	nowTime := time.Now().UnixMilli()
	for _, event := range faultEventQue {
		if Int64Tool.Abs(h.HbmOccurTimeCache[logicId], event.AlarmRaisedTime) <
			AssociatedFaultDiagnosisTime*TimeMilliseconds {
			hwlog.RunLog.Infof("device %d delete event in fault event que, aic event time %d ,"+
				"npu memory event time %d", logicId, event.AlarmRaisedTime, h.HbmOccurTimeCache[logicId])
			continue
		}
		if nowTime-event.AlarmRaisedTime > AssociatedFaultDiagnosisTime*TimeMilliseconds {
			// The verb used to be written "% d" (space flag + 'd' of "delete"), which garbled the message.
			hwlog.RunLog.Infof("device %d delete event in fault event que, aic event time %d now time %d",
				logicId, event.AlarmRaisedTime, nowTime)
			faultInfoList = append(faultInfoList, event)
			continue
		}
		newFaultEventQue = append(newFaultEventQue, event)
	}
	h.AicFaultEventQue[logicId] = newFaultEventQue
	return faultInfoList
}
// LoadFaultCodeFromFile reads faultCodeFilePath and passes its content to LoadFaultCode.
// It returns an error when the file cannot be loaded or LoadFaultCode fails.
func LoadFaultCodeFromFile() error {
	content, loadErr := utils.LoadFile(faultCodeFilePath)
	if loadErr == nil {
		return LoadFaultCode(content)
	}
	return fmt.Errorf("load fault code json failed: %v", loadErr)
}
// LoadSwitchFaultCodeFromFile reads switchFaultCodeFilePath and passes its content to
// LoadSwitchFaultCode. It returns an error when the file cannot be loaded or parsing fails.
func LoadSwitchFaultCodeFromFile() error {
	content, loadErr := utils.LoadFile(switchFaultCodeFilePath)
	if loadErr == nil {
		return LoadSwitchFaultCode(content)
	}
	return fmt.Errorf("load switch fault code failed: %v", loadErr)
}
// LoadFaultCustomizationFromFile reads faultCustomizationFilePath and passes its content
// to LoadFaultCustomization. It returns an error when the file cannot be loaded or
// LoadFaultCustomization fails.
func LoadFaultCustomizationFromFile() error {
	content, loadErr := utils.LoadFile(faultCustomizationFilePath)
	if loadErr != nil {
		return fmt.Errorf("load fault customization json failed: %v", loadErr)
	}
	return LoadFaultCustomization(content)
}
// ResetFaultCustomizationCache replaces faultFrequencyMap and faultDurationMap with
// fresh empty maps, each swap done under the corresponding write lock.
func ResetFaultCustomizationCache() {
	hwlog.RunLog.Debug("reset fault customization, fault customization cache will be cleared")

	func() {
		faultFrequencyMapLock.Lock()
		defer faultFrequencyMapLock.Unlock()
		faultFrequencyMap = make(map[string]*FaultFrequencyCache, common.MaxErrorCodeCount)
	}()

	func() {
		faultDurationMapLock.Lock()
		defer faultDurationMapLock.Unlock()
		faultDurationMap = make(map[string]*FaultDurationCache, common.MaxErrorCodeCount)
	}()
}
// LoadFaultCode decodes faultCodeBytes as JSON, converts every code list with
// StringTool.HexStringToInt, replaces the package-level faultTypeCode with the result,
// and then folds network fault codes found in the chip lists into the network lists.
// It returns an error only when the JSON cannot be decoded.
func LoadFaultCode(faultCodeBytes []byte) error {
	parsed := faultFileInfo{}
	err := json.Unmarshal(faultCodeBytes, &parsed)
	if err != nil {
		return fmt.Errorf("unmarshal fault code byte failed: %v", err)
	}

	var codes FaultTypeCode
	codes.NotHandleFaultCodes = StringTool.HexStringToInt(parsed.NotHandleFaultCodes)
	codes.RestartRequestCodes = StringTool.HexStringToInt(parsed.RestartRequestCodes)
	codes.RestartBusinessCodes = StringTool.HexStringToInt(parsed.RestartBusinessCodes)
	codes.RestartNPUCodes = StringTool.HexStringToInt(parsed.RestartNPUCodes)
	codes.FreeRestartNPUCodes = StringTool.HexStringToInt(parsed.FreeRestartNPUCodes)
	codes.PreSeparateNPUCodes = StringTool.HexStringToInt(parsed.PreSeparateNPUCodes)
	codes.SeparateNPUCodes = StringTool.HexStringToInt(parsed.SeparateNPUCodes)
	codes.NotHandleFaultNetworkCodes = StringTool.HexStringToInt(parsed.NotHandleFaultNetworkCodes)
	codes.PreSeparateNPUNetworkCodes = StringTool.HexStringToInt(parsed.PreSeparateNPUNetworkCodes)
	codes.SeparateNPUNetworkCodes = StringTool.HexStringToInt(parsed.SeparateNPUNetworkCodes)
	codes.SubHealthFaultCodes = StringTool.HexStringToInt(parsed.SubHealthFaultCodes)
	faultTypeCode = codes

	mappingChipFaultToNetworkFaultCodesSupport()
	mappingChipFaultToNetworkFaultCodesNotSupport()
	return nil
}
// mappingChipFaultToNetworkFaultCodesSupport copies every network fault code configured
// in a chip list whose handling level also exists for network faults (NotHandleFault,
// PreSeparateNPU, SeparateNPU) into the matching network list of faultTypeCode.
func mappingChipFaultToNetworkFaultCodesSupport() {
	mappings := []struct {
		chipCodes    []int64
		networkCodes *[]int64
	}{
		{faultTypeCode.NotHandleFaultCodes, &faultTypeCode.NotHandleFaultNetworkCodes},
		{faultTypeCode.PreSeparateNPUCodes, &faultTypeCode.PreSeparateNPUNetworkCodes},
		{faultTypeCode.SeparateNPUCodes, &faultTypeCode.SeparateNPUNetworkCodes},
	}
	for _, mapping := range mappings {
		for _, code := range mapping.chipCodes {
			if !NetworkFaultCodes.Has(code) {
				continue
			}
			*mapping.networkCodes = append(*mapping.networkCodes, code)
		}
	}
}
// mappingChipFaultToNetworkFaultCodesNotSupport handles network fault codes configured
// with a handling level that network faults do not support (RestartRequest,
// RestartBusiness, RestartNPU, FreeRestartNPU): each one is reported with a warning and
// added to faultTypeCode.NotHandleFaultNetworkCodes.
func mappingChipFaultToNetworkFaultCodesNotSupport() {
	unsupported := []struct {
		codes    []int64
		handling string
	}{
		{faultTypeCode.RestartRequestCodes, RestartRequest},
		{faultTypeCode.RestartBusinessCodes, RestartBusiness},
		{faultTypeCode.RestartNPUCodes, RestartNPU},
		{faultTypeCode.FreeRestartNPUCodes, FreeRestartNPU},
	}
	for _, group := range unsupported {
		for _, code := range group.codes {
			if !NetworkFaultCodes.Has(code) {
				continue
			}
			hwlog.RunLog.Warnf(networkFaultConfigureFailedMsg, code, group.handling)
			faultTypeCode.NotHandleFaultNetworkCodes = append(faultTypeCode.NotHandleFaultNetworkCodes, code)
		}
	}
}
// LoadFaultCustomization decodes faultCustomizationByte as JSON and applies it in order:
// grace tolerance, fault frequency, the auto-fill release time window, fault duration,
// and finally checkAndUpdateExistingUpgradeFaults with snapshots of the frequency and
// duration settings. It returns the decode error, if any, after logging it.
func LoadFaultCustomization(faultCustomizationByte []byte) error {
	custom := FaultCustomization{}
	err := json.Unmarshal(faultCustomizationByte, &custom)
	if err != nil {
		hwlog.RunLog.Errorf("load fault customization failed, unmarshal err: %v", err)
		return err
	}

	loadGraceToleranceCustomization(custom.GraceTolerance)
	loadFaultFrequencyCustomization(custom.FaultFrequency)
	setAutofillReasonReleaseTime()
	loadFaultDurationCustomization(custom.FaultDuration)

	// Arguments are evaluated left to right: frequency snapshot first, then duration.
	checkAndUpdateExistingUpgradeFaults(copyFaultFrequencyConfig(), copyFaultDurationConfig())
	return nil
}
// loadValidSwitchFaultCode appends every code accepted by isValidSwitchFaultCode to
// *target. Rejected codes are skipped with a warning that mentions codeType.
func loadValidSwitchFaultCode(codes []string, target *[]string, codeType string) {
	for idx := range codes {
		code := codes[idx]
		if isValidSwitchFaultCode(code) {
			*target = append(*target, code)
			continue
		}
		hwlog.RunLog.Warnf("failed to parse %s faultCode:%v, will ignore it,"+
			" please check if its format, such as: [0x00f1ff09,155914,cpu,na]", codeType, code)
	}
}
// LoadSwitchFaultCode decodes switchFaultCodeByte as JSON, resets the five package-level
// switch fault code lists and refills each of them with the valid codes of its group.
// Reset fault codes are treated as separate fault codes. It returns an error only when
// the JSON cannot be decoded.
func LoadSwitchFaultCode(switchFaultCodeByte []byte) error {
	info := SwitchFaultFileInfo{}
	if err := json.Unmarshal(switchFaultCodeByte, &info); err != nil {
		return fmt.Errorf("failed to unmarshal switch fault code, err: %s", err.Error())
	}

	NotHandleFaultCodes = make([]string, 0, GeneralMapSize)
	SubHealthFaultCodes = make([]string, 0, GeneralMapSize)
	RestartRequestFaultCodes = make([]string, 0, GeneralMapSize)
	PreSeparateFaultCodes = make([]string, 0, GeneralMapSize)
	SeparateFaultCodes = make([]string, 0, GeneralMapSize)

	// Reset codes are loaded into the separate list, after the configured separate codes.
	separateCodes := append(info.SeparateFaultCodes, info.ResetFaultCodes...)

	loadValidSwitchFaultCode(info.NotHandleFaultCodes, &NotHandleFaultCodes, NotHandleFaultCodesStr)
	loadValidSwitchFaultCode(info.SubHealthFaultCodes, &SubHealthFaultCodes, SubHealthFaultCodesStr)
	loadValidSwitchFaultCode(info.RestartRequestFaultCodes, &RestartRequestFaultCodes, RestartRequestFaultCodesStr)
	loadValidSwitchFaultCode(info.PreSeparateFaultCodes, &PreSeparateFaultCodes, PreSeparateFaultCodesStr)
	loadValidSwitchFaultCode(separateCodes, &SeparateFaultCodes, SeparateFaultCodesStr)
	return nil
}
// isValidSwitchFaultCode reports whether code is no longer than MaxLengthOfFaultCode,
// is wrapped in square brackets, and splits into exactly PartNumOfFaultCode parts on
// CommaSepDev.
func isValidSwitchFaultCode(code string) bool {
	switch {
	case len(code) > MaxLengthOfFaultCode:
		return false
	case !strings.HasPrefix(code, "["), !strings.HasSuffix(code, "]"):
		return false
	}
	partCount := len(strings.Split(code, CommaSepDev))
	return partCount == PartNumOfFaultCode
}
// loadFaultDurationCustomization synchronises faultDurationMap with customization:
//   - entries failing validateFaultDurationCustomization are skipped;
//   - each event id is lower-cased; an id seen more than once keeps its first setting;
//   - an existing cache entry has only its FaultDuration settings updated, so its
//     per-device Duration data is preserved; a missing entry is created;
//   - cached non-empty ids that are absent from customization are deleted.
//
// The whole update runs under faultDurationMapLock, the same lock the other accessors
// of faultDurationMap in this file take, so readers never observe a half-applied
// configuration. Callers must not hold faultDurationMapLock.
func loadFaultDurationCustomization(customization []FaultDurationCustomization) {
	handledEventId := make(sets.String, common.MaxErrorCodeCount)
	faultDurationMapLock.Lock()
	defer faultDurationMapLock.Unlock()
	for _, cus := range customization {
		if !validateFaultDurationCustomization(cus) {
			continue
		}
		for _, id := range cus.EventId {
			id = strings.ToLower(id)
			if handledEventId.Has(id) {
				hwlog.RunLog.Warnf("duplicated event id detected when handling FaultDuration, skip, "+
					"event id: %s", id)
				continue
			}
			handledEventId.Insert(id)
			if cache, ok := faultDurationMap[id]; ok {
				cache.FaultTimeout = cus.FaultTimeout
				cache.RecoverTimeout = cus.RecoverTimeout
				cache.FaultHandling = cus.FaultHandling
				hwlog.RunLog.Debugf("update FaultDuration for event id %s success, FaultTimeout: %d, "+
					"RecoverTimeout: %d, FaultHandling: %s", id, cus.FaultTimeout, cus.RecoverTimeout,
					cus.FaultHandling)
				continue
			}
			faultDurationMap[id] = &FaultDurationCache{
				Duration: make(map[int32]FaultDurationData, GeneralMapSize),
				FaultDuration: FaultDuration{
					FaultTimeout:   cus.FaultTimeout,
					RecoverTimeout: cus.RecoverTimeout,
					FaultHandling:  cus.FaultHandling,
				},
			}
			hwlog.RunLog.Debugf("insert FaultDuration for event id %s success, FaultTimeout: %d, "+
				"RecoverTimeout: %d, FaultHandling: %s", id, cus.FaultTimeout, cus.RecoverTimeout,
				cus.FaultHandling)
		}
	}
	// Deleting entries while ranging over a map is well-defined in Go.
	for cachedId := range faultDurationMap {
		if !handledEventId.Has(cachedId) && len(cachedId) != 0 {
			delete(faultDurationMap, cachedId)
			hwlog.RunLog.Infof("delete FaultDuration for event id %s", cachedId)
		}
	}
}
// loadGraceToleranceCustomization applies the three configured wait times. Each value is
// checked against its Min/Max limits; an in-range value is stored in the package
// variable of the same name, an out-of-range value is logged as an error and replaced
// by the corresponding default.
func loadGraceToleranceCustomization(customization GraceToleranceCustomization) {
	if customization.WaitDeviceResetTime < MinWaitDeviceResetTime ||
		customization.WaitDeviceResetTime > MaxWaitDeviceResetTime {
		hwlog.RunLog.Errorf("WaitDeviceResetTime(%d) exceed limit(%d~%d), use default(%d)",
			customization.WaitDeviceResetTime, MinWaitDeviceResetTime,
			MaxWaitDeviceResetTime, DefaultWaitDeviceResetTime)
		WaitDeviceResetTime = DefaultWaitDeviceResetTime
	} else {
		hwlog.RunLog.Debugf("modify WaitDeviceResetTime(%d) success", customization.WaitDeviceResetTime)
		WaitDeviceResetTime = time.Duration(customization.WaitDeviceResetTime)
	}

	if customization.WaitProcessReadCMTime < MinWaitProcessReadCMTime ||
		customization.WaitProcessReadCMTime > MaxWaitProcessReadCMTime {
		hwlog.RunLog.Errorf("WaitProcessReadCMTime(%d) exceed limit(%d~%d), use default(%d)",
			customization.WaitProcessReadCMTime, MinWaitProcessReadCMTime,
			MaxWaitProcessReadCMTime, DefaultProcessReadCMTime)
		WaitProcessReadCMTime = DefaultProcessReadCMTime
	} else {
		hwlog.RunLog.Debugf("modify WaitProcessReadCMTime(%d) success", customization.WaitProcessReadCMTime)
		WaitProcessReadCMTime = time.Duration(customization.WaitProcessReadCMTime)
	}

	if customization.WaitFaultSelfHealingTime < MinWaitFaultSelfHealingTime ||
		time.Duration(customization.WaitFaultSelfHealingTime) > MaxWaitFaultSelfHealingTime {
		// Report the limit that was actually checked (MaxWaitFaultSelfHealingTime).
		hwlog.RunLog.Errorf("WaitFaultSelfHealingTime(%d) exceed limit(%d~%d), use default(%d)",
			customization.WaitFaultSelfHealingTime,
			MinWaitFaultSelfHealingTime, MaxWaitFaultSelfHealingTime, DefaultWaitFaultSelfHealingTime)
		WaitFaultSelfHealingTime = DefaultWaitFaultSelfHealingTime
	} else {
		hwlog.RunLog.Debugf("modify WaitFaultSelfHealingTime(%d) success", customization.WaitFaultSelfHealingTime)
		WaitFaultSelfHealingTime = time.Duration(customization.WaitFaultSelfHealingTime)
	}
}
// setAutofillReasonReleaseTime computes autoFillReasonReleaseTimeWindow once: the largest
// ReleaseTimeWindow among the cached fault frequency settings, ignoring entries equal to
// MaxReleaseTimeWindow, or MaxReleaseTimeWindow when no such entry exists. If the window
// has already been set (non-zero), it is left untouched and a warning is logged.
// Runs under faultFrequencyMapLock.
func setAutofillReasonReleaseTime() {
	faultFrequencyMapLock.Lock()
	defer faultFrequencyMapLock.Unlock()

	if current := autoFillReasonReleaseTimeWindow; current != 0 {
		hwlog.RunLog.Warnf("AutoFillReasonReleaseTimeWindow has been set, "+
			"current value is %v", current)
		return
	}

	var window int64
	for _, cache := range faultFrequencyMap {
		release := cache.ReleaseTimeWindow
		if release != MaxReleaseTimeWindow && window < release {
			window = release
		}
	}
	if window == 0 {
		window = MaxReleaseTimeWindow
	}
	autoFillReasonReleaseTimeWindow = window
	hwlog.RunLog.Infof("AutoFillReasonReleaseTimeWindow is %v", autoFillReasonReleaseTimeWindow)
}
// GetAutofillReasonReleaseTime returns autoFillReasonReleaseTimeWindow, read while
// holding faultFrequencyMapLock.
func GetAutofillReasonReleaseTime() int64 {
	faultFrequencyMapLock.Lock()
	window := autoFillReasonReleaseTimeWindow
	faultFrequencyMapLock.Unlock()
	return window
}
// loadFaultFrequencyCustomization synchronises faultFrequencyMap with customizations
// while holding faultFrequencyMapLock:
//   - entries failing validateFaultFrequencyCustomization are skipped (validation may
//     fill in ReleaseTimeWindow on the loop copy);
//   - each event id is lower-cased; an id seen more than once keeps its first setting;
//   - an existing cache entry has only its FaultFrequency settings replaced, a missing
//     entry is created with empty per-device records;
//   - cached non-empty ids that are absent from customizations are deleted.
func loadFaultFrequencyCustomization(customizations []FaultFrequencyCustomization) {
	seen := make(sets.String, GeneralMapSize)
	faultFrequencyMapLock.Lock()
	defer faultFrequencyMapLock.Unlock()

	for _, cus := range customizations {
		if !validateFaultFrequencyCustomization(&cus) {
			continue
		}
		for _, rawID := range cus.EventId {
			id := strings.ToLower(rawID)
			if seen.Has(id) {
				hwlog.RunLog.Warnf("duplicated event id detected when handling FaultFrequency, "+
					"skip, event id: %s", id)
				continue
			}
			seen.Insert(id)

			cache, exists := faultFrequencyMap[id]
			if !exists {
				faultFrequencyMap[id] = &FaultFrequencyCache{
					Frequency:            make(map[int32][]int64, common.MaxErrorCodeCount),
					LastFaultTime:        make(map[int32]int64),
					LastFaultRecoverTime: make(map[int32]int64),
					FaultFrequency:       cus.FaultFrequency,
				}
				hwlog.RunLog.Debugf("insert FaultFrequency for event id %s success: %v", id, cus)
				continue
			}
			// FaultFrequency consists of exactly the four configurable settings.
			cache.FaultFrequency = cus.FaultFrequency
			hwlog.RunLog.Debugf("update FaultFrequency for event id %s success, TimeWindow: %d, "+
				"Times: %d, FaultHandling: %s", id, cus.TimeWindow, cus.Times, cus.FaultHandling)
		}
	}

	// Deleting entries while ranging over a map is well-defined in Go.
	for cachedID := range faultFrequencyMap {
		if len(cachedID) == 0 || seen.Has(cachedID) {
			continue
		}
		delete(faultFrequencyMap, cachedID)
		hwlog.RunLog.Infof("delete FaultFrequency for event id %s", cachedID)
	}
}
// insertFrequencyFaultOccur records one occurrence of eventId on device logicId in the
// fault frequency cache. Events without a frequency configuration are ignored. A
// faultTime of 0 is replaced by the current time in milliseconds. The time is appended
// to the device's history and stored as its last fault time.
func insertFrequencyFaultOccur(logicId int32, eventId int64, faultTime int64) {
	faultFrequencyMapLock.Lock()
	defer faultFrequencyMapLock.Unlock()

	eventIdStr := strings.ToLower(strconv.FormatInt(eventId, Hex))
	cache, configured := faultFrequencyMap[eventIdStr]
	if !configured {
		hwlog.RunLog.Debugf("skip inserting event id %s to fault frequency cache, no config found", eventIdStr)
		return
	}
	if _, tracked := cache.Frequency[logicId]; !tracked {
		cache.Frequency[logicId] = make([]int64, 0, cache.Times)
	}
	if faultTime == 0 {
		faultTime = time.Now().UnixMilli()
	}
	history := append(cache.Frequency[logicId], faultTime)
	cache.Frequency[logicId] = history
	cache.LastFaultTime[logicId] = faultTime
	hwlog.RunLog.Infof("insert fault frequency success, event id: %s, logic id: %d, fault time: %d, "+
		"occurrence times :%d", eventIdStr, logicId, faultTime, len(history))
}
// insertFrequencyFaultRecover records the recover time of eventId on device logicId in
// the fault frequency cache. Events without a frequency configuration are ignored. A
// faultRecoverTime of 0 is replaced by the current time in milliseconds.
func insertFrequencyFaultRecover(logicId int32, eventId int64, faultRecoverTime int64) {
	faultFrequencyMapLock.Lock()
	defer faultFrequencyMapLock.Unlock()

	eventIdStr := strings.ToLower(strconv.FormatInt(eventId, Hex))
	cache, configured := faultFrequencyMap[eventIdStr]
	if !configured {
		hwlog.RunLog.Debugf("skip inserting event id %s to fault frequency cache, no config found", eventIdStr)
		return
	}
	if faultRecoverTime == 0 {
		faultRecoverTime = time.Now().UnixMilli()
	}
	cache.LastFaultRecoverTime[logicId] = faultRecoverTime
	occurrences := len(cache.Frequency[logicId])
	hwlog.RunLog.Infof("insert fault frequency success, event id: %s, logic id: %d, fault recover time: %d, "+
		"occurrence times :%d", eventIdStr, logicId, faultRecoverTime, occurrences)
}
// validateFaultFrequencyCustomization reports whether customization is usable. Checks run
// in this order and the first failure is logged as a warning:
//  1. EventId is not empty;
//  2. TimeWindow is within its limits;
//  3. ReleaseTimeWindow is within its limits (a value of 0 is first replaced by
//     MaxReleaseTimeWindow — this mutates *customization);
//  4. Times is within its limits;
//  5. only ManuallySeparateNPU may use a ReleaseTimeWindow equal to MaxReleaseTimeWindow;
//  6. FaultHandling is a member of FaultTypeSet.
func validateFaultFrequencyCustomization(customization *FaultFrequencyCustomization) bool {
	if len(customization.EventId) == 0 {
		hwlog.RunLog.Warnf("empty event id in this FaultFrequency, skip")
		return false
	}
	invalidMsg := "FaultFrequency configuration of this part will be invalid"

	if customization.TimeWindow < MinFaultFrequencyTimeWindow || customization.TimeWindow > MaxFaultFrequencyTimeWindow {
		hwlog.RunLog.Warnf("EventIDs: %v, TimeWindow(%d) in this FaultFrequency exceeds limit(%d~%d). %s",
			customization.EventId, customization.TimeWindow, MinFaultFrequencyTimeWindow, MaxFaultFrequencyTimeWindow,
			invalidMsg)
		return false
	}

	// An unset release window defaults to the maximum.
	if customization.ReleaseTimeWindow == 0 {
		customization.ReleaseTimeWindow = MaxReleaseTimeWindow
	}
	if customization.ReleaseTimeWindow < MinReleaseTimeWindow || customization.ReleaseTimeWindow > MaxReleaseTimeWindow {
		hwlog.RunLog.Warnf("EventIDs: %v, ReleaseTimeWindow(%d) in this FaultFrequency exceeds limit(%d~%d). %s",
			customization.EventId, customization.ReleaseTimeWindow, MinReleaseTimeWindow, MaxReleaseTimeWindow,
			invalidMsg)
		return false
	}

	if customization.Times < MinFaultFrequencyTimes || customization.Times > MaxFaultFrequencyTimes {
		hwlog.RunLog.Warnf("EventIDs: %v, Times(%d) in this FaultFrequency exceeds limit(%d~%d). %s",
			customization.EventId, customization.Times, MinFaultFrequencyTimes, MaxFaultFrequencyTimes, invalidMsg)
		return false
	}

	if customization.ReleaseTimeWindow == MaxReleaseTimeWindow && customization.FaultHandling != ManuallySeparateNPU {
		hwlog.RunLog.Warnf(
			"EventIDs: %v, FaultHandling(%s) in this FaultFrequency without ReleaseTimeWindow is not support. %s",
			customization.EventId, customization.FaultHandling, invalidMsg)
		return false
	}

	if FaultTypeSet.Has(customization.FaultHandling) {
		return true
	}
	hwlog.RunLog.Warnf("EventIDs: %v, FaultHandling(%s) in this FaultFrequency is unrecognized. "+
		"The supported range of FaultHandling in this FaultFrequency is %v. %s",
		customization.EventId, customization.FaultHandling, FaultTypeSet.List(), invalidMsg)
	return false
}
// validateFaultDurationCustomization reports whether faultDurationCustomization is
// usable. Checks run in this order and the first failure is logged as a warning:
// EventId is not empty, FaultTimeout is within its limits, RecoverTimeout is within its
// limits, FaultHandling is a member of FaultDurationTypeSet.
func validateFaultDurationCustomization(faultDurationCustomization FaultDurationCustomization) bool {
	eventIDs := faultDurationCustomization.EventId
	if len(eventIDs) == 0 {
		hwlog.RunLog.Warnf("empty event id in this FaultDuration, skip")
		return false
	}
	invalidMsg := "FaultDuration configuration of this part will be invalid"

	faultTimeout := faultDurationCustomization.FaultTimeout
	if faultTimeout < MinFaultTimeout || faultTimeout > MaxFaultTimeout {
		hwlog.RunLog.Warnf("EventIDs: %v, FaultTimeout(%d) in this FaultDuration exceeds limit(%d~%d). %s",
			eventIDs, faultTimeout,
			MinFaultTimeout, MaxFaultTimeout, invalidMsg)
		return false
	}

	recoverTimeout := faultDurationCustomization.RecoverTimeout
	if recoverTimeout < MinRecoverTimeout || recoverTimeout > MaxRecoverTimeout {
		hwlog.RunLog.Warnf("EventIDs: %v, RecoverTimeout(%d) in this FaultDuration exceeds limit(%d~%d). %s",
			eventIDs, recoverTimeout,
			MinRecoverTimeout, MaxRecoverTimeout, invalidMsg)
		return false
	}

	handling := faultDurationCustomization.FaultHandling
	if FaultDurationTypeSet.Has(handling) {
		return true
	}
	hwlog.RunLog.Warnf("EventIDs: %v, FaultHandling(%s) in this FaultDuration is unrecognized. "+
		"The supported range of FaultHandling in this FaultDuration is %v. %s", eventIDs,
		handling, FaultDurationTypeSet.List(), invalidMsg)
	return false
}
// GetNetworkFaultTypeByCode maps a list of network fault codes to a fault type.
// An empty list yields NormalNetwork. When the NotHandleFaultCodes and
// PreSeparateNPUNetworkCodes tables are both empty, the fault code file is loaded first;
// a load failure yields PreSeparateNPU. Otherwise the codes are checked against the
// SeparateNPU, PreSeparateNPU and NotHandleFault network tables in that order, and
// PreSeparateNPU is the fallback when none matches.
func GetNetworkFaultTypeByCode(faultCodes []int64) string {
	if len(faultCodes) == 0 {
		return NormalNetwork
	}
	tablesEmpty := len(faultTypeCode.NotHandleFaultCodes) == 0 &&
		len(faultTypeCode.PreSeparateNPUNetworkCodes) == 0
	if tablesEmpty {
		if err := LoadFaultCodeFromFile(); err != nil {
			return PreSeparateNPU
		}
	}
	if Int64Tool.SameElement(faultTypeCode.SeparateNPUNetworkCodes, faultCodes) {
		return SeparateNPU
	}
	if Int64Tool.SameElement(faultTypeCode.PreSeparateNPUNetworkCodes, faultCodes) {
		return PreSeparateNPU
	}
	if Int64Tool.SameElement(faultTypeCode.NotHandleFaultNetworkCodes, faultCodes) {
		return NotHandleFault
	}
	hwlog.RunLog.Debugf("not record fault code : %v, use default type PreSeparateNPU", faultCodes)
	return PreSeparateNPU
}
// GetFaultType returns the most serious chip fault type for a device.
// Codes contained in NetworkFaultCodes are dropped, then the candidates are gathered from
// the code tables, the fault-frequency cache, the fault-duration cache and the upgraded
// fault levels (all in ChipFaultMode) and reduced with getMostSeriousFaultType.
func GetFaultType(faultCodes []int64, logicId int32) string {
	chipCodes := make([]int64, 0)
	for _, code := range faultCodes {
		if NetworkFaultCodes.Has(code) {
			continue
		}
		chipCodes = append(chipCodes, code)
	}
	// Function calls in the argument list are evaluated left to right, matching the
	// code -> frequency -> duration order.
	candidates := append(make([]string, 0, len(FaultTypeSet)),
		GetFaultTypeByCode(chipCodes),
		GetFaultTypeFromFaultFrequency(logicId, ChipFaultMode),
		GetFaultTypeFromFaultDuration(logicId, ChipFaultMode))
	for _, upgraded := range GetUpgradeFaultLevelAndTime(logicId, ChipFaultMode) {
		candidates = append(candidates, upgraded.FaultLevel)
	}
	return getMostSeriousFaultType(candidates)
}
// GetNetworkFaultType returns the most serious network fault type for a device.
// Only codes contained in NetworkFaultCodes are kept, then the candidates are gathered
// from the network code tables, the fault-frequency cache, the fault-duration cache and
// the upgraded fault levels (all in NetworkFaultMode) and reduced with
// getMostSeriousFaultType.
func GetNetworkFaultType(faultCodes []int64, logicId int32) string {
	networkCodes := make([]int64, 0)
	for _, code := range faultCodes {
		if !NetworkFaultCodes.Has(code) {
			continue
		}
		networkCodes = append(networkCodes, code)
	}
	// Function calls in the argument list are evaluated left to right, matching the
	// code -> frequency -> duration order.
	candidates := append(make([]string, 0, len(FaultTypeSet)),
		GetNetworkFaultTypeByCode(networkCodes),
		GetFaultTypeFromFaultFrequency(logicId, NetworkFaultMode),
		GetFaultTypeFromFaultDuration(logicId, NetworkFaultMode))
	for _, upgraded := range GetUpgradeFaultLevelAndTime(logicId, NetworkFaultMode) {
		candidates = append(candidates, upgraded.FaultLevel)
	}
	return getMostSeriousFaultType(candidates)
}
// GetFaultTypeByCode maps a list of chip fault codes to a fault type.
// An empty list yields NormalNPU. Otherwise the codes are checked against the configured
// code tables from SeparateNPU down to SubHealthFault, and the first table that matches
// decides the result. Codes found in no table are classified by getFaultTypeBySeverity.
func GetFaultTypeByCode(faultCodes []int64) string {
	if len(faultCodes) == 0 {
		return NormalNPU
	}
	if Int64Tool.SameElement(faultTypeCode.SeparateNPUCodes, faultCodes) {
		return SeparateNPU
	}
	if Int64Tool.SameElement(faultTypeCode.PreSeparateNPUCodes, faultCodes) {
		return PreSeparateNPU
	}
	if Int64Tool.SameElement(faultTypeCode.RestartNPUCodes, faultCodes) {
		return RestartNPU
	}
	if Int64Tool.SameElement(faultTypeCode.FreeRestartNPUCodes, faultCodes) {
		return FreeRestartNPU
	}
	if Int64Tool.SameElement(faultTypeCode.RestartBusinessCodes, faultCodes) {
		return RestartBusiness
	}
	if Int64Tool.SameElement(faultTypeCode.RestartRequestCodes, faultCodes) {
		return RestartRequest
	}
	if Int64Tool.SameElement(faultTypeCode.NotHandleFaultCodes, faultCodes) {
		return NotHandleFault
	}
	if Int64Tool.SameElement(faultTypeCode.SubHealthFaultCodes, faultCodes) {
		return SubHealthFault
	}
	bySeverity := getFaultTypeBySeverity(faultCodes)
	hwlog.RunLog.Debugf("not record fault code: %v, get fault type by severity: %s", faultCodes, bySeverity)
	return bySeverity
}
// GetFaultTypeFromFaultFrequency returns the most serious fault type raised by the
// fault-frequency cache for the given device.
//
// mode must be ChipFaultMode or NetworkFaultMode; any other value yields NormalNPU.
// Every event id in faultFrequencyMap that belongs to the requested mode and has a record
// for logicId is evaluated by handleFrequencyFault, and the results of all events are
// accumulated before the most serious one is chosen. Event ids that cannot be parsed as
// hexadecimal are logged and skipped.
func GetFaultTypeFromFaultFrequency(logicId int32, mode string) string {
	if mode != ChipFaultMode && mode != NetworkFaultMode {
		return NormalNPU
	}
	faultFrequencyMapLock.Lock()
	defer faultFrequencyMapLock.Unlock()
	// Size the slice only after taking the lock so the map is never read unguarded.
	faultTypes := make([]string, 0, len(faultFrequencyMap))
	for eventId, frequencyCache := range faultFrequencyMap {
		num, err := strconv.ParseInt(eventId, Hex, 0)
		if err != nil {
			hwlog.RunLog.Errorf(parseHexFailedMsg, eventId)
			continue
		}
		isNetworkCode := NetworkFaultCodes.Has(num)
		if (mode == ChipFaultMode && isNetworkCode) || (mode == NetworkFaultMode && !isNetworkCode) {
			continue
		}
		if _, ok := frequencyCache.Frequency[logicId]; !ok {
			continue
		}
		// Accumulate instead of overwriting: map iteration order is random, so keeping only
		// the last event's result could hide a more serious fault from another event.
		faultTypes = append(faultTypes, handleFrequencyFault(logicId, frequencyCache, eventId)...)
	}
	return getMostSeriousFaultType(faultTypes)
}
// handleFrequencyFault evaluates one event's frequency cache for one device.
//
// Occurrence times older than the cache's time window are dropped from the front of the
// device's list. If the remaining count reaches frequencyCache.Times, the configured
// FaultHandling is returned (and recorded via SaveManuallyFaultInfo when it is
// ManuallySeparateNPU), the device is registered in recoverFaultFrequencyMap and the
// upgrade cache is updated. Otherwise the upgrade cache entry is either removed, when the
// last recovery is not older than the last fault and happened longer ago than
// ReleaseTimeWindow, or refreshed when CheckUpgradeFaultCache reports true.
// The returned slice is empty when the threshold is not reached.
func handleFrequencyFault(logicId int32, frequencyCache *FaultFrequencyCache, eventId string) []string {
	faultTypes := make([]string, 0)
	windowStart := time.Now().Unix() - frequencyCache.TimeWindow
	expired := 0
	for _, occurredAt := range frequencyCache.Frequency[logicId] {
		if occurredAt >= windowStart*SecondMagnification {
			break
		}
		hwlog.RunLog.Infof("delete the expired fault occurrence, event id: %s, logic id: %d, "+
			"time window start: %d, occurrence time: %d", eventId, logicId, windowStart, occurredAt)
		expired++
	}
	lastFaultTime := frequencyCache.LastFaultTime[logicId]
	lastRecoverTime := frequencyCache.LastFaultRecoverTime[logicId]
	frequencyCache.Frequency[logicId] = frequencyCache.Frequency[logicId][expired:]
	remaining := len(frequencyCache.Frequency[logicId])

	if int64(remaining) >= frequencyCache.Times {
		hwlog.RunLog.Infof("FaultFrequency detected, event id: %s, logic id: %d, fault occurred times: %d, "+
			"fault level: %s, faultTimes: %v", eventId, logicId, remaining, frequencyCache.FaultHandling,
			frequencyCache.Frequency[logicId])
		if frequencyCache.FaultHandling == ManuallySeparateNPU {
			hwlog.RunLog.Infof("detect ManuallySeparateNPU, logic id: %d", logicId)
			SaveManuallyFaultInfo(logicId)
		}
		faultTypes = append(faultTypes, frequencyCache.FaultHandling)
		recoverFaultFrequencyMap[logicId] = eventId
		InsertUpgradeFaultCache(LogicId(logicId), lastFaultTime, eventId,
			frequencyCache.FaultHandling, FrequencyUpgradeType)
		return faultTypes
	}

	released := lastRecoverTime >= lastFaultTime &&
		time.Now().UnixMilli()-lastRecoverTime > frequencyCache.ReleaseTimeWindow*SecondMagnification
	if released {
		RemoveTimeoutReasonCache(LogicId(logicId), CodeMatcher(eventId), TypeMatcher(FrequencyUpgradeType))
		return faultTypes
	}
	if CheckUpgradeFaultCache(LogicId(logicId), eventId, frequencyCache.FaultHandling, FrequencyUpgradeType) {
		InsertUpgradeFaultCache(LogicId(logicId), lastFaultTime, eventId,
			frequencyCache.FaultHandling, FrequencyUpgradeType)
		recoverFaultFrequencyMap[logicId] = eventId
	}
	return faultTypes
}
// GetFaultTypeFromFaultDuration returns the most serious fault type raised by the
// fault-duration cache for the given device.
//
// mode must be ChipFaultMode or NetworkFaultMode; any other value yields NormalNPU.
// For every event id of the requested mode that has duration data for logicId: when the
// data is in timeout status its FaultHandling becomes a candidate and the upgrade cache is
// updated; otherwise the upgrade cache entry is removed once the recover duration exceeds
// RecoverTimeout. The whole scan runs under faultDurationMapLock.
func GetFaultTypeFromFaultDuration(logicId int32, mode string) string {
	if mode != ChipFaultMode && mode != NetworkFaultMode {
		return NormalNPU
	}
	faultDurationMapLock.Lock()
	defer faultDurationMapLock.Unlock()
	candidates := make([]string, 0, len(faultDurationMap))
	for eventId, cache := range faultDurationMap {
		code, err := strconv.ParseInt(eventId, Hex, 0)
		if err != nil {
			hwlog.RunLog.Errorf(parseHexFailedMsg, eventId)
			continue
		}
		if (mode == ChipFaultMode && NetworkFaultCodes.Has(code)) ||
			(mode == NetworkFaultMode && !NetworkFaultCodes.Has(code)) {
			continue
		}
		data, found := cache.Duration[logicId]
		if !found {
			continue
		}
		if !data.TimeoutStatus {
			if data.FaultRecoverDurationTime > cache.RecoverTimeout*SecondMagnification {
				RemoveTimeoutReasonCache(LogicId(logicId), CodeMatcher(eventId), TypeMatcher(DurationUpgradeType))
			}
			continue
		}
		hwlog.RunLog.Debugf("FaultDuration detected, event id: %s, logic id: %d, "+
			"fault duration time: %.2f seconds, fault level: %s", eventId, logicId,
			float64(data.FaultDurationTime)/SecondMagnificationFloat, cache.FaultHandling)
		candidates = append(candidates, cache.FaultHandling)
		InsertUpgradeFaultCache(LogicId(logicId), data.FaultAlarmTime, eventId,
			cache.FaultHandling, DurationUpgradeType)
	}
	return getMostSeriousFaultType(candidates)
}
// getFaultTypeBySeverity classifies fault codes by the severity recorded in
// faultSeverityMap. It returns SeparateNPU as soon as a code has no recorded severity
// (after logging a warning) or a severity above FaultSeverityMinor, and NotHandleFault
// when every code is known and at most minor.
func getFaultTypeBySeverity(faultCodes []int64) string {
	for _, code := range faultCodes {
		severity, known := faultSeverityMap[code]
		switch {
		case !known:
			hwlog.RunLog.Warnf("detect unknown fault code and no match severity: %d", code)
			return SeparateNPU
		case severity > FaultSeverityMinor:
			return SeparateNPU
		}
	}
	return NotHandleFault
}
// getMostSeriousFaultType picks the highest-priority fault type present in fautTypes.
// Priority, from most to least serious, is the order of the list below; NormalNPU is
// returned when none of the listed types is present.
func getMostSeriousFaultType(fautTypes []string) string {
	present := sets.NewString(fautTypes...)
	priority := []string{
		ManuallySeparateNPU,
		SeparateNPU,
		PreSeparateNPU,
		RestartNPU,
		FreeRestartNPU,
		RestartBusiness,
		RestartRequest,
		SubHealthFault,
		NotHandleFault,
	}
	for _, level := range priority {
		if present.Has(level) {
			return level
		}
	}
	return NormalNPU
}
// SetDeviceInit records logicID in initLogicIDs under logicIDLock, unless it is already
// present.
func SetDeviceInit(logicID int32) {
	logicIDLock.Lock()
	defer logicIDLock.Unlock()
	if !Int32Tool.Contains(initLogicIDs, logicID) {
		initLogicIDs = append(initLogicIDs, logicID)
	}
}
func GetAndCleanLogicID() []int32 {
if len(initLogicIDs) == 0 {
return nil
}
logicIDLock.Lock()
oldInitLogicIDs := initLogicIDs
initLogicIDs = []int32{}
logicIDLock.Unlock()
return oldInitLogicIDs
}
// setAlarmRaisedTime keeps device.AlarmRaisedTime in step with device.FaultCodes: it is
// cleared when there are no fault codes and set to the current time in milliseconds when
// fault codes exist but no time has been recorded yet.
func setAlarmRaisedTime(device *NpuDevice) {
	switch {
	case len(device.FaultCodes) == 0:
		device.AlarmRaisedTime = 0
	case device.AlarmRaisedTime == 0:
		device.AlarmRaisedTime = time.Now().UnixMilli()
	}
}
// setNetworkAlarmRaisedTime keeps device.NetworkAlarmRaisedTime in step with
// device.NetworkFaultCodes: it is cleared when there are no network fault codes and set to
// the current time in milliseconds when codes exist but no time has been recorded yet.
func setNetworkAlarmRaisedTime(device *NpuDevice) {
	switch {
	case len(device.NetworkFaultCodes) == 0:
		device.NetworkAlarmRaisedTime = 0
	case device.NetworkAlarmRaisedTime == 0:
		device.NetworkAlarmRaisedTime = time.Now().UnixMilli()
	}
}
// SetNewFaultAndCacheOnceRecoverFault builds and runs the chip fault handling steps for
// one device. When faultDurationMap tracks HbmDoubleBitFaultCodeStr, the fault infos are
// first passed through newFaultInfosForHBMErr. The steps are the common pre steps, then
// the A950 or base steps depending on isA950CardType, then the common post steps, executed
// in that order. A nil device is logged and ignored.
func SetNewFaultAndCacheOnceRecoverFault(logicID int32, chipFaultInfos []common.DevFaultInfo, device *NpuDevice,
	curFaultCodesMap sets.Int64) {
	if device == nil {
		hwlog.RunLog.Error("param device is nil in SetNewFaultAndCacheOnceRecoverFault")
		return
	}
	faultInfos := chipFaultInfos
	if _, tracksHbm := faultDurationMap[HbmDoubleBitFaultCodeStr]; tracksHbm {
		faultInfos = newFaultInfosForHBMErr(logicID, faultInfos)
	}
	pipeline := getChipFaultPreSteps(logicID, faultInfos)
	switch {
	case isA950CardType():
		pipeline = append(pipeline, getA950ChipFaultSteps(logicID, faultInfos, curFaultCodesMap, device)...)
	default:
		pipeline = append(pipeline, getBaseChipFaultSteps(logicID, faultInfos, curFaultCodesMap, device)...)
	}
	pipeline = append(pipeline, getChipFaultPostSteps(device)...)
	for _, step := range pipeline {
		step.Do()
	}
}
// SetNetworkNewFaultAndCacheOnceRecoverFault builds and runs the parameter-plane fault
// handling steps for one device: the common pre steps, then the A950 or base steps
// depending on isA950CardType, then the common post steps, executed in that order.
// A nil device is logged and ignored.
func SetNetworkNewFaultAndCacheOnceRecoverFault(logicID int32, networkFaultInfos []common.DevFaultInfo, device *NpuDevice) {
	if device == nil {
		hwlog.RunLog.Error("param device is nil in SetNetworkNewFaultAndCacheOnceRecoverFault")
		return
	}
	pipeline := getParameterPlaneFaultPreSteps(logicID, networkFaultInfos)
	switch {
	case isA950CardType():
		pipeline = append(pipeline, getA950ParameterPlaneFaultSteps(logicID, networkFaultInfos, device)...)
	default:
		pipeline = append(pipeline, getBaseParameterPlaneFaultSteps(logicID, networkFaultInfos, device)...)
	}
	pipeline = append(pipeline, getParameterPlaneFaultPostSteps(device)...)
	for _, step := range pipeline {
		step.Do()
	}
}
// SetHyperPlaneNewFaultAndCacheOnceRecoverFault builds and runs the hyper-plane fault
// handling steps for one device: the common pre steps, followed by the A950 steps only
// when isA950CardType reports true. A nil device is logged and ignored.
func SetHyperPlaneNewFaultAndCacheOnceRecoverFault(logicID int32, hyperPlaneFaultInfos []common.DevFaultInfo, device *NpuDevice) {
	if device == nil {
		hwlog.RunLog.Error("param device is nil in SetHyperPlaneNewFaultAndCacheOnceRecoverFault")
		return
	}
	pipeline := getHyperPlaneFaultPreSteps(logicID, hyperPlaneFaultInfos)
	if isA950CardType() {
		a950Steps := getA950HyperPlaneFaultSteps(logicID, hyperPlaneFaultInfos, device)
		pipeline = append(pipeline, a950Steps...)
	}
	for _, step := range pipeline {
		step.Do()
	}
}
// SetHyperPlaneNewOverallFault builds and runs the overall hyper-plane fault steps for a
// group of devices: the common pre steps, followed by the A950 steps only when
// isA950CardType reports true. If any entry in devices is nil, the error is logged and
// nothing is run.
func SetHyperPlaneNewOverallFault(devices []*NpuDevice) {
	for i := range devices {
		if devices[i] == nil {
			hwlog.RunLog.Error("param device is nil in SetHyperPlaneNewOverallFault")
			return
		}
	}
	pipeline := getHyperPlaneOverallFaultPreSteps(devices)
	if isA950CardType() {
		a950Steps := getA950HyperPlaneNewOverallFaultSteps(devices)
		pipeline = append(pipeline, a950Steps...)
	}
	for _, step := range pipeline {
		step.Do()
	}
}
// baseChipFaultOccur applies every FaultOccur / FaultOnce event to the device: the event
// id is appended to device.FaultCodes, its fault time is recorded, and, unless the event
// id is tracked by faultDurationMap, the occurrence is fed to the frequency cache.
func baseChipFaultOccur(newFaultInfos []common.DevFaultInfo, device *NpuDevice) {
	for _, info := range newFaultInfos {
		if info.Assertion != common.FaultOccur && info.Assertion != common.FaultOnce {
			continue
		}
		device.FaultCodes = append(device.FaultCodes, info.EventID)
		updateDeviceFaultTimeMap(device, info, true)
		hexID := strings.ToLower(strconv.FormatInt(info.EventID, Hex))
		if _, isDurationFault := faultDurationMap[hexID]; isDurationFault {
			continue
		}
		insertFrequencyFaultOccur(device.LogicID, info.EventID, info.AlarmRaisedTime)
	}
}
// baseChipFaultRecover processes recover-side chip events for one device.
// A FaultRecover event is skipped (with an info log) when its id is still present in
// curFaultCodesMap, otherwise it is handed to handleNpuFaultRecover. A FaultOnce event id
// is queued in recoverFaultMap for the device.
func baseChipFaultRecover(
	logicID int32,
	newFaultInfos []common.DevFaultInfo,
	curFaultCodesMap sets.Int64,
	device *NpuDevice) {
	for _, info := range newFaultInfos {
		switch info.Assertion {
		case common.FaultRecover:
			if curFaultCodesMap.Has(info.EventID) {
				hwlog.RunLog.Infof("logicID(%d) curFaultCodesMap:%v contains fault code:%v, skip recover",
					logicID, curFaultCodesMap, info.EventID)
				continue
			}
			handleNpuFaultRecover(logicID, device, info)
		case common.FaultOnce:
			recoverFaultMap[logicID] = append(recoverFaultMap[logicID], info.EventID)
		}
	}
}
// a950ChipFaultOccur handles chip fault occurrences for A950 cards.
// It currently delegates to baseChipFaultOccur without any A950-specific processing.
func a950ChipFaultOccur(newFaultInfos []common.DevFaultInfo, device *NpuDevice) {
baseChipFaultOccur(newFaultInfos, device)
}
// a950ChipFaultRecover handles chip fault recoveries for A950 cards.
// It currently delegates to baseChipFaultRecover without any A950-specific processing.
func a950ChipFaultRecover(
logicID int32,
newFaultInfos []common.DevFaultInfo,
curFaultCodesMap sets.Int64,
device *NpuDevice) {
baseChipFaultRecover(logicID, newFaultInfos, curFaultCodesMap, device)
}
// handleNpuFaultRecover applies one recover event to a device.
// If the event id is not in device.FaultCodes, it is queued in recoverFaultMap instead.
// Otherwise the id is removed from device.FaultCodes, its fault time is dropped, and,
// unless the event id is tracked by faultDurationMap, the recovery is fed to the frequency
// cache.
func handleNpuFaultRecover(logicID int32, device *NpuDevice, faultInfo common.DevFaultInfo) {
	if Int64Tool.Index(device.FaultCodes, faultInfo.EventID) == -1 {
		recoverFaultMap[logicID] = append(recoverFaultMap[logicID], faultInfo.EventID)
		return
	}
	device.FaultCodes = Int64Tool.Remove(device.FaultCodes, faultInfo.EventID)
	updateDeviceFaultTimeMap(device, faultInfo, false)
	hexID := strings.ToLower(strconv.FormatInt(faultInfo.EventID, Hex))
	if _, isDurationFault := faultDurationMap[hexID]; isDurationFault {
		return
	}
	insertFrequencyFaultRecover(device.LogicID, faultInfo.EventID, faultInfo.AlarmRaisedTime)
}
// updateDeviceFaultTimeMap adds or removes the fault time of faultInfo.EventID in
// device.FaultTimeMap, creating the map on first use.
// When isAdd is true the time is faultInfo.AlarmRaisedTime, or the current time in
// milliseconds if that is zero; an existing entry is only replaced by an earlier time.
// When isAdd is false the entry is deleted.
func updateDeviceFaultTimeMap(device *NpuDevice, faultInfo common.DevFaultInfo, isAdd bool) {
	if device.FaultTimeMap == nil {
		device.FaultTimeMap = make(map[int64]int64)
	}
	eventID := faultInfo.EventID
	if !isAdd {
		hwlog.RunLog.Debugf("del logicId %d event %x fault time: %d",
			device.LogicID, eventID, device.FaultTimeMap[eventID])
		delete(device.FaultTimeMap, eventID)
		return
	}
	raisedAt := faultInfo.AlarmRaisedTime
	if raisedAt == 0 {
		raisedAt = time.Now().UnixMilli()
	}
	// Keep the earliest known time for this event.
	if recorded, exists := device.FaultTimeMap[eventID]; !exists || recorded > raisedAt {
		device.FaultTimeMap[eventID] = raisedAt
	}
	hwlog.RunLog.Debugf("add logicId %d event %x fault time: %d",
		device.LogicID, eventID, device.FaultTimeMap[eventID])
}
// newFaultInfosForHBMErr filters the fault infos through hbmTool.
// Non-recover events with HbmDoubleBitFaultCode update the HBM occur time. Events with
// AicBusFaultCode or AivBusFaultCode are enqueued in hbmTool and withheld from the result.
// The returned slice holds the remaining events followed by whatever
// hbmTool.aicFaultEventOutQue releases for logicID.
func newFaultInfosForHBMErr(logicID int32, faultInfos []common.DevFaultInfo) []common.DevFaultInfo {
	var kept []common.DevFaultInfo
	for _, info := range faultInfos {
		if info.EventID == HbmDoubleBitFaultCode && info.Assertion != common.FaultRecover {
			hbmTool.updateHbmOccurTime(info)
		}
		switch info.EventID {
		case AicBusFaultCode, AivBusFaultCode:
			hbmTool.aicFaultEventInQue(info)
		default:
			kept = append(kept, info)
		}
	}
	return append(kept, hbmTool.aicFaultEventOutQue(logicID)...)
}
// baseParameterPlaneFaultRecover processes recover-side network events for one device.
// A FaultRecover event whose id is not in device.NetworkFaultCodes is queued in
// recoverNetworkFaultMap; otherwise it is applied via handleNetworkFaultRecover.
// A FaultOnce event id is always queued in recoverNetworkFaultMap.
func baseParameterPlaneFaultRecover(logicID int32, faultInfos []common.DevFaultInfo, device *NpuDevice) {
	for _, info := range faultInfos {
		switch info.Assertion {
		case common.FaultRecover:
			if Int64Tool.Index(device.NetworkFaultCodes, info.EventID) != -1 {
				handleNetworkFaultRecover(device, info)
				continue
			}
			recoverNetworkFaultMap[logicID] = append(recoverNetworkFaultMap[logicID], info.EventID)
		case common.FaultOnce:
			recoverNetworkFaultMap[logicID] = append(recoverNetworkFaultMap[logicID], info.EventID)
		}
	}
}
// handleNetworkFaultRecover removes the recovered event id from device.NetworkFaultCodes,
// drops its fault time, and, unless the event id is tracked by faultDurationMap, feeds the
// recovery to the frequency cache.
func handleNetworkFaultRecover(device *NpuDevice, faultInfo common.DevFaultInfo) {
	device.NetworkFaultCodes = Int64Tool.Remove(device.NetworkFaultCodes, faultInfo.EventID)
	updateDeviceFaultTimeMap(device, faultInfo, false)
	hexID := strings.ToLower(strconv.FormatInt(faultInfo.EventID, Hex))
	if _, isDurationFault := faultDurationMap[hexID]; isDurationFault {
		return
	}
	insertFrequencyFaultRecover(device.LogicID, faultInfo.EventID, faultInfo.AlarmRaisedTime)
}
// a950ParameterPlaneFaultRecover processes recover-side parameter-plane events on A950.
// For a FaultRecover event the UB port list is cached (a failure is logged and the event
// skipped), the recovery is applied via handleA950NetworkFaultRecover, and, if
// getUBOEDownCnt still differs from common.PortNoDownCount, the same event is replayed as
// a FaultOccur through a950ParameterPlaneFaultOccur. A FaultOnce event id is queued in
// recoverNetworkFaultMap.
func a950ParameterPlaneFaultRecover(logicID int32, faultInfos []common.DevFaultInfo, device *NpuDevice) {
	for _, info := range faultInfos {
		switch info.Assertion {
		case common.FaultRecover:
			if err := cacheUBports(logicID, device); err != nil {
				hwlog.RunLog.Errorf("logicID(%d) cacheUBports failed, err: %v", logicID, err)
				continue
			}
			handleA950NetworkFaultRecover(logicID, device, info)
			if getUBOEDownCnt(device) == common.PortNoDownCount {
				continue
			}
			// Some ports are still down: re-raise the fault with the current down count.
			replay := info
			replay.Assertion = common.FaultOccur
			a950ParameterPlaneFaultOccur(logicID, []common.DevFaultInfo{replay}, device)
		case common.FaultOnce:
			recoverNetworkFaultMap[logicID] = append(recoverNetworkFaultMap[logicID], info.EventID)
		}
	}
}
// handleA950NetworkFaultRecover applies one parameter-plane recover event on A950.
// If the event id is not in device.NetworkFaultCodes it is queued in
// recoverNetworkFaultMap. Otherwise every precise fault code listed for the event in
// UBOEPreciseFaultCodesMap has its fault time dropped and is removed from
// device.NetworkFaultCodes when present. A missing map entry is logged as an error.
func handleA950NetworkFaultRecover(logicID int32, device *NpuDevice, faultInfo common.DevFaultInfo) {
	if Int64Tool.Index(device.NetworkFaultCodes, faultInfo.EventID) == -1 {
		recoverNetworkFaultMap[logicID] = append(recoverNetworkFaultMap[logicID], faultInfo.EventID)
		return
	}
	preciseCodes, found := UBOEPreciseFaultCodesMap[faultInfo.EventID]
	if !found {
		hwlog.RunLog.Errorf("logicID(%d) UBOEPreciseFaultCodesMap not found preciseFaultCode(%x)",
			logicID, faultInfo.EventID)
		return
	}
	for code := range preciseCodes {
		preciseInfo := faultInfo
		preciseInfo.EventID = code
		updateDeviceFaultTimeMap(device, preciseInfo, false)
		if !Int64Tool.Contains(device.NetworkFaultCodes, code) {
			continue
		}
		device.NetworkFaultCodes = Int64Tool.Remove(device.NetworkFaultCodes, code)
	}
}
// baseParameterPlaneFaultOccur applies every FaultOccur / FaultOnce network event to the
// device: the event id is appended to device.NetworkFaultCodes, its fault time is
// recorded, and, unless the event id is tracked by faultDurationMap, the occurrence is fed
// to the frequency cache.
func baseParameterPlaneFaultOccur(faultInfos []common.DevFaultInfo, device *NpuDevice) {
	for _, info := range faultInfos {
		if info.Assertion != common.FaultOccur && info.Assertion != common.FaultOnce {
			continue
		}
		device.NetworkFaultCodes = append(device.NetworkFaultCodes, info.EventID)
		updateDeviceFaultTimeMap(device, info, true)
		hexID := strings.ToLower(strconv.FormatInt(info.EventID, Hex))
		if _, isDurationFault := faultDurationMap[hexID]; isDurationFault {
			continue
		}
		insertFrequencyFaultOccur(device.LogicID, info.EventID, info.AlarmRaisedTime)
	}
}
// a950ParameterPlaneFaultOccur applies FaultOccur / FaultOnce parameter-plane events on
// A950. For each such event the UB port list is cached, and processing stops entirely when
// getUBOEDownCnt equals common.PortNoDownCount. Otherwise the precise fault code for the
// current card type and down count is looked up; both the precise code and the original
// event id get a fault time and are appended to device.NetworkFaultCodes. Lookup or cache
// failures are logged and the event is skipped.
func a950ParameterPlaneFaultOccur(logicID int32, faultInfos []common.DevFaultInfo, device *NpuDevice) {
	for _, info := range faultInfos {
		if info.Assertion != common.FaultOccur && info.Assertion != common.FaultOnce {
			continue
		}
		if err := cacheUBports(logicID, device); err != nil {
			hwlog.RunLog.Errorf("logicID(%d) cacheUBports failed, err: %v", logicID, err)
			continue
		}
		downCnt := getUBOEDownCnt(device)
		if downCnt == common.PortNoDownCount {
			return
		}
		codeByDownCnt, hasCardType := common.ParameterPlaneDownProtsNumToPreciseFaultCodeMap[ParamOption.RealCardType]
		if !hasCardType {
			hwlog.RunLog.Errorf("not found preciseFaultMap for device type: %s", ParamOption.RealCardType)
			continue
		}
		preciseCode, hasCode := codeByDownCnt[downCnt]
		if !hasCode {
			hwlog.RunLog.Errorf("not found preciseFaultCode for downCnt: %d", downCnt)
			continue
		}
		preciseInfo := info
		preciseInfo.EventID = preciseCode
		updateDeviceFaultTimeMap(device, preciseInfo, true)
		device.NetworkFaultCodes = append(device.NetworkFaultCodes, preciseCode)
		updateDeviceFaultTimeMap(device, info, true)
		device.NetworkFaultCodes = append(device.NetworkFaultCodes, info.EventID)
	}
}
// a950HyperPlaneFaultRecover processes recover-side hyper-plane events on A950.
// For a FaultRecover event the UB port list is cached (a failure is logged and the event
// skipped), the recovery is applied via handleA950HyperPlaneFaultRecover, and, if
// getUBDownCnt still differs from common.PortNoDownCount, the same event is replayed as a
// FaultOccur through a950HyperPlaneFaultOccur. A FaultOnce event id is queued in
// recoverFaultMap.
func a950HyperPlaneFaultRecover(logicID int32, hyperPlaneFaultInfos []common.DevFaultInfo, device *NpuDevice) {
	for _, info := range hyperPlaneFaultInfos {
		switch info.Assertion {
		case common.FaultRecover:
			if err := cacheUBports(logicID, device); err != nil {
				hwlog.RunLog.Errorf("logicID(%d) cacheUBports failed, err: %v", logicID, err)
				continue
			}
			handleA950HyperPlaneFaultRecover(logicID, device, info)
			if getUBDownCnt(device) == common.PortNoDownCount {
				continue
			}
			// Some ports are still down: re-raise the fault.
			replay := info
			replay.Assertion = common.FaultOccur
			a950HyperPlaneFaultOccur(logicID, []common.DevFaultInfo{replay}, device)
		case common.FaultOnce:
			recoverFaultMap[logicID] = append(recoverFaultMap[logicID], info.EventID)
		}
	}
}
// cacheUBports fills device.UBports from hccn.GetAllUBports when it has not been loaded
// yet (nil). An already populated list is left untouched.
// The returned error wraps the hccn error so callers can inspect it with errors.Is/As;
// the message text is unchanged.
func cacheUBports(logicID int32, device *NpuDevice) error {
	if device.UBports != nil {
		return nil
	}
	ports, err := hccn.GetAllUBports(logicID)
	if err != nil {
		return fmt.Errorf("logicID(%d) GetAllUBports failed, err: %w", logicID, err)
	}
	device.UBports = ports
	return nil
}
// getUBOEDownCnt counts the entries of device.UBports whose PortType is
// hccn.BondingPortName and whose LinkStatus is hccn.LinkDown.
func getUBOEDownCnt(device *NpuDevice) int {
	var down int
	for _, port := range device.UBports {
		if port.PortType != hccn.BondingPortName || port.LinkStatus != hccn.LinkDown {
			continue
		}
		down++
	}
	return down
}
// getUBDownCnt counts the entries of device.UBports whose PortType is hccn.UBPortName and
// whose LinkStatus is hccn.LinkDown.
func getUBDownCnt(device *NpuDevice) int {
	var down int
	for _, port := range device.UBports {
		if port.PortType != hccn.UBPortName || port.LinkStatus != hccn.LinkDown {
			continue
		}
		down++
	}
	return down
}
// handleA950HyperPlaneFaultRecover applies one hyper-plane recover event on A950.
// If the event id is not in device.FaultCodes it is queued in recoverFaultMap. Otherwise
// every precise fault code listed for the event in UBPreciseFaultCodesMap that is present
// in device.FaultCodes has its fault time dropped and is removed from the list. A missing
// map entry is logged as an error.
func handleA950HyperPlaneFaultRecover(logicID int32, device *NpuDevice, faultInfo common.DevFaultInfo) {
	if Int64Tool.Index(device.FaultCodes, faultInfo.EventID) == -1 {
		recoverFaultMap[logicID] = append(recoverFaultMap[logicID], faultInfo.EventID)
		return
	}
	preciseCodes, found := UBPreciseFaultCodesMap[faultInfo.EventID]
	if !found {
		hwlog.RunLog.Errorf("logicID(%d) UBPreciseFaultCodesMap not found preciseFaultCode(%x)",
			logicID, faultInfo.EventID)
		return
	}
	for code := range preciseCodes {
		if !Int64Tool.Contains(device.FaultCodes, code) {
			continue
		}
		preciseInfo := faultInfo
		preciseInfo.EventID = code
		updateDeviceFaultTimeMap(device, preciseInfo, false)
		device.FaultCodes = Int64Tool.Remove(device.FaultCodes, code)
	}
}
// a950HyperPlaneFaultOccur applies FaultOccur / FaultOnce hyper-plane events on A950.
// For each such event the UB port list is cached (a failure is logged and the event
// skipped), and processing stops entirely when getUBDownCnt equals
// common.PortNoDownCount. Otherwise both UBSeparateFaultCode and the original event id get
// a fault time and are appended to device.FaultCodes.
func a950HyperPlaneFaultOccur(logicID int32, hyperPlaneFaultInfos []common.DevFaultInfo, device *NpuDevice) {
	for _, info := range hyperPlaneFaultInfos {
		if info.Assertion != common.FaultOccur && info.Assertion != common.FaultOnce {
			continue
		}
		if err := cacheUBports(logicID, device); err != nil {
			hwlog.RunLog.Errorf("logicID(%d) cacheUBports failed, err: %v", logicID, err)
			continue
		}
		if getUBDownCnt(device) == common.PortNoDownCount {
			return
		}
		separateInfo := info
		separateInfo.EventID = UBSeparateFaultCode
		updateDeviceFaultTimeMap(device, separateInfo, true)
		device.FaultCodes = append(device.FaultCodes, UBSeparateFaultCode)
		updateDeviceFaultTimeMap(device, info, true)
		device.FaultCodes = append(device.FaultCodes, info.EventID)
	}
}
// a950HyperPlaneNewOverallFaultModify rewrites the hyper-plane fault codes of a device
// group when every device in it reports UBPortDownCode. In that case, on each device
// UBSeparateFaultCode is removed (together with its fault time) and UBSubHealFaultCode is
// added if not already present. If any device lacks UBPortDownCode, nothing is changed.
//
// The fault time recorded for UBSubHealFaultCode is in milliseconds, the same unit that
// updateDeviceFaultTimeMap uses for its own fallback timestamp.
func a950HyperPlaneNewOverallFaultModify(devices []*NpuDevice) {
	for _, device := range devices {
		if !Int64Tool.Contains(device.FaultCodes, UBPortDownCode) {
			// At least one device still has a working hyper plane; leave the codes as they are.
			return
		}
	}
	curTime := time.Now().UnixMilli()
	for _, device := range devices {
		if Int64Tool.Contains(device.FaultCodes, UBSeparateFaultCode) {
			device.FaultCodes = Int64Tool.Remove(device.FaultCodes, UBSeparateFaultCode)
			separateInfo := common.DevFaultInfo{
				EventID:         UBSeparateFaultCode,
				AlarmRaisedTime: curTime,
			}
			updateDeviceFaultTimeMap(device, separateInfo, false)
		}
		if !Int64Tool.Contains(device.FaultCodes, UBSubHealFaultCode) {
			subHealthInfo := common.DevFaultInfo{
				EventID:         UBSubHealFaultCode,
				AlarmRaisedTime: curTime,
			}
			updateDeviceFaultTimeMap(device, subHealthInfo, true)
			device.FaultCodes = append(device.FaultCodes, UBSubHealFaultCode)
		}
	}
}
// DelOnceRecoverFault flushes the queued recover events into every device of groupDevice.
// Ids queued in recoverFaultMap are removed from device.FaultCodes and ids queued in
// recoverNetworkFaultMap from device.NetworkFaultCodes, each together with its fault time;
// the alarm raised times are then refreshed. Both queues are replaced by fresh empty maps
// afterwards.
func DelOnceRecoverFault(groupDevice map[string][]*NpuDevice) {
	for _, group := range groupDevice {
		for _, dev := range group {
			for _, code := range recoverFaultMap[dev.LogicID] {
				dev.FaultCodes = Int64Tool.Remove(dev.FaultCodes, code)
				delOnceRecoverFaultTime(dev, code)
			}
			setAlarmRaisedTime(dev)
			for _, code := range recoverNetworkFaultMap[dev.LogicID] {
				dev.NetworkFaultCodes = Int64Tool.Remove(dev.NetworkFaultCodes, code)
				delOnceRecoverFaultTime(dev, code)
			}
			setNetworkAlarmRaisedTime(dev)
		}
	}
	recoverFaultMap = make(map[int32][]int64, GeneralMapSize)
	recoverNetworkFaultMap = make(map[int32][]int64, GeneralMapSize)
}
// ClearUBportsInfo resets UBports to nil on every device in groupDevice, so that the next
// cacheUBports call queries the port list again.
func ClearUBportsInfo(groupDevice map[string][]*NpuDevice) {
	for _, group := range groupDevice {
		for _, dev := range group {
			dev.UBports = nil
		}
	}
}
// delOnceRecoverFaultTime logs and deletes the fault time recorded for eventId in
// device.FaultTimeMap.
func delOnceRecoverFaultTime(device *NpuDevice, eventId int64) {
	hwlog.RunLog.Debugf("delete fault %s with time: %d",
		strings.ToUpper(strconv.FormatInt(eventId, Hex)), device.FaultTimeMap[eventId])
	delete(device.FaultTimeMap, eventId)
}
// DelOnceFrequencyFault clears the frequency records of every (device, event) pair queued
// in recoverFaultFrequencyMap and then resets that queue.
//
// An event id that is no longer present in faultFrequencyMap is logged and skipped; the
// remaining entries are still processed and the queue is still reset, so a single stale
// entry cannot block the cleanup of the others or linger in the queue.
func DelOnceFrequencyFault() {
	for logicId, eventId := range recoverFaultFrequencyMap {
		frequencyCache, ok := faultFrequencyMap[eventId]
		if !ok {
			hwlog.RunLog.Warnf("eventId %v is not exist in faultFrequencyMap %v", eventId, faultFrequencyMap)
			continue
		}
		frequencyCache.Frequency[logicId] = make([]int64, 0, frequencyCache.Times)
		hwlog.RunLog.Infof("logic id %v frequency cache is successfully cleared", logicId)
	}
	recoverFaultFrequencyMap = make(map[int32]string, GeneralMapSize)
}
// DoSaveDevFaultInfo stores one fault event reported for a device.
//
// When the rate limiter rejects the event it is discarded and the device is marked for
// re-initialisation via SetDeviceInit. Otherwise TriggerUpdate is called on return; events
// with EventID 0 are ignored, ResetFinishFaultCode marks the device for re-initialisation,
// and any other event has its severity recorded in faultSeverityMap and is appended to
// devFaultInfoMap under devFaultInfoMapLock. A FaultRecover event is delayed by one second
// before being stored when enableDelay is true.
func DoSaveDevFaultInfo(devFaultInfo common.DevFaultInfo, enableDelay bool) {
	logicID := devFaultInfo.LogicID
	if !limiter.Allow() {
		hwlog.RunLog.Warnf("fault callback rate limit overflowed, current fault: %#v will be discard", devFaultInfo)
		hwlog.RunLog.Warnf("will set current device: %v into init status", logicID)
		SetDeviceInit(logicID)
		return
	}
	defer func() {
		TriggerUpdate("A fault has occurred")
	}()
	hwlog.RunLog.Infof("receive devFaultInfo: %#v, hex code: %v", devFaultInfo,
		strconv.FormatInt(devFaultInfo.EventID, Hex))
	switch devFaultInfo.EventID {
	case 0:
		return
	case ResetFinishFaultCode:
		SetDeviceInit(logicID)
		return
	}
	faultSeverityMap[devFaultInfo.EventID] = devFaultInfo.Severity
	if enableDelay && devFaultInfo.Assertion == common.FaultRecover {
		hwlog.RunLog.Debugf("save recover fault info should delay 1s")
		time.Sleep(time.Second)
	}
	devFaultInfoMapLock.Lock()
	defer devFaultInfoMapLock.Unlock()
	devFaultInfoMap[logicID] = append(devFaultInfoMap[logicID], devFaultInfo)
}
// SaveDevFaultInfo stores a fault event asynchronously: it starts a goroutine running
// DoSaveDevFaultInfo with the recover delay enabled and returns immediately.
func SaveDevFaultInfo(devFaultInfo common.DevFaultInfo) {
go DoSaveDevFaultInfo(devFaultInfo, true)
}
// GetAndCleanFaultInfo returns the fault events collected in devFaultInfoMap and replaces
// the collection with a fresh empty map. It returns an empty, non-nil map when nothing has
// been collected.
//
// The emptiness check is performed while holding devFaultInfoMapLock, so it cannot race
// with DoSaveDevFaultInfo goroutines appending to the map.
func GetAndCleanFaultInfo() map[int32][]common.DevFaultInfo {
	devFaultInfoMapLock.Lock()
	defer devFaultInfoMapLock.Unlock()
	if len(devFaultInfoMap) == 0 {
		return map[int32][]common.DevFaultInfo{}
	}
	collected := devFaultInfoMap
	devFaultInfoMap = make(map[int32][]common.DevFaultInfo, GeneralMapSize)
	return collected
}
// SaveManuallyFaultInfo records logicID in manuallySeparateNpuMap as a manually separated
// NPU, with FirstHandle set and the current time in milliseconds as RecordTime.
// A logicID outside [MinLogicID, MaxLogicID] is logged and ignored.
func SaveManuallyFaultInfo(logicID int32) {
	if logicID < MinLogicID || logicID > MaxLogicID {
		hwlog.RunLog.Warnf("logic id %d is not valid, logic id must be in [0, 15]", logicID)
		return
	}
	record := ManuallyFaultInfo{
		LogicID:     logicID,
		FirstHandle: true,
		RecordTime:  time.Now().UnixMilli(),
	}
	manuallySeparateNpuMapLock.Lock()
	defer manuallySeparateNpuMapLock.Unlock()
	manuallySeparateNpuMap[logicID] = record
	hwlog.RunLog.Debugf("received manually fault info, manually separate npu logic id: %d, first handle: %v, "+
		"manually separate device cache is: %v", record.LogicID, record.FirstHandle, manuallySeparateNpuMap)
}
// QueryManuallyFaultInfoByLogicID reports whether logicID has an entry in
// manuallySeparateNpuMap. A logicID outside [MinLogicID, MaxLogicID] is logged and yields
// false.
func QueryManuallyFaultInfoByLogicID(logicID int32) bool {
	if logicID < MinLogicID || logicID > MaxLogicID {
		hwlog.RunLog.Warnf("logic id %d is invalid, logic id must be in [0, 15]", logicID)
		return false
	}
	manuallySeparateNpuMapLock.Lock()
	defer manuallySeparateNpuMapLock.Unlock()
	_, recorded := manuallySeparateNpuMap[logicID]
	return recorded
}
// QueryManuallyFaultNPULogicIDsByHandleStatus lists the logic ids in
// manuallySeparateNpuMap that match handleStatus: entries with FirstHandle set for
// ManuallySeparateNpuFirstHandle, entries with FirstHandle cleared for
// ManuallySeparateNpuHandled, and all entries for ManuallySeparateNpuAll.
// Any other handleStatus is logged and yields an empty slice. The order of the returned
// ids follows map iteration and is therefore unspecified.
func QueryManuallyFaultNPULogicIDsByHandleStatus(handleStatus string) []int32 {
	logicIDs := make([]int32, 0, GeneralMapSize)
	wantFirst := handleStatus == ManuallySeparateNpuFirstHandle
	wantHandled := handleStatus == ManuallySeparateNpuHandled
	wantAll := handleStatus == ManuallySeparateNpuAll
	if !wantFirst && !wantHandled && !wantAll {
		hwlog.RunLog.Warnf("manually fault npu handle status %v is invalid, it must be in [%v,%v,%v]", handleStatus,
			ManuallySeparateNpuFirstHandle, ManuallySeparateNpuHandled, ManuallySeparateNpuAll)
		return logicIDs
	}
	manuallySeparateNpuMapLock.Lock()
	defer manuallySeparateNpuMapLock.Unlock()
	for _, info := range manuallySeparateNpuMap {
		// The first-handle filter takes precedence over the handled filter.
		if wantFirst && !info.FirstHandle {
			continue
		}
		if !wantFirst && wantHandled && info.FirstHandle {
			continue
		}
		logicIDs = append(logicIDs, info.LogicID)
	}
	return logicIDs
}
// SetManuallyFaultNPUHandled clears the FirstHandle flag on every entry of
// manuallySeparateNpuMap, under manuallySeparateNpuMapLock.
func SetManuallyFaultNPUHandled() {
	manuallySeparateNpuMapLock.Lock()
	defer manuallySeparateNpuMapLock.Unlock()
	for id := range manuallySeparateNpuMap {
		// Map values are copies, so the updated entry has to be written back.
		entry := manuallySeparateNpuMap[id]
		entry.FirstHandle = false
		manuallySeparateNpuMap[id] = entry
	}
}
// DeleteManuallyFaultInfo removes the entry for logicID from manuallySeparateNpuMap and
// logs the removal; a missing entry is only noted at debug level.
// A logicID outside [MinLogicID, MaxLogicID] is logged and ignored.
func DeleteManuallyFaultInfo(logicID int32) {
	if logicID < MinLogicID || logicID > MaxLogicID {
		hwlog.RunLog.Warnf("logic id %d not valid, must be in [0, 15]", logicID)
		return
	}
	manuallySeparateNpuMapLock.Lock()
	defer manuallySeparateNpuMapLock.Unlock()
	removed, exists := manuallySeparateNpuMap[logicID]
	if !exists {
		hwlog.RunLog.Debugf("device logic id %v manually fault info not exist, no need to remove", logicID)
		return
	}
	delete(manuallySeparateNpuMap, logicID)
	hwlog.RunLog.Infof("device logic id %v, manually fault info %v has been removed, manually separate device "+
		"cache: %v", logicID, removed, manuallySeparateNpuMap)
}
// CountFaultDuration updates the fault-duration state of one device from the newly
// received fault events. Under faultDurationMapLock it first collects the device's events
// into the per-event queues, then, for every event id in faultDurationMap, sorts, cleans
// and handles the device's queue. A nil device is ignored.
func CountFaultDuration(device *NpuDevice, devFaultInfoMap map[int32][]common.DevFaultInfo) {
	if device == nil {
		return
	}
	faultDurationMapLock.Lock()
	defer faultDurationMapLock.Unlock()
	logicID := device.LogicID
	collectEachFaultEvent(logicID, devFaultInfoMap[logicID])
	for eventId := range faultDurationMap {
		sortFaultEventsInAscendingOrder(logicID, eventId)
		cleanFaultQueue(logicID, eventId)
		handleFaultQueue(logicID, eventId)
	}
}
// collectEachFaultEvent appends every fault event whose id is tracked by faultDurationMap
// to that event's queue for logicId, creating the Duration map and the per-device entry on
// first use. Events with untracked ids are skipped.
func collectEachFaultEvent(logicId int32, faultInfos []common.DevFaultInfo) {
	for _, info := range faultInfos {
		hexID := strings.ToLower(strconv.FormatInt(info.EventID, Hex))
		cache, tracked := faultDurationMap[hexID]
		if !tracked {
			continue
		}
		if cache.Duration == nil {
			cache.Duration = make(map[int32]FaultDurationData, GeneralMapSize)
		}
		data, exists := cache.Duration[logicId]
		if !exists {
			data = FaultDurationData{
				FaultEventQueue: []common.DevFaultInfo{},
			}
		}
		data.FaultEventQueue = append(data.FaultEventQueue, info)
		cache.Duration[logicId] = data
	}
}
// sortFaultEventsInAscendingOrder sorts, in place, the fault event queue kept for
// (eventId, logicID) using the DevFaultInfoBasedTimeAscend ordering. It does nothing when
// the event id or the device has no entry in faultDurationMap.
func sortFaultEventsInAscendingOrder(logicID int32, eventId string) {
	cache, tracked := faultDurationMap[eventId]
	if !tracked {
		return
	}
	data, exists := cache.Duration[logicID]
	if !exists {
		return
	}
	sort.Sort(DevFaultInfoBasedTimeAscend(data.FaultEventQueue))
}
// cleanFaultQueue normalises the fault event queue kept for (eventId, logicID): runs of
// consecutive events with the same assertion are merged, then the leading event is dropped
// when it is inconsistent with the current timeout status. The cleaned data is written
// back to faultDurationMap. It does nothing when the event id or the device has no entry.
func cleanFaultQueue(logicID int32, eventId string) {
	cache, tracked := faultDurationMap[eventId]
	if !tracked {
		return
	}
	data, exists := cache.Duration[logicID]
	if !exists {
		return
	}
	mergeContinuousElementBasedAssertion(&data.FaultEventQueue)
	clearFirstEventBasedOnFaultStatus(&data)
	cache.Duration[logicID] = data
	hwlog.RunLog.Debugf("NPU logic id: %d, %s fault timeout status: %v, fault queue after sort and merge: %v",
		logicID, eventId, data.TimeoutStatus, data.FaultEventQueue)
}
// mergeContinuousElementBasedAssertion collapses every run of consecutive events that share the
// same Assertion into the first event of that run, and stores the result through devFaultInfo.
// A nil pointer or an empty queue is left untouched. The merged queue is a newly built slice.
func mergeContinuousElementBasedAssertion(devFaultInfo *[]common.DevFaultInfo) {
	if devFaultInfo == nil {
		return
	}
	events := *devFaultInfo
	if len(events) == 0 {
		return
	}
	merged := []common.DevFaultInfo{events[0]}
	for _, event := range events[1:] {
		// keep an event only when its assertion differs from the last kept one
		if event.Assertion != merged[len(merged)-1].Assertion {
			merged = append(merged, event)
		}
	}
	*devFaultInfo = merged
}
// clearFirstEventBasedOnFaultStatus drops the first queued event when it repeats the state the
// record is already in: a leading recover event while TimeoutStatus is false, or a leading
// occur event while TimeoutStatus is true. At most one event is removed.
func clearFirstEventBasedOnFaultStatus(faultDurationData *FaultDurationData) {
	queue := faultDurationData.FaultEventQueue
	if len(queue) == 0 {
		return
	}
	timedOut := faultDurationData.TimeoutStatus
	first := queue[0].Assertion
	redundantRecover := !timedOut && first == common.FaultRecover
	redundantOccur := timedOut && first == common.FaultOccur
	if redundantRecover || redundantOccur {
		faultDurationData.FaultEventQueue = queue[1:]
	}
}
// handleFaultQueue runs the timeout / recovery judgment on the fault event queue of the given
// event id and NPU until timeoutOrRecoveryAlgorithm reports completion.
// Each round judges the opposite of the currently stored TimeoutStatus. When the status after
// the judgment differs from the one before it, the change is forwarded to the frequency
// bookkeeping (insertFrequencyFaultOccur / insertFrequencyFaultRecover) and logged at info level.
// Nothing happens when the record does not exist or its queue is empty.
func handleFaultQueue(logicID int32, eventId string) {
	cache, ok := faultDurationMap[eventId]
	if !ok {
		return
	}
	data, ok := cache.Duration[logicID]
	if !ok {
		return
	}
	if len(data.FaultEventQueue) == 0 {
		hwlog.RunLog.Debugf("NPU logic id: %v, %v fault queue is empty, no need to handle fault queue",
			logicID, eventId)
		return
	}
	statusBefore := data.TimeoutStatus
	for {
		// the algorithm updates the stored record, so re-read it before every round
		data = cache.Duration[logicID]
		if timeoutOrRecoveryAlgorithm(logicID, eventId, !data.TimeoutStatus) {
			break
		}
	}
	data = cache.Duration[logicID]
	hwlog.RunLog.Debugf("NPU logic id: %v, after timeout or recovery algorithm handling, %v fault timeout "+
		"status is %v, fault duration time is %.2f seconds, fault recover duration time is %.2f seconds, "+
		"fault queue is %v", logicID, eventId, data.TimeoutStatus,
		float64(data.FaultDurationTime)/SecondMagnificationFloat,
		float64(data.FaultRecoverDurationTime)/SecondMagnificationFloat,
		data.FaultEventQueue)
	code, err := strconv.ParseInt(eventId, Hex, 0)
	if err != nil {
		hwlog.RunLog.Errorf(parseHexFailedMsg, eventId)
		return
	}
	if statusBefore == data.TimeoutStatus {
		// status unchanged: nothing to report
		return
	}
	var changedDuration int64
	if data.TimeoutStatus {
		// false -> true: the fault has timed out
		insertFrequencyFaultOccur(logicID, code, data.FaultAlarmTime)
		changedDuration = data.FaultDurationTime
	} else {
		// true -> false: the fault has recovered
		insertFrequencyFaultRecover(logicID, code, data.FaultAlarmTime)
		changedDuration = data.FaultRecoverDurationTime
	}
	hwlog.RunLog.Infof("NPU logic id: %v, after timeout or recovery algorithm handling, %v fault timeout "+
		"status change, now fault timeout status set %v, duration time is %.2f seconds",
		logicID, eventId, data.TimeoutStatus, float64(changedDuration)/SecondMagnificationFloat)
}
// timeoutOrRecoveryAlgorithm performs one round of timeout judgment (timeoutStatus == true) or
// recovery judgment (timeoutStatus == false) on the fault event queue of the given event id and NPU.
//
// The queue is walked in pairs (elements 2k and 2k+1). The first pair whose time difference is
// larger than the threshold is handed to handleTimeoutCondition. If no pair exceeds the
// threshold and one unpaired event is left at the end, the time from that event until now is
// judged instead. Otherwise handleNotTimeoutCondition is called.
// AlarmRaisedTime values are compared against time.Now().UnixMilli(), the threshold is scaled by
// SecondMagnification before the comparison.
//
// The return value tells the caller whether the judgment loop is finished (true) or has to be
// run again (false); an empty queue returns true immediately.
func timeoutOrRecoveryAlgorithm(logicID int32, eventId string, timeoutStatus bool) bool {
	const (
		exceededMsg = "NPU logic id: %v, in %v judgment, %v duration is %.2f seconds > %v seconds, %v fault " +
			"timeout status set %v"
		withinMsg = "NPU logic id: %v, in %v judgment, %v duration is %.2f seconds <= %v seconds, %v " +
			"fault timeout status %v doesn't need to change, continue to perform %v judgment"
	)
	process := getProcessInFaultDuration(timeoutStatus)
	data := faultDurationMap[eventId].Duration[logicID]
	queue := data.FaultEventQueue
	queueLen := len(queue)
	if queueLen == 0 {
		hwlog.RunLog.Debugf("NPU logic id: %v, %v fault queue is empty, no need to do %v judgment", logicID,
			eventId, process)
		return true
	}
	threshold := getTimeoutThreshold(eventId, timeoutStatus)
	limit := threshold * SecondMagnification

	var elapsed int64
	pairCount := queueLen / halfDivisor
	for pair := 0; pair < pairCount; pair++ {
		pairStart := queue[pair*halfDivisor].AlarmRaisedTime
		pairEnd := queue[pair*halfDivisor+1].AlarmRaisedTime
		elapsed = pairEnd - pairStart
		if elapsed > limit {
			hwlog.RunLog.Debugf(exceededMsg, logicID, process, process, float64(elapsed)/SecondMagnificationFloat,
				threshold, eventId, timeoutStatus)
			return handleTimeoutCondition(handleDurationInputPara{logicID: logicID, eventId: eventId,
				index: pair, timeoutStatus: timeoutStatus, duration: elapsed, faultAlarmTime: pairStart + limit})
		}
	}

	if queueLen%halfDivisor == 0 {
		// every event is paired and no pair exceeded the threshold; elapsed still holds the
		// duration of the last pair
		hwlog.RunLog.Debugf(withinMsg, logicID, process, process, float64(elapsed)/SecondMagnificationFloat,
			threshold, eventId, data.TimeoutStatus, process)
		return handleNotTimeoutCondition(handleDurationInputPara{logicID: logicID, eventId: eventId,
			index: pairCount, timeoutStatus: timeoutStatus, duration: elapsed})
	}

	// one unpaired event remains: judge the time elapsed since it was raised
	lastAlarmTime := queue[pairCount*halfDivisor].AlarmRaisedTime
	elapsed = time.Now().UnixMilli() - lastAlarmTime
	if elapsed > limit {
		hwlog.RunLog.Debugf(exceededMsg, logicID, process, process, float64(elapsed)/SecondMagnificationFloat,
			threshold, eventId, timeoutStatus)
		return handleTimeoutCondition(handleDurationInputPara{logicID: logicID, eventId: eventId,
			index: pairCount, timeoutStatus: timeoutStatus, duration: elapsed, faultAlarmTime: lastAlarmTime + limit})
	}
	hwlog.RunLog.Debugf(withinMsg, logicID, process, process, float64(elapsed)/
		SecondMagnificationFloat, threshold, eventId, data.TimeoutStatus, process)
	return handleNotTimeoutCondition(handleDurationInputPara{logicID: logicID, eventId: eventId,
		index: pairCount, timeoutStatus: timeoutStatus, duration: elapsed})
}
func getProcessInFaultDuration(timeoutStatus bool) string {
if timeoutStatus {
return TimeoutProcess
}
return TimeoutRecoverProcess
}
// getTimeoutThreshold returns the threshold configured for the given event id:
// FaultTimeout when timeoutStatus is true, RecoverTimeout otherwise.
// MinFaultTimeout is returned when the event id has no entry in faultDurationMap.
func getTimeoutThreshold(eventId string, timeoutStatus bool) int64 {
	cache, ok := faultDurationMap[eventId]
	if !ok {
		return MinFaultTimeout
	}
	if timeoutStatus {
		return cache.FaultDuration.FaultTimeout
	}
	return cache.FaultDuration.RecoverTimeout
}
// handleTimeoutCondition records that the judged duration exceeded its threshold.
// The stored TimeoutStatus becomes inputPara.timeoutStatus and FaultAlarmTime is updated.
//   - timeout judgment (timeoutStatus == true): FaultDurationTime is set, the queue is kept
//     as it is, and true is returned.
//   - recovery judgment (timeoutStatus == false): FaultRecoverDurationTime is set, the queue is
//     cut so that it starts at element 2*index+1, and false is returned.
//
// The updated record is written back to faultDurationMap and the queue is logged at info level.
func handleTimeoutCondition(inputPara handleDurationInputPara) bool {
	data := faultDurationMap[inputPara.eventId].Duration[inputPara.logicID]
	data.TimeoutStatus = inputPara.timeoutStatus
	data.FaultAlarmTime = inputPara.faultAlarmTime
	if inputPara.timeoutStatus {
		data.FaultDurationTime = inputPara.duration
	} else {
		data.FaultRecoverDurationTime = inputPara.duration
		data.FaultEventQueue = data.FaultEventQueue[halfDivisor*inputPara.index+1:]
	}
	faultDurationMap[inputPara.eventId].Duration[inputPara.logicID] = data
	hwlog.RunLog.Infof("NPU logic id: %v, %v fault queue: %v", inputPara.logicID, inputPara.eventId,
		data.FaultEventQueue)
	return inputPara.timeoutStatus
}
// handleNotTimeoutCondition records that no judged duration exceeded its threshold.
// The measured duration is stored in FaultDurationTime (timeout judgment) or
// FaultRecoverDurationTime (recovery judgment), the queue is cut so that it starts at element
// 2*index, and the record is written back to faultDurationMap. TimeoutStatus is not modified.
// It always returns true.
func handleNotTimeoutCondition(inputPara handleDurationInputPara) bool {
	data := faultDurationMap[inputPara.eventId].Duration[inputPara.logicID]
	// drop the pairs that have already been judged
	data.FaultEventQueue = data.FaultEventQueue[halfDivisor*inputPara.index:]
	if !inputPara.timeoutStatus {
		data.FaultRecoverDurationTime = inputPara.duration
	} else {
		data.FaultDurationTime = inputPara.duration
	}
	faultDurationMap[inputPara.eventId].Duration[inputPara.logicID] = data
	hwlog.RunLog.Debugf("NPU logic id: %v, %v fault queue: %v", inputPara.logicID, inputPara.eventId,
		data.FaultEventQueue)
	return true
}
// GetFaultAssertionName maps a fault assertion value to its display name.
// An empty string is returned for any value other than FaultRecover, FaultOccur and FaultOnce.
func GetFaultAssertionName(assertion int8) string {
	var name string
	switch assertion {
	case common.FaultRecover:
		name = AssertionRecovery
	case common.FaultOccur:
		name = AssertionOccur
	case common.FaultOnce:
		name = AssertionNotice
	}
	return name
}
// GetChangedDevFaultInfo compares the old and new error code lists of a device and builds one
// fault event per difference: a FaultOccur event for every code found only in newErrCodes,
// followed by a FaultRecover event for every code found only in oldErrCodes.
// Each event carries the device logic id and the current host time in milliseconds.
// An empty, non-nil slice is returned when device is nil or nothing changed.
func GetChangedDevFaultInfo(device *NpuDevice, oldErrCodes []int64, newErrCodes []int64) []common.DevFaultInfo {
	changes := make([]common.DevFaultInfo, 0, len(newErrCodes))
	if device == nil {
		return changes
	}
	// codes that newly appeared
	for _, code := range newErrCodes {
		if Int64Tool.Index(oldErrCodes, code) != -1 {
			continue
		}
		changes = append(changes, common.DevFaultInfo{
			EventID:         code,
			LogicID:         device.LogicID,
			Assertion:       common.FaultOccur,
			AlarmRaisedTime: time.Now().UnixMilli(),
		})
	}
	// codes that disappeared
	for _, code := range oldErrCodes {
		if Int64Tool.Index(newErrCodes, code) != -1 {
			continue
		}
		changes = append(changes, common.DevFaultInfo{
			EventID:         code,
			LogicID:         device.LogicID,
			Assertion:       common.FaultRecover,
			AlarmRaisedTime: time.Now().UnixMilli(),
		})
	}
	return changes
}
// CheckErrorMessage reports whether err is non-nil and its message contains target.
func CheckErrorMessage(err error, target string) bool {
	if err == nil {
		return false
	}
	return strings.Contains(err.Error(), target)
}
// GetTimeoutFaultLevelAndCodes returns, for the given NPU, every fault code whose duration
// record currently has TimeoutStatus set, together with its FaultAlarmTime and FaultHandling level.
// mode selects which codes are considered: ChipFaultMode keeps codes that are not in
// NetworkFaultCodes, NetworkFaultMode keeps codes that are. Any other mode yields an empty map.
// faultDurationMapLock is held while faultDurationMap is scanned.
func GetTimeoutFaultLevelAndCodes(mode string, logicId int32) map[int64]FaultTimeAndLevel {
	timeoutFaults := make(map[int64]FaultTimeAndLevel)
	wantNetwork := mode == NetworkFaultMode
	if !wantNetwork && mode != ChipFaultMode {
		return timeoutFaults
	}
	faultDurationMapLock.Lock()
	defer faultDurationMapLock.Unlock()
	for eventId, cache := range faultDurationMap {
		code, err := strconv.ParseInt(eventId, Hex, 0)
		if err != nil {
			hwlog.RunLog.Errorf(parseHexFailedMsg, eventId)
			continue
		}
		// skip codes that belong to the other fault category
		if NetworkFaultCodes.Has(code) != wantNetwork {
			continue
		}
		data := cache.Duration[logicId]
		if !data.TimeoutStatus {
			continue
		}
		timeoutFaults[code] = FaultTimeAndLevel{
			FaultTime:  data.FaultAlarmTime,
			FaultLevel: cache.FaultHandling,
		}
	}
	return timeoutFaults
}
// GetFrequencyFaultLevelAndCodes returns, for the given NPU, every fault code whose recorded
// occurrence list is non-empty and holds at least Times entries. The reported fault time is the
// last entry of that list and the level is the FaultHandling of the frequency record.
// mode selects which codes are considered: ChipFaultMode keeps codes that are not in
// NetworkFaultCodes, NetworkFaultMode keeps codes that are. Any other mode yields an empty map.
// faultFrequencyMapLock is held while faultFrequencyMap is scanned.
func GetFrequencyFaultLevelAndCodes(mode string, logicId int32) map[int64]FaultTimeAndLevel {
	frequencyFaults := make(map[int64]FaultTimeAndLevel)
	wantNetwork := mode == NetworkFaultMode
	if !wantNetwork && mode != ChipFaultMode {
		return frequencyFaults
	}
	faultFrequencyMapLock.Lock()
	defer faultFrequencyMapLock.Unlock()
	for eventId, cache := range faultFrequencyMap {
		code, err := strconv.ParseInt(eventId, Hex, 0)
		if err != nil {
			hwlog.RunLog.Errorf(parseHexFailedMsg, eventId)
			continue
		}
		// skip codes that belong to the other fault category
		if NetworkFaultCodes.Has(code) != wantNetwork {
			continue
		}
		occurTimes := cache.Frequency[logicId]
		count := len(occurTimes)
		if count == 0 || int64(count) < cache.Times {
			continue
		}
		frequencyFaults[code] = FaultTimeAndLevel{
			FaultTime:  occurTimes[count-1],
			FaultLevel: cache.FaultHandling,
		}
	}
	return frequencyFaults
}
// GetUpgradeFaultLevelAndTime returns the upgrade time and fault level of every entry in the
// upgrade fault cache copy of the given NPU, keyed by the numeric fault code.
// mode selects which codes are kept: NetworkFaultMode keeps codes that are in NetworkFaultCodes,
// ChipFaultMode keeps codes that are not, AllFaultMode keeps all of them. Any other mode yields
// an empty map. Entries whose fault code cannot be parsed as hex are logged and skipped.
func GetUpgradeFaultLevelAndTime(logicId int32, mode string) map[int64]FaultTimeAndLevel {
	upgradeFaults := copyUpgradeFaultCacheFromLogic(LogicId(logicId))
	levels := make(map[int64]FaultTimeAndLevel)
	supported := mode == ChipFaultMode || mode == NetworkFaultMode || mode == AllFaultMode
	if !supported {
		return levels
	}
	for _, item := range upgradeFaults {
		code, err := strconv.ParseInt(item.FaultCode, Hex, 0)
		if err != nil {
			hwlog.RunLog.Errorf(parseHexFailedMsg, item.FaultCode)
			continue
		}
		// skip codes that belong to the other fault category
		if (mode == NetworkFaultMode && !NetworkFaultCodes.Has(code)) ||
			(mode == ChipFaultMode && NetworkFaultCodes.Has(code)) {
			continue
		}
		levels[code] = FaultTimeAndLevel{
			FaultTime:  item.UpgradeTime,
			FaultLevel: item.FaultLevel,
		}
	}
	return levels
}