Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package fault
import (
"fmt"
"net"
"os"
"path/filepath"
"sort"
"strings"
"sync"
"time"
"ascend-common/common-utils/hwlog"
"ascend-common/common-utils/utils"
"github.com/Mellanox/k8s-rdma-shared-dev-plugin/pkg/resources/common"
util "github.com/Mellanox/k8s-rdma-shared-dev-plugin/pkg/utils"
)
var (
faultTimeCache = make(map[string]int64)
faultTimeCacheMu sync.Mutex
)
type HcaInfo struct {
Hca string
State string
PhysState string
}
type FaultDetail struct {
FaultCode string `json:"FaultCode"`
Time int64 `json:"Time"`
Description string `json:"Description"`
FaultLevel string `json:"FaultLevel"`
}
type DPUItem struct {
HcaName string `json:"HcaName"`
EthName string `json:"EthName"`
IpAddr string `json:"IpAddr,omitempty"`
DeviceID string `json:"DeviceID"`
VendorID string `json:"VendorID"`
FaultList []FaultDetail `json:"FaultList"`
}
type NodeEvent struct {
NodeName string `json:"NodeName"`
FaultList []FaultDetail `json:"FaultList"`
}
type hcaBasicInfo struct {
DeviceID string
VendorID string
EthName string
IpAddr string
}
type DpuInfoCfg struct {
DPUInfo struct {
DPUList []DPUItem `json:"DPUList"`
NodeEvent *NodeEvent `json:"NodeEvent"`
} `json:"DPUInfo"`
UpdateTime time.Time `json:"UpdateTime"`
}
func GetHcaDeviceID(hca string) string {
deviceID := ReadFile(filepath.Join(common.SysClassInfiniband, hca, "device/device"))
if deviceID != "" && !strings.HasPrefix(deviceID, "0x") {
deviceID = "0x" + deviceID
}
return deviceID
}
func GetHcaVendor(hca string) string {
vendor := ReadFile(filepath.Join(common.SysClassInfiniband, hca, "device/vendor"))
if vendor != "" && !strings.HasPrefix(vendor, "0x") {
vendor = "0x" + vendor
}
return vendor
}
func getCurrentTimeMs() int64 {
return time.Now().UnixMilli()
}
func GetHcaEthName(hca string) string {
if ethName := getEthNameFromInfiniband(hca); ethName != "" {
return ethName
}
entries, err := os.ReadDir(common.SysBusUb)
if err != nil {
hwlog.RunLog.Errorf("Error reading UB devices directory %s: %v", common.SysBusUb, err)
return ""
}
for _, entry := range entries {
ubID := entry.Name()
infinibandDir := filepath.Join(common.SysBusUb, ubID, "infiniband")
ibEntries, err := os.ReadDir(infinibandDir)
if err != nil {
continue
}
for _, ibEntry := range ibEntries {
if ibEntry.Name() == hca {
netDir := filepath.Join(common.SysBusUb, ubID, "net")
netEntries, err := os.ReadDir(netDir)
if err != nil {
hwlog.RunLog.Errorf("Error reading net directory %s: %v", netDir, err)
return ""
}
if len(netEntries) > 0 {
return netEntries[0].Name()
}
return ""
}
}
}
return ""
}
func getEthNameFromInfiniband(hca string) string {
netDir := filepath.Join(common.SysClassInfiniband, hca, "device", "net")
entries, err := os.ReadDir(netDir)
if err != nil {
hwlog.RunLog.Errorf("Error reading infiniband net directory %s: %v", netDir, err)
return ""
}
if len(entries) > 0 {
return entries[0].Name()
}
return ""
}
func GetHcaIpAddr(ethName string) string {
if ethName == "" {
return ""
}
iface, err := net.InterfaceByName(ethName)
if err != nil {
hwlog.RunLog.Errorf("Error getting interface %s: %v", ethName, err)
return ""
}
addrs, err := iface.Addrs()
if err != nil {
hwlog.RunLog.Errorf("Error getting addresses for interface %s: %v", ethName, err)
return ""
}
var ipv6Addr string
for _, addr := range addrs {
ipNet, ok := addr.(*net.IPNet)
if ok && !ipNet.IP.IsLoopback() {
if ipNet.IP.To4() != nil {
return ipNet.IP.String()
}
if ipv6Addr == "" {
ipv6Addr = ipNet.IP.String()
}
}
}
return ipv6Addr
}
func BuildDPUInfoCfg(results []FaultResult) DpuInfoCfg {
var cfg DpuInfoCfg
hcaFaultMap := buildHcaFaultMap(results)
hcaBasicMap := buildHcaBasicMap(results)
hcaNames := make([]string, 0, len(hcaBasicMap))
for hcaName := range hcaBasicMap {
hcaNames = append(hcaNames, hcaName)
}
sort.Strings(hcaNames)
for _, hcaName := range hcaNames {
faults := getHcaFaults(hcaName, hcaFaultMap)
cfg.DPUInfo.DPUList = append(cfg.DPUInfo.DPUList, buildDPUItem(hcaName, hcaBasicMap[hcaName], faults))
}
cfg.DPUInfo.NodeEvent = buildNodeEvent(results)
cfg.UpdateTime = time.Now()
return cfg
}
func buildHcaFaultMap(results []FaultResult) map[string][]FaultDetail {
hcaFaultMap := make(map[string][]FaultDetail)
activeKeys := make(map[string]bool)
faultTimeCacheMu.Lock()
defer faultTimeCacheMu.Unlock()
for _, fr := range results {
if fr.Hca == "" || fr.Result != "true" {
continue
}
cacheKey := fmt.Sprintf("%s:%s", fr.Hca, fr.Fault.FaultCode)
activeKeys[cacheKey] = true
detectTime, exists := faultTimeCache[cacheKey]
if !exists {
detectTime = getCurrentTimeMs()
faultTimeCache[cacheKey] = detectTime
hwlog.RunLog.Errorf("New fault detected: Hca=%s, Code=%s, Level=%s, Description=%s, Time=%d",
fr.Hca, fr.Fault.FaultCode, fr.Fault.FaultLevel, fr.Fault.Description, detectTime)
hwlog.RunLog.Errorf("Details: %s", fr.Details)
}
faultDetail := FaultDetail{
FaultCode: fr.Fault.FaultCode,
Time: detectTime,
Description: fr.Fault.Description,
FaultLevel: fr.Fault.FaultLevel,
}
hcaFaultMap[fr.Hca] = append(hcaFaultMap[fr.Hca], faultDetail)
}
for cacheKey := range faultTimeCache {
if strings.HasPrefix(cacheKey, "node:") {
continue
}
if !activeKeys[cacheKey] {
delete(faultTimeCache, cacheKey)
}
}
return hcaFaultMap
}
func buildHcaBasicMap(results []FaultResult) map[string]hcaBasicInfo {
hcaBasicMap := make(map[string]hcaBasicInfo)
for _, fr := range results {
if fr.Hca == "" {
continue
}
if _, exists := hcaBasicMap[fr.Hca]; exists {
continue
}
hcaBasicMap[fr.Hca] = buildHcaBasicInfo(fr.Hca)
}
return hcaBasicMap
}
func buildHcaBasicInfo(hca string) hcaBasicInfo {
ethName := GetHcaEthName(hca)
return hcaBasicInfo{
DeviceID: GetHcaDeviceID(hca),
VendorID: GetHcaVendor(hca),
EthName: ethName,
IpAddr: GetHcaIpAddr(ethName),
}
}
func getHcaFaults(hcaName string, hcaFaultMap map[string][]FaultDetail) []FaultDetail {
if faults, hasFaults := hcaFaultMap[hcaName]; hasFaults {
return faults
}
return []FaultDetail{}
}
func buildNodeEvent(results []FaultResult) *NodeEvent {
faultTimeCacheMu.Lock()
defer faultTimeCacheMu.Unlock()
nodeFaults := make([]FaultDetail, 0)
activeKeys := make(map[string]bool)
for _, fr := range results {
if fr.Hca != "" || fr.Result != "true" {
continue
}
cacheKey := fmt.Sprintf("node:%s", fr.Fault.FaultCode)
activeKeys[cacheKey] = true
detectTime, exists := faultTimeCache[cacheKey]
if !exists {
detectTime = getCurrentTimeMs()
faultTimeCache[cacheKey] = detectTime
hwlog.RunLog.Errorf("Node fault detected: Code=%s, Level=%s, Description=%s, Time=%d",
fr.Fault.FaultCode, fr.Fault.FaultLevel, fr.Fault.Description, detectTime)
hwlog.RunLog.Errorf("Fault result details: %s", fr.Details)
}
nodeFaults = append(nodeFaults, FaultDetail{
FaultCode: fr.Fault.FaultCode,
Time: detectTime,
Description: fr.Fault.Description,
FaultLevel: fr.Fault.FaultLevel,
})
}
for cacheKey := range faultTimeCache {
if !strings.HasPrefix(cacheKey, "node:") {
continue
}
if !activeKeys[cacheKey] {
delete(faultTimeCache, cacheKey)
}
}
nodeName, err := util.GetNodeName()
if err != nil {
hwlog.RunLog.Errorf("Failed to get node name for NodeEvent: %v", err)
return nil
}
return &NodeEvent{
NodeName: nodeName,
FaultList: nodeFaults,
}
}
func buildDPUItem(hcaName string, basicInfo hcaBasicInfo, faults []FaultDetail) DPUItem {
return DPUItem{
HcaName: hcaName,
EthName: basicInfo.EthName,
IpAddr: basicInfo.IpAddr,
DeviceID: basicInfo.DeviceID,
VendorID: basicInfo.VendorID,
FaultList: faults,
}
}
func ReadFile(path string) string {
data, err := utils.ReadLimitBytesWithSymlink(path, 1024, validateSysfsPath)
if err != nil {
return ""
}
return strings.TrimSpace(string(data))
}