Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package server
import (
"encoding/json"
"fmt"
"net"
"sync"
"k8s.io/api/core/v1"
"Ascend-device-plugin/pkg/common"
"Ascend-device-plugin/pkg/device"
"ascend-common/api"
"ascend-common/common-utils/hwlog"
"ascend-common/common-utils/utils"
npuCommon "ascend-common/devmanager/common"
)
// npuNicMappingConfigPath is the path of the JSON file that
// getNpuNicMappingCache loads and decodes into an NpuNicMapping.
const npuNicMappingConfigPath = "/user/mindx-dl/npu/npu-nic-mapping.json"
// NpuNicMapping is the top-level structure decoded from the npu-nic-mapping
// JSON config file. It holds the list of NpuNicItem entries.
type NpuNicMapping struct {
// NpuNics is read from the "npuNics" JSON key; getNpuToNicNames scans it
// linearly to find the entry for a given NPU id.
NpuNics []NpuNicItem `json:"npuNics"`
}
// NpuNicItem associates one NPU id with the names of network interfaces.
type NpuNicItem struct {
// NpuId is compared against the requested NPU id in getNpuToNicNames.
NpuId int `json:"npuId"`
// NicNames is handed to getInterfaceIPsByPriority by getROCEAddrList, which
// tries the interfaces in slice order and uses the first one with a usable IP.
NicNames []string `json:"nicNames"`
}
// Package-level state for the npu-nic-mapping config. It is written only
// inside the sync.Once callback of getNpuNicMappingCache.
var (
// npuNicMappingCache is the decoded config; it stays nil when the file
// yields no data or when reading/parsing fails.
npuNicMappingCache *NpuNicMapping
// npuNicMappingCacheOnce ensures the load is attempted at most once.
npuNicMappingCacheOnce sync.Once
// npuNicMappingErr keeps the read or parse error of that single attempt.
npuNicMappingErr error
)
// getCardType derives the card name from the board info of the first device
// and the main board id.
//
// It returns an error only when the board info query fails. When the board id
// is none of npuCommon.A5300IBoardId, A5300IBoardId2 or A5300IBoardId3, or the
// main board id matches neither common.A5300IMainBoardId nor
// common.A5300I4PMainBoardId, it returns an empty name and a nil error.
func (hdm *HwDevManager) getCardType() (string, error) {
	boardInfo, err := hdm.manager.GetDmgr().GetBoardInfo(
		hdm.allInfo.AllDevs[common.FirstDevice].LogicID)
	if err != nil {
		return "", err
	}

	boardID := boardInfo.BoardId
	isA5300IBoard := boardID == npuCommon.A5300IBoardId ||
		boardID == npuCommon.A5300IBoardId2 ||
		boardID == npuCommon.A5300IBoardId3
	if !isA5300IBoard {
		return "", nil
	}

	// The main board id tells the two A5300I variants apart.
	mainBoardID := hdm.manager.GetDmgr().GetMainBoardId()
	switch {
	case mainBoardID == common.A5300IMainBoardId:
		return common.A5300ICardName, nil
	case mainBoardID == common.A5300I4PMainBoardId:
		return common.A54P300ICardName, nil
	default:
		return "", nil
	}
}
// getProductInfo builds a fresh ProductBase from the values currently reported
// by hdm.manager and its underlying device manager.
//
// It returns nil when hdm.manager or the manager returned by GetDmgr is nil.
func (hdm *HwDevManager) getProductInfo() *ProductBase {
	if hdm.manager == nil {
		return nil
	}
	devMgr := hdm.manager.GetDmgr()
	if devMgr == nil {
		return nil
	}

	boardID := int(devMgr.GetMainBoardId())

	product := new(ProductBase)
	product.superPodSize = uint32(hdm.manager.GetSuperPodSize())
	product.superPodID = uint32(hdm.manager.GetSuperPodID())
	product.serverIndex = uint32(hdm.manager.GetServerIndex())
	// The chassis id is taken from the rack id reported by the manager.
	product.chassisID = uint32(hdm.manager.GetRackID())
	product.superPodType = uint32(hdm.manager.GetSuperPodType())
	product.nodeInternalIP = hdm.manager.GetNodeInternalIPInK8s()
	product.cardType = common.ParamOption.CardType
	product.mainBoardId = boardID
	product.maxNpuCount = npuCommon.GetMaxNpuCountPerNode(boardID)
	return product
}
// getLevelList assembles the rank level list for dev.
//
// It returns nil when the real card type is not api.Ascend910A5, when dev is
// nil, or when no product info can be obtained. As a side effect it overwrites
// the package-level npuBase.productInfo. A failure of SetUrmaDeviceInfoByHdm is
// only logged; the list is still built. Levels whose info key is empty or
// whose rank address list is empty are skipped.
func (hdm *HwDevManager) getLevelList(dev *common.NpuDevice) []api.RankLevel {
	if cardType := common.ParamOption.RealCardType; cardType != api.Ascend910A5 {
		hwlog.RunLog.Debugf("real card type is %v, no levelList information", cardType)
		return nil
	}
	if dev == nil {
		hwlog.RunLog.Error("input parameter dev is empty")
		return nil
	}

	npuBase.productInfo = hdm.getProductInfo()
	if npuBase.productInfo == nil {
		return nil
	}

	// Best effort: keep going even if the URMA device info could not be set.
	if err := npuBase.SetUrmaDeviceInfoByHdm(hdm, dev); err != nil {
		hwlog.RunLog.Errorf("set urma device info by hdm failed for LogicID(%d) phyID(%d), err: %v",
			dev.LogicID, dev.PhyID, err)
	}

	maxNpuCount := npuBase.productInfo.maxNpuCount
	levels := make([]api.RankLevel, 0)
	for level, infoKey := range npuBase.getRankLevelInfoKeyArr() {
		if infoKey == "" {
			continue
		}

		addrs := hdm.getRankAddrList(level, dev, maxNpuCount)
		if len(addrs) == 0 {
			hwlog.RunLog.Warnf("rank addr list is empty for LogicID(%d) phyID(%d) level(%d) netType(%s)",
				dev.LogicID, dev.PhyID, level, infoKey)
			continue
		}

		element := api.LevelElement{
			NetLayer:      level,
			NetInstanceID: npuBase.getID(level),
			NetType:       npuBase.getNetTypeForLevel(level),
			NetAttr:       api.NetAttrEmpty,
			RankAddrList:  addrs,
		}
		levels = append(levels, api.RankLevel{
			Level: level,
			Info:  map[string]api.LevelElement{infoKey: element},
		})
	}
	return levels
}
// getRankAddrList returns the rank addresses of dev for the given level.
//
// Dispatch order:
//   - nil dev or unavailable product info: nil
//   - level == api.RankLevel3: RoCE address list from getROCEAddrList
//   - stand card product: getRankAddrListOriginal
//   - otherwise the URMA device list is queried and parsed, then handed to the
//     pod or server builder of npuBase; nil if the list is empty or the product
//     is neither a pod scene nor a server.
func (hdm *HwDevManager) getRankAddrList(level int, dev *common.NpuDevice, maxNpuNum int) []api.RankAddrItem {
	if dev == nil {
		return nil
	}

	productInfo := hdm.getProductInfo()
	switch {
	case productInfo == nil:
		return nil
	case level == api.RankLevel3:
		return hdm.getROCEAddrList(dev, maxNpuNum)
	case productInfo.isStandCard():
		return hdm.getRankAddrListOriginal(level, dev)
	}

	urmaDevs := hdm.GetUrmaDeviceList(dev)
	if len(urmaDevs) == 0 {
		return nil
	}
	parsedDevs := ParseUrmaDevices(urmaDevs)

	switch {
	case productInfo.isPodScene():
		return npuBase.buildPodRankAddrListParsed(level, dev, parsedDevs)
	case productInfo.isServer():
		return npuBase.buildServerRankAddrListParsed(level, parsedDevs)
	default:
		return nil
	}
}
// GetUrmaDeviceList queries all URMA device EID lists for dev.LogicID and
// returns one UrmaDevice per reported entry, with every EID converted to a
// string by RawBytesToEidString.
//
// It returns nil when dev, hdm.manager or the underlying device manager is
// nil, or when the EID query fails. On success the result is non-nil, possibly
// empty.
func (hdm *HwDevManager) GetUrmaDeviceList(dev *common.NpuDevice) []*UrmaDevice {
	// Exported entry point: guard against nil inputs instead of panicking,
	// mirroring the checks done in getProductInfo.
	if dev == nil || hdm.manager == nil {
		return nil
	}
	dmgr := hdm.manager.GetDmgr()
	if dmgr == nil {
		return nil
	}
	infoList, err := dmgr.GetUrmaDevEidListAll(dev.LogicID)
	if err != nil {
		return nil
	}

	result := make([]*UrmaDevice, 0, len(infoList))
	for _, info := range infoList {
		// EidCount comes from the driver; clamp it to the actual container
		// size so a bogus count cannot cause an index-out-of-range panic.
		eidCount := int(info.EidCount)
		if eidCount > len(info.EidInfos) {
			eidCount = len(info.EidInfos)
		}
		if eidCount < 0 {
			eidCount = 0
		}

		u := &UrmaDevice{
			EidList: make([]string, 0, eidCount),
		}
		for i := 0; i < eidCount; i++ {
			raw := info.EidInfos[i].Eid.Raw[:]
			u.EidList = append(u.EidList, RawBytesToEidString(raw))
		}
		result = append(result, u)
	}
	return result
}
// getRankAddrListOriginal collects the rank addresses of dev for level by
// asking npuBase for the net type and function entity ids of that level and
// concatenating the addresses returned for each function entity id.
//
// The result is always non-nil; it is empty when no address is found.
// dev must not be nil.
func (hdm *HwDevManager) getRankAddrListOriginal(level int, dev *common.NpuDevice) []api.RankAddrItem {
	collected := make([]api.RankAddrItem, 0)
	netType, funcEntityIDs := npuBase.getNetTypeAndFeIDListByRankLevel(level)
	for _, funcEntityID := range funcEntityIDs {
		collected = append(collected,
			npuBase.getRandAddrByFuncEntityID(dev.PhyID, funcEntityID, netType, level)...)
	}
	return collected
}
// getNpuNicMappingCache returns the decoded npu-nic-mapping config.
//
// The config file is read and parsed at most once (guarded by
// npuNicMappingCacheOnce); every later call returns the outcome of that first
// attempt, including a failure.
//
// Returns:
//   - (mapping, nil) when the file was loaded and parsed;
//   - (nil, nil) when utils.LoadFile returned no data (logged as "not found");
//   - (nil, err) when reading or JSON decoding failed. The underlying error is
//     wrapped with %w so callers can inspect it with errors.Is / errors.As.
func getNpuNicMappingCache() (*NpuNicMapping, error) {
	npuNicMappingCacheOnce.Do(func() {
		data, err := utils.LoadFile(npuNicMappingConfigPath)
		if err != nil {
			npuNicMappingErr = fmt.Errorf("read config file error: %w", err)
			return
		}
		if data == nil {
			// No data is not an error: the cache simply stays nil.
			hwlog.RunLog.Warnf("npu-nic-mapping config file not found: %s", npuNicMappingConfigPath)
			return
		}
		var mapping NpuNicMapping
		if err = json.Unmarshal(data, &mapping); err != nil {
			npuNicMappingErr = fmt.Errorf("parse config file error: %w", err)
			return
		}
		npuNicMappingCache = &mapping
		hwlog.RunLog.Infof("npu-nic-mapping config loaded: %v", mapping)
	})
	return npuNicMappingCache, npuNicMappingErr
}
func getIPAddressType(ip string) string {
parsedIP := net.ParseIP(ip)
if parsedIP == nil {
return addrTypeIPV4
}
if parsedIP.To4() != nil {
return addrTypeIPV4
}
return addrTypeIPV6
}
// getInterfaceIPsByPriority walks nicNames in order and returns the first IP
// of the first interface for which getInterfaceIPs yields at least one address.
//
// Interfaces without a usable address are logged and skipped. An error is
// returned when no interface in nicNames (including an empty list) provides
// an address.
func getInterfaceIPsByPriority(nicNames []string) (string, error) {
	for idx := range nicNames {
		candidate := nicNames[idx]
		found := getInterfaceIPs(candidate)
		if len(found) == 0 {
			hwlog.RunLog.Warnf("interface %s has no valid IP address, checking next interface", candidate)
			continue
		}
		return found[0], nil
	}
	return "", fmt.Errorf("no valid IP address found for any interface: %v", nicNames)
}
// getInterfaceIPs returns the textual IP addresses bound to the interface
// named nicName, skipping loopback, link-local unicast and link-local
// multicast addresses as well as entries that are not *net.IPNet.
//
// It returns nil when the interface cannot be found, its addresses cannot be
// listed, it has no addresses, or all of them are filtered out. Each failure
// cause is logged separately, so an interface without addresses is not
// reported as an "error: <nil>".
func getInterfaceIPs(nicName string) []string {
	iface, err := net.InterfaceByName(nicName)
	if err != nil {
		hwlog.RunLog.Errorf("get interface %s error: %v", nicName, err)
		return nil
	}
	addrs, err := iface.Addrs()
	if err != nil {
		hwlog.RunLog.Errorf("get interface %s addrs error: %v", nicName, err)
		return nil
	}
	if len(addrs) == 0 {
		hwlog.RunLog.Errorf("interface %s has no addresses", nicName)
		return nil
	}

	var ips []string
	for _, addr := range addrs {
		ipNet, ok := addr.(*net.IPNet)
		if !ok {
			continue
		}
		ip := ipNet.IP
		if ip.IsLoopback() || ip.IsLinkLocalUnicast() || ip.IsLinkLocalMulticast() {
			continue
		}
		ips = append(ips, ip.String())
	}
	return ips
}
// getNpuToNicNames looks up the NIC names configured for npuId.
//
// Returns:
//   - (nil, err) when the mapping config failed to load;
//   - (nil, nil) when no mapping config is available;
//   - (names, nil) for the first entry whose NpuId equals npuId;
//   - (nil, err) when the config has no entry for npuId.
func getNpuToNicNames(npuId int) ([]string, error) {
	mapping, err := getNpuNicMappingCache()
	switch {
	case err != nil:
		return nil, err
	case mapping == nil:
		return nil, nil
	}

	for i := range mapping.NpuNics {
		if entry := &mapping.NpuNics[i]; entry.NpuId == npuId {
			return entry.NicNames, nil
		}
	}
	return nil, fmt.Errorf("npuId %d not found in mapping", npuId)
}
// getROCEAddrList builds the RoCE rank address list for dev.
//
// The NPU id used for the NIC lookup is dev.PhyID modulo maxNpuNum. The NIC
// names configured for that id are tried in order and the first usable IP
// becomes the single returned address, with empty Ports and the default plane
// id.
//
// An empty, non-nil list is returned, and the reason logged, when dev is nil,
// maxNpuNum is not a positive value that fits the modulo, the NIC lookup
// fails, no NIC names are available, or none of the NICs has a usable IP.
func (hdm *HwDevManager) getROCEAddrList(dev *common.NpuDevice, maxNpuNum int) []api.RankAddrItem {
	if dev == nil {
		hwlog.RunLog.Error("device is nil")
		return []api.RankAddrItem{}
	}
	// Guard the modulo: a zero divisor, including one produced by the int32
	// conversion, would panic with an integer divide by zero.
	divisor := int32(maxNpuNum)
	if divisor <= 0 {
		hwlog.RunLog.Errorf("invalid max npu count %d, returning empty addr list", maxNpuNum)
		return []api.RankAddrItem{}
	}
	npuID := int(dev.PhyID % divisor)
	nicNames, err := getNpuToNicNames(npuID)
	if err != nil {
		hwlog.RunLog.Warnf("get npu %d nic names failed: %v, returning empty addr list", npuID, err)
		return []api.RankAddrItem{}
	}
	if nicNames == nil {
		hwlog.RunLog.Warnf("npu-nic-mapping config not found, returning empty addr list")
		return []api.RankAddrItem{}
	}
	ip, err := getInterfaceIPsByPriority(nicNames)
	if err != nil {
		hwlog.RunLog.Errorf("get roce addr list failed: %v", err)
		return []api.RankAddrItem{}
	}
	addrType := getIPAddressType(ip)
	hwlog.RunLog.Infof("get RoCE addr for NPU %d: %s (type: %s)", npuID, ip, addrType)
	return []api.RankAddrItem{
		{
			AddrType: addrType,
			Addr:     ip,
			Ports:    []string{},
			PlaneId:  api.DefaultRandAddrPlaneID,
		},
	}
}
// GetDevManager returns the device.DevManager held in hdm.manager.
// The value is returned as stored and may be nil.
func (hdm *HwDevManager) GetDevManager() device.DevManager {
return hdm.manager
}
// GetRackID returns the rack id reported by hdm.manager.
// It does not check hdm.manager for nil.
func (hdm *HwDevManager) GetRackID() int32 {
return hdm.manager.GetRackID()
}
// GetSuperPodID returns the super pod id reported by hdm.manager.
// It does not check hdm.manager for nil.
func (hdm *HwDevManager) GetSuperPodID() int32 {
return hdm.manager.GetSuperPodID()
}
// GetSuperPodType returns the super pod type reported by hdm.manager.
// It does not check hdm.manager for nil.
func (hdm *HwDevManager) GetSuperPodType() int32 {
return hdm.manager.GetSuperPodType()
}
// SetNodeInternalIPInK8s stores the node's internal IP in hdm.manager.
//
// It does nothing (apart from logging) when the real card type is not
// api.Ascend910A5 or when node is nil. Otherwise the first address of type
// v1.NodeInternalIP in node.Status.Addresses is stored; if the node has no
// such address, an empty string is stored.
func (hdm *HwDevManager) SetNodeInternalIPInK8s(node *v1.Node) {
	if cardType := common.ParamOption.RealCardType; cardType != api.Ascend910A5 {
		hwlog.RunLog.Infof("real card type is %v, no need server ip in k8s", cardType)
		return
	}
	if node == nil {
		hwlog.RunLog.Error("node is empty")
		return
	}

	var internalIP string
	for i := range node.Status.Addresses {
		if nodeAddr := node.Status.Addresses[i]; nodeAddr.Type == v1.NodeInternalIP {
			internalIP = nodeAddr.Address
			break
		}
	}
	hdm.manager.SetNodeInternalIPInK8s(internalIP)
}