Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package devmanager
import (
"errors"
"fmt"
"math"
"strings"
"sync"
"time"
"ascend-common/api"
"ascend-common/common-utils/hwlog"
"ascend-common/common-utils/utils"
"ascend-common/devmanager/common"
"ascend-common/devmanager/dcmi"
)
// DeviceInterface is the set of device-management operations exposed by this
// package. *DeviceManager is asserted to implement it in this file.
//
// Most methods address a chip by its logicID; a few (GetDeviceNumInCard,
// GetMcuPowerInfo, GetCardElabelV2) take a cardID instead, and
// GetDeviceLogicID takes a (cardID, deviceID) pair.
type DeviceInterface interface {
Init() error
ShutDown() error
GetDcmiVersion() string
GetAllDeviceCount() (int32, error)
GetCardList() (int32, []int32, error)
GetDeviceNumInCard(cardID int32) (int32, error)
GetDeviceList() (int32, []int32, error)
GetChipBaseInfos() ([]*common.ChipBaseInfo, error)
GetDeviceHealth(logicID int32) (uint32, error)
GetDeviceNetWorkHealth(logicID int32) (uint32, error)
GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error)
GetDeviceUtilizationRateV2(logicID int32) (common.DcmiMultiUtilizationInfo, error)
GetDeviceUtilizationRateV2Period(logicID int32) (common.DcmiMultiUtilizationInfo, error)
GetDeviceUtilizationRateCommon(logicID int32) (common.DcmiMultiUtilizationInfo, error)
GetDeviceTemperature(logicID int32) (int32, error)
GetDeviceVoltage(logicID int32) (float32, error)
GetDevicePowerInfo(logicID int32) (float32, error)
GetMcuPowerInfo(cardID int32) (float32, error)
GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error)
GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error)
GetDeviceHbmInfo(logicID int32) (*common.HbmInfo, error)
GetDeviceErrorCode(logicID int32) (int32, int64, error)
GetChipInfo(logicID int32) (*common.ChipInfo, error)
GetPhysicIDFromLogicID(logicID int32) (int32, error)
GetLogicIDFromPhysicID(physicID int32) (int32, error)
GetDeviceLogicID(cardID, deviceID int32) (int32, error)
GetCardIDDeviceID(logicID int32) (int32, int32, error)
GetDeviceIPAddress(logicID, ipType int32) (string, error)
CreateVirtualDevice(logicID int32, vDevInfo common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error)
GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error)
DestroyVirtualDevice(logicID int32, vDevID uint32) error
GetDevType() string
GetProductTypeArray() []string
GetProductType(logicID int32) (string, error)
GetAllProductType() ([]string, error)
GetNpuWorkMode() string
SetDeviceReset(logicID int32) error
GetBrotherCardID(logicID int32) (int32, error)
PreResetSoc(logicID int32) error
GetOutBandChannelState(logicID int32) error
SetDeviceResetOutBand(logicID int32) error
RescanSoc(logicID int32) error
GetDeviceBootStatus(logicID int32) (int, error)
GetDeviceAllErrorCode(logicID int32) (int32, []int64, error)
GetDeviceAllErrorCodeWithTimeOut(logicID int32, timeout time.Duration) (int32, []int64, error)
SubscribeDeviceFaultEvent(logicID int32) error
SetFaultEventCallFunc(func(common.DevFaultInfo)) error
GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error)
GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error)
GetPCIeBusInfo(logicID int32) (string, error)
GetBoardInfo(logicID int32) (common.BoardInfo, error)
GetCardElabelV2(cardID int32) (common.ElabelInfo, error)
GetPCIEBandwidth(logicID int32, profilingTime int) (common.PCIEBwStat, error)
SetIsTrainingCard() error
IsTrainingCard() bool
GetValidChipInfo() (common.ChipInfo, error)
GetDeviceEccInfo(logicID int32, dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error)
GetSuperPodInfo(int32) (common.CgoSuperPodInfo, error)
GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error)
GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error)
GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error)
GetMainBoardId() uint32
GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error)
StartHccsPingMesh(logicID int32, portID int, operate common.HccspingMeshOperate) error
StopHccsPingMesh(logicID int32, portID int, taskID uint) error
GetHccsPingMeshInfo(logicID int32, portID int, taskID uint) (*common.HccspingMeshInfo, error)
GetHccsPingMeshState(logicID int32, portID int, taskID uint) (int, error)
GetSuperPodStatus(int32, uint32) (int, error)
SetSuperPodStatus(int32, uint32, uint32) error
GetMultiDiePolicy() (dcmi.DiePolicyType, error)
SetMultiDiePolicy(dcmi.DiePolicyType) error
GetUrmaDeviceCount(int32) (int32, error)
GetUrmaDevEidList(int32, int32) (*common.UrmaDeviceInfo, error)
GetUrmaDevEidListAll(int32) ([]common.UrmaDeviceInfo, error)
GetValidBoardInfo() (common.BoardInfo, error)
GetValidMainBoardInfo() (uint32, error)
WaitDeviceOnline(resetTimeout int)
}
// Retry settings used while waiting for the card list to become available.
const (
// maxRetries is not referenced in the portion of the file shown here.
// NOTE(review): confirm it is still used elsewhere.
maxRetries = 6
// defaultRetryDelay is the default wait between card-list attempts. The
// retry loops multiply it by time.Second before sleeping, so it is a
// number of seconds.
defaultRetryDelay = 10
)
var (
// devManager is the package-level manager assigned inside GetDeviceManager.
devManager *DeviceManager = nil
// idCache is a concurrent map; its users are outside the portion of the
// file shown here. NOTE(review): presumably caches logicID to
// cardID/deviceID mappings (see npuIdMapping) — confirm.
idCache sync.Map
// devManagerOnce is shared by GetDeviceManager and WaitDeviceOnline, so only
// the first of the two to run executes its one-time body.
devManagerOnce sync.Once
)
// Compile-time assertion that *DeviceManager implements DeviceInterface.
var _ DeviceInterface = &DeviceManager{}
// npuIdMapping groups the three identifiers of one chip: its logic ID and the
// card ID / device ID pair.
// NOTE(review): presumably the value type stored in idCache — confirm.
type npuIdMapping struct {
logicId int32
cardId int32
deviceId int32
}
// DeviceManager implements DeviceInterface on top of a dcmi driver.
type DeviceManager struct {
// DcMgr is the dcmi driver that the methods delegate to.
DcMgr dcmi.DcDriverInterface
// DevType is the device type string returned by GetDevType and compared
// against api.Ascend* constants.
DevType string
// ProductTypes is the list returned by GetProductTypeArray and
// GetAllProductType.
ProductTypes []string
// isTrainingCard is written by SetIsTrainingCard and read by IsTrainingCard.
isTrainingCard bool
// dcmiVersion is the version string obtained from DcGetDcmiVersion.
dcmiVersion string
dcmiApiVersion string
// mainBoardId is the value returned by GetMainBoardId.
mainBoardId uint32
// utilizationFuncCache holds the utilization query function selected by
// determineAndCacheUtilizationFunc.
utilizationFuncCache utilizationFuncCache
// unsupportedDeviceTypeCache records device type codes for which the
// utilization query reported "not supported".
unsupportedDeviceTypeCache unsupportedDeviceTypeCache
}
// deviceCommonInitManager embeds DeviceManager and adds the setter-style
// methods (SetDcmiVersion, SetDcManger, SetDevType, ...) that populate its
// fields.
type deviceCommonInitManager struct {
DeviceManager
}
// SetDcmiVersion asks the driver for the dcmi version and stores the result
// on the manager. A failed query is only logged as a warning; whatever value
// the driver returned is stored regardless.
func (d *deviceCommonInitManager) SetDcmiVersion() {
	version, queryErr := d.DcMgr.DcGetDcmiVersion()
	if queryErr != nil {
		hwlog.RunLog.Warnf("deviceManager get dcmi version failed, err: %v", queryErr)
	}
	d.dcmiVersion = version
}
// GetDcmiApiVersion returns the stored dcmi API version string.
func (d *deviceCommonInitManager) GetDcmiApiVersion() string {
	apiVersion := d.dcmiApiVersion
	return apiVersion
}
// SetValidMainBoardInfo finds the first valid main board ID and stores it in
// d.mainBoardId.
//
// The card/device scan is delegated to DeviceManager.GetValidMainBoardInfo,
// which performs the same walk this method previously duplicated line for
// line. The returned errors and emitted logs are therefore unchanged; the
// field is only written when a valid ID was found.
func (d *deviceCommonInitManager) SetValidMainBoardInfo() error {
	mainBoardId, err := d.DeviceManager.GetValidMainBoardInfo()
	if err != nil {
		return err
	}
	d.mainBoardId = mainBoardId
	return nil
}
// SetDcManger installs dcMgr as the dcmi driver. It returns an error that
// names the dynamic type of dcMgr when the value does not implement
// dcmi.DcDriverInterface; in that case the current driver is left untouched.
func (d *deviceCommonInitManager) SetDcManger(dcMgr interface{}) error {
	if driver, isDriver := dcMgr.(dcmi.DcDriverInterface); isDriver {
		d.DcMgr = driver
		return nil
	}
	return fmt.Errorf("DcManger type is %T, need dcmi.DcDriverInterface", dcMgr)
}
// GetDcManager returns the embedded DeviceManager as a DeviceInterface.
func (d *deviceCommonInitManager) GetDcManager() DeviceInterface {
	mgr := &d.DeviceManager
	return mgr
}
// SetDevType records devType as the manager's device type.
func (d *deviceCommonInitManager) SetDevType(devType string) {
	d.DeviceManager.DevType = devType
}
// SetAllProductType collects one product type per card, de-duplicates the
// result and stores it in d.ProductTypes.
//
// For each card, devices are tried in order and the first one whose logic ID
// and product type can both be read contributes the card's product type.
// Cards or devices that fail are skipped with a debug log. If the collected
// list contains api.Atlas300IDuo, the package-level isContainAtlas300IDuo
// flag is set to true.
//
// When the card list cannot be read, or is empty, ProductTypes is set to an
// empty slice and the card-list error (nil for an empty list) is returned.
func (d *deviceCommonInitManager) SetAllProductType() error {
	found := make([]string, 0)
	cardCount, cards, listErr := d.GetCardList()
	if listErr != nil || cardCount == 0 {
		hwlog.RunLog.Errorf("failed to get card list, err: %v", listErr)
		d.ProductTypes = found
		return listErr
	}
	for _, card := range cards {
		chipCount, countErr := d.GetDeviceNumInCard(card)
		if countErr != nil {
			hwlog.RunLog.Debugf("get device num by cardID(%d) failed, error: %v", card, countErr)
			continue
		}
		if chipCount == 0 {
			hwlog.RunLog.Debugf("not found device on card %d", card)
			continue
		}
		for chip := int32(0); chip < chipCount; chip++ {
			logicID, idErr := d.GetDeviceLogicID(card, chip)
			if idErr != nil {
				hwlog.RunLog.Debugf("get device logic id by card %d deviceID %d failed, err: %v", card, chip,
					idErr)
				continue
			}
			product, typeErr := d.GetProductType(logicID)
			if typeErr != nil {
				hwlog.RunLog.Debugf("get product type by card %d deviceID %d failed, err: %v", card, chip, typeErr)
				continue
			}
			// One product type per card is enough.
			found = append(found, product)
			break
		}
	}
	if len(found) > 0 {
		found = common.RemoveDuplicate(&found)
	}
	for i := range found {
		if found[i] == api.Atlas300IDuo {
			isContainAtlas300IDuo = true
		}
	}
	d.ProductTypes = found
	return nil
}
// GetDeviceManager returns the package-level DeviceManager, creating it on the
// first call.
//
// On that first call it repeatedly initializes dcmi and reads the card list
// until the list is consistent (no error and cardNum == len(cardList)) or
// resetTimeout (in seconds) is used up. Between attempts dcmi is shut down and
// the goroutine sleeps for the retry delay. The manager is created even when
// the card list never became consistent; only a failing DcInit or DcShutDown
// aborts creation.
//
// An error is returned whenever devManager is still nil after the one-time
// body, i.e. when initialization was aborted on this or an earlier call.
// NOTE(review): devManagerOnce is shared with WaitDeviceOnline; if that method
// runs first, this body never executes and devManager stays nil — confirm
// this is intended.
func GetDeviceManager(resetTimeout int) (*DeviceManager, error) {
devManagerOnce.Do(func() {
dcMgr := dcmi.DcManager{}
// retryDelay holds a plain count of seconds; it is scaled by time.Second
// only at the Sleep call below.
var retryDelay time.Duration = defaultRetryDelay
hwlog.RunLog.Infof("get card list from dcmi reset timeout is %d", resetTimeout)
for currentTime, retryCount := 0, 0; currentTime <= resetTimeout; currentTime += int(retryDelay) {
if err := dcMgr.DcInit(); err != nil {
hwlog.RunLog.Errorf("deviceManager init failed, prepare dcmi failed, err: %v", err)
return
}
cardNum, cardList, err := dcMgr.DcGetCardList()
if err == nil && int(cardNum) == len(cardList) {
hwlog.RunLog.Infof("deviceManager get cardList is %v, cardList length equal to cardNum: %v",
cardList, cardNum)
break
}
// Shrink the delay so the last retry does not overshoot the timeout.
if diffTime := float64(resetTimeout - currentTime); diffTime > 0 {
retryDelay = time.Duration(math.Min(float64(defaultRetryDelay), diffTime))
}
retryCount++
hwlog.RunLog.Warnf("deviceManager get card list failed (attempt %d), cardNum=%d, cardList=%v, "+
"err: %v", retryCount, cardNum, cardList, err)
// Another attempt still fits in the timeout: shut dcmi down, wait, retry.
if currentTime+int(retryDelay) <= resetTimeout {
if err = dcMgr.DcShutDown(); err != nil {
hwlog.RunLog.Errorf("deviceManager shut down failed, err: %v", err)
return
}
time.Sleep(retryDelay * time.Second)
continue
}
// Out of time: keep dcmi initialized and continue with what we have.
if int(cardNum) != len(cardList) {
hwlog.RunLog.Warnf("deviceManager get cardList is %v, but cardNum is %v, "+
"please check whether the real number of npu matches the cardList", cardList, cardNum)
}
}
devManager = &DeviceManager{}
devManager.DcMgr = &dcMgr
dcmiVer, err := dcMgr.DcGetDcmiVersion()
if err != nil {
hwlog.RunLog.Warnf("deviceManager get dcmi version failed, err: %v", err)
}
hwlog.RunLog.Infof("the dcmi version is %s", dcmiVer)
devManager.dcmiVersion = dcmiVer
})
if devManager == nil {
return nil, errors.New("device Manager is nil, may encounter an exception during initialization. " +
"You can check the system log to confirm")
}
return devManager, nil
}
// WaitDeviceOnline waits, at most once per process, for the card list to
// become consistent (no error and cardNum == len(cardList)).
//
// It repeatedly calls d.Init and d.GetCardList until that condition holds or
// resetTimeout (in seconds) is used up, shutting the driver down and sleeping
// between attempts. A failing Init or ShutDown ends the wait early. A nil
// receiver is logged and ignored.
// NOTE(review): the body is guarded by devManagerOnce, which GetDeviceManager
// also uses; if GetDeviceManager ran first this method does nothing — confirm
// this is intended.
func (d *DeviceManager) WaitDeviceOnline(resetTimeout int) {
if d == nil {
hwlog.RunLog.Error("wait device online failed, mgr is empty")
return
}
devManagerOnce.Do(func() {
// retryDelay holds a plain count of seconds; it is scaled by time.Second
// only at the Sleep call below.
var retryDelay time.Duration = defaultRetryDelay
hwlog.RunLog.Infof("get card list from dcmi reset timeout is %d", resetTimeout)
for currentTime, retryCount := 0, 0; currentTime <= resetTimeout; currentTime += int(retryDelay) {
if err := d.Init(); err != nil {
hwlog.RunLog.Errorf("deviceManager init failed, prepare dcmi failed, err: %v", err)
return
}
cardNum, cardList, err := d.GetCardList()
if err == nil && int(cardNum) == len(cardList) {
hwlog.RunLog.Infof("deviceManager get cardList is %v, cardList length equal to cardNum: %v",
cardList, cardNum)
break
}
// Shrink the delay so the last retry does not overshoot the timeout.
if diffTime := float64(resetTimeout - currentTime); diffTime > 0 {
retryDelay = time.Duration(math.Min(float64(defaultRetryDelay), diffTime))
}
retryCount++
hwlog.RunLog.Warnf("deviceManager get card list failed (attempt %d), cardNum=%d, cardList=%v, "+
"err: %v", retryCount, cardNum, cardList, err)
// Another attempt still fits in the timeout: shut down, wait, retry.
if currentTime+int(retryDelay) <= resetTimeout {
if err = d.ShutDown(); err != nil {
hwlog.RunLog.Errorf("deviceManager shut down failed, err: %v", err)
return
}
time.Sleep(retryDelay * time.Second)
continue
}
// Out of time: leave the driver initialized and only report a mismatch.
if int(cardNum) != len(cardList) {
hwlog.RunLog.Warnf("deviceManager get cardList is %v, but cardNum is %v, "+
"please check whether the real number of npu matches the cardList", cardList, cardNum)
}
}
})
}
// GetProductTypeArray returns the manager's stored product type list.
func (d *DeviceManager) GetProductTypeArray() []string {
	types := d.ProductTypes
	return types
}
// GetDevType returns the manager's stored device type string.
func (d *DeviceManager) GetDevType() string {
	devType := d.DevType
	return devType
}
// GetValidChipInfo walks every card and device and returns the first chip
// info that common.IsValidChipInfo accepts.
//
// Cards whose device count cannot be read (or is zero) and devices whose chip
// info cannot be read or is invalid are skipped with a debug log. An error is
// returned when the card list cannot be read, when no card exists, or when no
// device yields valid chip info.
func (d *DeviceManager) GetValidChipInfo() (common.ChipInfo, error) {
	cardNum, cardList, err := d.DcMgr.DcGetCardList()
	if err != nil {
		hwlog.RunLog.Error(err)
		// The message is a fixed text, not a format string.
		return common.ChipInfo{}, errors.New(common.ErrMsgInitCardListFailed)
	}
	if cardNum == 0 {
		return common.ChipInfo{}, errors.New("get chip info failed, no card found")
	}
	for _, cardID := range cardList {
		devNum, err := d.DcMgr.DcGetDeviceNumInCard(cardID)
		if err != nil || devNum == 0 {
			hwlog.RunLog.Debugf("get device num by cardID(%d) failed, error: %v", cardID, err)
			continue
		}
		for devID := int32(0); devID < devNum; devID++ {
			chipInfo, err := d.DcMgr.DcGetChipInfo(cardID, devID)
			if err != nil {
				hwlog.RunLog.Debugf("get chip info failed by cardID(%d), deviceID(%d), error: %v", cardID, devID,
					err)
				continue
			}
			if !common.IsValidChipInfo(chipInfo) {
				// err is always nil on this path, so it is no longer printed.
				hwlog.RunLog.Debugf("invalid chip info by cardID(%d), deviceID(%d)", cardID, devID)
				continue
			}
			return *chipInfo, nil
		}
	}
	return common.ChipInfo{}, errors.New("cannot get valid chip info")
}
// GetValidMainBoardInfo walks every card and device and returns the first
// main board ID that common.IsValidMainBoardInfo accepts.
//
// Cards whose device count cannot be read (or is zero) and devices whose main
// board ID cannot be read or is invalid are skipped. An error is returned when
// the card list cannot be read, when no card exists, or when no device yields
// a valid ID.
func (d *DeviceManager) GetValidMainBoardInfo() (uint32, error) {
	cardNum, cardList, err := d.DcMgr.DcGetCardList()
	if err != nil {
		hwlog.RunLog.Error(err)
		// The messages are fixed texts, not format strings.
		return 0, errors.New(common.ErrMsgInitCardListFailed)
	}
	if cardNum == 0 {
		return 0, errors.New(common.ErrMsgGetBoardInfoFailed)
	}
	for _, cardID := range cardList {
		devNum, err := d.DcMgr.DcGetDeviceNumInCard(cardID)
		if err != nil || devNum == 0 {
			hwlog.RunLog.Debugf("get device num by cardID %d failed, error is: %v", cardID, err)
			continue
		}
		for devID := int32(0); devID < devNum; devID++ {
			mainBoardId, err := d.DcMgr.DcGetDeviceMainBoardInfo(cardID, devID)
			if err != nil {
				hwlog.RunLog.Debug(err)
				continue
			}
			if !common.IsValidMainBoardInfo(mainBoardId) {
				// err is always nil on this path; report the rejected ID instead.
				hwlog.RunLog.Warnf("invalid mainBoardId info by cardID(%d), deviceID(%d), mainBoardId: %d",
					cardID, devID, mainBoardId)
				continue
			}
			return mainBoardId, nil
		}
	}
	return 0, errors.New("cannot get main board id")
}
// GetValidBoardInfo walks every card and device and returns the first board
// info that common.IsValidBoardInfo accepts.
//
// Cards whose device count cannot be read (or is zero) and devices whose board
// info cannot be read or is invalid are skipped with a debug log. An error is
// returned when the card list cannot be read, when no card exists, or when no
// device yields valid board info.
func (d *DeviceManager) GetValidBoardInfo() (common.BoardInfo, error) {
	cardNum, cardList, err := d.DcMgr.DcGetCardList()
	if err != nil {
		hwlog.RunLog.Error(err)
		// The messages are fixed texts, not format strings.
		return common.BoardInfo{}, errors.New(common.ErrMsgInitCardListFailed)
	}
	if cardNum == 0 {
		return common.BoardInfo{}, errors.New(common.ErrMsgGetBoardInfoFailed)
	}
	for _, cardID := range cardList {
		devNum, err := d.DcMgr.DcGetDeviceNumInCard(cardID)
		if err != nil || devNum == 0 {
			hwlog.RunLog.Debugf("get device num by cardID %d failed, error is: %v", cardID, err)
			continue
		}
		for devID := int32(0); devID < devNum; devID++ {
			boardInfo, err := d.DcMgr.DcGetDeviceBoardInfo(cardID, devID)
			if err != nil {
				hwlog.RunLog.Debugf("get board info failed by cardID(%d), deviceID(%d), error: %v", cardID, devID,
					err)
				continue
			}
			if !common.IsValidBoardInfo(&boardInfo) {
				// err is always nil on this path, so it is no longer printed.
				hwlog.RunLog.Debugf("invalid board info by cardID(%d), deviceID(%d)", cardID, devID)
				continue
			}
			return boardInfo, nil
		}
	}
	return common.BoardInfo{}, errors.New("cannot get valid board info")
}
// Init initializes the driver by delegating to DcMgr.DcInit.
func (d *DeviceManager) Init() error {
	initErr := d.DcMgr.DcInit()
	return initErr
}
// ShutDown shuts the driver down by delegating to DcMgr.DcShutDown.
func (d *DeviceManager) ShutDown() error {
	shutdownErr := d.DcMgr.DcShutDown()
	return shutdownErr
}
// GetAllDeviceCount returns the result of DcMgr.DcGetAllDeviceCount.
func (d *DeviceManager) GetAllDeviceCount() (int32, error) {
	count, err := d.DcMgr.DcGetAllDeviceCount()
	return count, err
}
// GetCardList returns the card count and card ID list reported by
// DcMgr.DcGetCardList.
func (d *DeviceManager) GetCardList() (int32, []int32, error) {
	cardCount, cards, err := d.DcMgr.DcGetCardList()
	return cardCount, cards, err
}
// GetDeviceNumInCard returns the device count that DcMgr.DcGetDeviceNumInCard
// reports for cardID.
func (d *DeviceManager) GetDeviceNumInCard(cardID int32) (int32, error) {
	devCount, err := d.DcMgr.DcGetDeviceNumInCard(cardID)
	return devCount, err
}
// GetDeviceList returns the count and list of logic IDs reported by
// DcMgr.DcGetLogicIDList.
func (d *DeviceManager) GetDeviceList() (int32, []int32, error) {
	idCount, logicIDs, err := d.DcMgr.DcGetLogicIDList()
	return idCount, logicIDs, err
}
// GetDeviceHealth returns the health code of the chip identified by logicID.
// On failure it logs the underlying error and returns common.UnRetError with
// an error: a generic one when the ID lookup fails, the driver's own error
// when the health query fails.
func (d *DeviceManager) GetDeviceHealth(logicID int32) (uint32, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return common.UnRetError, fmt.Errorf("failed to get health code by logicID(%d)", logicID)
	}
	code, healthErr := d.DcMgr.DcGetDeviceHealth(card, chip)
	if healthErr != nil {
		hwlog.RunLog.Error(healthErr)
		return common.UnRetError, healthErr
	}
	return uint32(code), nil
}
// GetDeviceNetWorkHealth returns the network health code of the chip
// identified by logicID. On failure it logs the underlying error and returns
// common.UnRetError with an error: a generic one when the ID lookup fails, the
// driver's own error when the query fails.
func (d *DeviceManager) GetDeviceNetWorkHealth(logicID int32) (uint32, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return common.UnRetError, fmt.Errorf("failed to get network health code by logicID(%d)", logicID)
	}
	code, healthErr := d.DcMgr.DcGetDeviceNetWorkHealth(card, chip)
	if healthErr != nil {
		hwlog.RunLog.Error(healthErr)
		return common.UnRetError, healthErr
	}
	return code, nil
}
// GetDeviceUtilizationRate returns the utilization rate of deviceType on the
// chip identified by logicID.
//
// Device types previously marked unsupported are rejected immediately. When
// the driver error text contains common.NotSupportErrorCode, the device type
// is marked unsupported so later calls skip the driver query.
func (d *DeviceManager) GetDeviceUtilizationRate(logicID int32, deviceType common.DeviceType) (uint32, error) {
	if d.unsupportedDeviceTypeCache.isUnsupported(deviceType.Code) {
		return common.UnRetError, fmt.Errorf("device type %s is not supported (cached)", deviceType.Name)
	}
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return common.UnRetError, fmt.Errorf("failed to get utilization by logicID(%d)", logicID)
	}
	rate, rateErr := d.DcMgr.DcGetDeviceUtilizationRate(card, chip, deviceType)
	if rateErr == nil {
		return uint32(rate), nil
	}
	if strings.Contains(rateErr.Error(), common.NotSupportErrorCode) {
		d.unsupportedDeviceTypeCache.markAsUnsupported(deviceType.Code)
	}
	return common.UnRetError, rateErr
}
// GetDeviceUtilizationRateV2 returns the multi-utilization info obtained from
// DcMgr.DcGetDeviceUtilizationRateV2 for the chip identified by logicID. On
// any failure the value built by dcmi.BuildErrNpuMultiUtilizationInfo is
// returned together with the error.
func (d *DeviceManager) GetDeviceUtilizationRateV2(logicID int32) (common.DcmiMultiUtilizationInfo, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return dcmi.BuildErrNpuMultiUtilizationInfo(),
			fmt.Errorf("failed to get cardId and deviceId by logicID(%d)", logicID)
	}
	info, queryErr := d.DcMgr.DcGetDeviceUtilizationRateV2(card, chip)
	if queryErr != nil {
		return dcmi.BuildErrNpuMultiUtilizationInfo(), queryErr
	}
	return info, nil
}
// GetDeviceUtilizationRateV2Period returns the multi-utilization info obtained
// from DcMgr.DcGetDeviceUtilizationRateV2Period for the chip identified by
// logicID. On any failure the value built by
// dcmi.BuildErrNpuMultiUtilizationInfo is returned together with the error.
func (d *DeviceManager) GetDeviceUtilizationRateV2Period(logicID int32) (common.DcmiMultiUtilizationInfo, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return dcmi.BuildErrNpuMultiUtilizationInfo(),
			fmt.Errorf("failed to get cardId and deviceId by logicID(%d)", logicID)
	}
	info, queryErr := d.DcMgr.DcGetDeviceUtilizationRateV2Period(card, chip)
	if queryErr != nil {
		return dcmi.BuildErrNpuMultiUtilizationInfo(), queryErr
	}
	return info, nil
}
// GetDeviceUtilizationRateCommon queries utilization through the cached query
// function when one exists; otherwise it selects one via
// determineAndCacheUtilizationFunc.
func (d *DeviceManager) GetDeviceUtilizationRateCommon(logicID int32) (common.DcmiMultiUtilizationInfo, error) {
	cached := d.utilizationFuncCache.get()
	if cached == nil {
		return d.determineAndCacheUtilizationFunc(logicID)
	}
	return cached(logicID)
}
// determineAndCacheUtilizationFunc hands three candidate query functions
// (V2Period, V2, V1, in that order) to determineUtilizationFunc, stores the
// function it returns in the cache, and returns the accompanying result and
// error.
func (d *DeviceManager) determineAndCacheUtilizationFunc(logicID int32) (common.DcmiMultiUtilizationInfo, error) {
	candidates := []utilizationCandidate{
		{fn: d.GetDeviceUtilizationRateV2Period, dcmiApiName: "dcmi_get_device_multi_utilization_rate_period"},
		{fn: d.GetDeviceUtilizationRateV2, dcmiApiName: "dcmi_get_device_multi_utilization_rate"},
		{fn: d.getDeviceUtilizationRateV1, dcmiApiName: "dcmi_get_device_utilization_rate"},
	}
	chosen, info, err := determineUtilizationFunc(logicID, candidates)
	d.utilizationFuncCache.set(chosen)
	return info, err
}
// getDeviceUtilizationRateV1 calls getDeviceUtilizationRateV1Common with
// d.GetDeviceUtilizationRate as the per-device-type query function.
func (d *DeviceManager) getDeviceUtilizationRateV1(logicID int32) (common.DcmiMultiUtilizationInfo, error) {
	info, err := getDeviceUtilizationRateV1Common(logicID, d.GetDeviceUtilizationRate)
	return info, err
}
// GetDeviceTemperature returns the temperature of the chip identified by
// logicID. On failure the underlying error is logged and common.RetError is
// returned with a generic error.
func (d *DeviceManager) GetDeviceTemperature(logicID int32) (int32, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return common.RetError, fmt.Errorf("failed to get temperature by logicID(%d)", logicID)
	}
	temperature, readErr := d.DcMgr.DcGetDeviceTemperature(card, chip)
	if readErr != nil {
		hwlog.RunLog.Error(readErr)
		return common.RetError, fmt.Errorf("failed to get temperature by logicID(%d)", logicID)
	}
	return temperature, nil
}
// GetDeviceVoltage returns the voltage of the chip identified by logicID. On
// failure the underlying error is logged and common.UnRetError is returned
// with a generic error.
func (d *DeviceManager) GetDeviceVoltage(logicID int32) (float32, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return common.UnRetError, fmt.Errorf("failed to get voltage by logicID(%d)", logicID)
	}
	volts, readErr := d.DcMgr.DcGetDeviceVoltage(card, chip)
	if readErr != nil {
		hwlog.RunLog.Error(readErr)
		return common.UnRetError, fmt.Errorf("failed to get voltage by logicID(%d)", logicID)
	}
	return volts, nil
}
// GetDevicePowerInfo returns the power reading of the chip identified by
// logicID. On failure the underlying error is logged and common.UnRetError is
// returned with a generic error.
func (d *DeviceManager) GetDevicePowerInfo(logicID int32) (float32, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return common.UnRetError, fmt.Errorf("failed to get power by logicID(%d)", logicID)
	}
	watts, readErr := d.DcMgr.DcGetDevicePowerInfo(card, chip)
	if readErr != nil {
		hwlog.RunLog.Error(readErr)
		return common.UnRetError, fmt.Errorf("failed to get power by logicID(%d)", logicID)
	}
	return watts, nil
}
// GetDeviceFrequency returns the frequency of deviceType on the chip
// identified by logicID. On failure the underlying error is logged and
// common.UnRetError is returned with a generic error.
func (d *DeviceManager) GetDeviceFrequency(logicID int32, deviceType common.DeviceType) (uint32, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return common.UnRetError, fmt.Errorf("failed to get frequency by logicID(%d)", logicID)
	}
	freq, readErr := d.DcMgr.DcGetDeviceFrequency(card, chip, deviceType)
	if readErr != nil {
		hwlog.RunLog.Error(readErr)
		return common.UnRetError, fmt.Errorf("failed to get frequency by logicID(%d)", logicID)
	}
	return freq, nil
}
// GetDeviceMemoryInfo returns the memory info of the chip identified by
// logicID.
//
// For device types Ascend910B, Ascend910A3 and Ascend910A5 it returns
// (nil, nil) without querying the driver. The logicID is resolved first, so
// an unknown logicID still yields an error on those device types.
func (d *DeviceManager) GetDeviceMemoryInfo(logicID int32) (*common.MemoryInfo, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return nil, fmt.Errorf("failed to get memory info by logicID(%d)", logicID)
	}
	withoutDDR := d.DevType == api.Ascend910B || d.DevType == api.Ascend910A3 || d.DevType == api.Ascend910A5
	if withoutDDR {
		hwlog.RunLog.Debugf("%s doesn't have DDR module. Therefore, DDR information cannot be queried",
			utils.MaskDevType(d.DevType))
		return nil, nil
	}
	info, readErr := d.DcMgr.DcGetMemoryInfo(card, chip)
	if readErr != nil {
		hwlog.RunLog.Error(readErr)
		return nil, fmt.Errorf("failed to get memory info by logicID(%d)", logicID)
	}
	return info, nil
}
// GetDeviceHbmInfo returns the HBM info of the chip identified by logicID. A
// failed ID lookup is logged and reported as a generic error; a driver error
// is returned unchanged and not logged here.
func (d *DeviceManager) GetDeviceHbmInfo(logicID int32) (*common.HbmInfo, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return nil, fmt.Errorf("failed to get hbm info by logicID(%d)", logicID)
	}
	info, readErr := d.DcMgr.DcGetHbmInfo(card, chip)
	if readErr != nil {
		return nil, readErr
	}
	return info, nil
}
// GetDeviceErrorCode returns the error count and error code reported by
// DcMgr.DcGetDeviceErrorCode for the chip identified by logicID. On failure
// the underlying error is logged and common.RetError is returned for both
// values with a generic error.
func (d *DeviceManager) GetDeviceErrorCode(logicID int32) (int32, int64, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return common.RetError, common.RetError, fmt.Errorf("failed to get device error code by logicID(%d)",
			logicID)
	}
	count, code, readErr := d.DcMgr.DcGetDeviceErrorCode(card, chip)
	if readErr != nil {
		hwlog.RunLog.Error(readErr)
		return common.RetError, common.RetError, fmt.Errorf("failed to get device error code by logicID(%d)",
			logicID)
	}
	return count, code, nil
}
// GetChipInfo returns the chip info of the chip identified by logicID. Both
// failure paths log the underlying error; the ID-lookup error text is also
// included in the returned error.
func (d *DeviceManager) GetChipInfo(logicID int32) (*common.ChipInfo, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return nil, fmt.Errorf("failed to get cardID and deviceID by logicID(%d), error: %v", logicID, idErr)
	}
	info, readErr := d.DcMgr.DcGetChipInfo(card, chip)
	if readErr != nil {
		hwlog.RunLog.Error(readErr)
		return nil, fmt.Errorf("failed to get chip info code by logicID(%d)", logicID)
	}
	return info, nil
}
// GetPhysicIDFromLogicID translates logicID to a physic ID via the driver. On
// failure the driver error is logged and common.RetError is returned with a
// generic error.
func (d *DeviceManager) GetPhysicIDFromLogicID(logicID int32) (int32, error) {
	phyID, convErr := d.DcMgr.DcGetPhysicIDFromLogicID(logicID)
	if convErr == nil {
		return phyID, nil
	}
	hwlog.RunLog.Error(convErr)
	return common.RetError, fmt.Errorf("failed to get physicID by logicID(%d)", logicID)
}
// GetLogicIDFromPhysicID translates physicID to a logic ID via the driver. On
// failure the driver error is logged and common.RetError is returned with a
// generic error.
func (d *DeviceManager) GetLogicIDFromPhysicID(physicID int32) (int32, error) {
	logID, convErr := d.DcMgr.DcGetLogicIDFromPhysicID(physicID)
	if convErr == nil {
		return logID, nil
	}
	hwlog.RunLog.Error(convErr)
	return common.RetError, fmt.Errorf("failed to get logicID by physicID(%d)", physicID)
}
// GetDeviceLogicID returns the logic ID that DcMgr.DcGetDeviceLogicID reports
// for the (cardID, deviceID) pair.
func (d *DeviceManager) GetDeviceLogicID(cardID, deviceID int32) (int32, error) {
	logID, err := d.DcMgr.DcGetDeviceLogicID(cardID, deviceID)
	return logID, err
}
// GetDeviceIPAddress returns the IP address of the given ipType for the chip
// identified by logicID. A failed ID lookup is wrapped (with %w) into the
// returned error.
func (d *DeviceManager) GetDeviceIPAddress(logicID, ipType int32) (string, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr == nil {
		return d.DcMgr.DcGetDeviceIPAddress(card, chip, ipType)
	}
	return "", fmt.Errorf("failed to get cardID and deviceID by logicID(%d), %w", logicID, idErr)
}
// CreateVirtualDevice creates a virtual device on logicID through the driver
// after checking vDevInfo.TemplateName against the manager's device type with
// common.IsValidTemplateName. An invalid template name yields an error and an
// empty result without calling the driver.
func (d *DeviceManager) CreateVirtualDevice(
	logicID int32, vDevInfo common.CgoCreateVDevRes) (common.CgoCreateVDevOut, error) {
	if common.IsValidTemplateName(d.DevType, vDevInfo.TemplateName) {
		return d.DcMgr.DcCreateVDevice(logicID, vDevInfo)
	}
	return common.CgoCreateVDevOut{}, fmt.Errorf("input invalid template name: %s", vDevInfo.TemplateName)
}
// GetVirtualDeviceInfo returns the virtual device info reported by the driver
// for logicID. The result is rejected with an error when any listed virtual
// device has a template name that common.IsValidTemplateName does not accept
// for the manager's device type.
func (d *DeviceManager) GetVirtualDeviceInfo(logicID int32) (common.VirtualDevInfo, error) {
	info, queryErr := d.DcMgr.DcGetVDeviceInfo(logicID)
	if queryErr != nil {
		hwlog.RunLog.Debug(queryErr)
		return common.VirtualDevInfo{}, fmt.Errorf("get virtual device info failed, error is: %v "+
			"and vdev num is: %d", queryErr, int32(info.TotalResource.VDevNum))
	}
	for i := range info.VDevInfo {
		entry := info.VDevInfo[i]
		if common.IsValidTemplateName(d.DevType, entry.QueryInfo.Name) {
			continue
		}
		return common.VirtualDevInfo{}, fmt.Errorf("vdevice id %d, it's template name is invalid: %s",
			entry.VDevID, entry.QueryInfo.Name)
	}
	return info, nil
}
// DestroyVirtualDevice destroys virtual device vDevID on logicID by delegating
// to DcMgr.DcDestroyVDevice.
func (d *DeviceManager) DestroyVirtualDevice(logicID int32, vDevID uint32) error {
	destroyErr := d.DcMgr.DcDestroyVDevice(logicID, vDevID)
	return destroyErr
}
// GetMcuPowerInfo returns the value that DcMgr.DcGetMcuPowerInfo reports for
// cardID.
func (d *DeviceManager) GetMcuPowerInfo(cardID int32) (float32, error) {
	power, err := d.DcMgr.DcGetMcuPowerInfo(cardID)
	return power, err
}
// GetCardIDDeviceID is the exported form of getCardIdAndDeviceId: it returns
// the card ID and device ID for logicID.
func (d *DeviceManager) GetCardIDDeviceID(logicID int32) (int32, int32, error) {
	card, chip, err := d.getCardIdAndDeviceId(logicID)
	return card, chip, err
}
// GetProductType returns the product type of the chip identified by logicID.
// The logicID is first checked with common.IsValidLogicIDOrPhyID, then
// resolved to a card/device pair before the driver is queried.
func (d *DeviceManager) GetProductType(logicID int32) (string, error) {
	if valid := common.IsValidLogicIDOrPhyID(logicID); !valid {
		return "", fmt.Errorf("input invalid logicID: %d", logicID)
	}
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr == nil {
		return d.DcMgr.DcGetProductType(card, chip)
	}
	return "", fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+
		"when get product type, error: %v", logicID, idErr)
}
// GetAllProductType returns the manager's stored product type list. A nil
// receiver yields an empty slice and an error.
func (d *DeviceManager) GetAllProductType() ([]string, error) {
	if d != nil {
		return d.ProductTypes, nil
	}
	return []string{}, errors.New("nil DeviceManager")
}
// GetNpuWorkMode returns common.AMPMode or common.SMPMode.
//
// Device types Ascend910B, Ascend910A3 and Ascend910A5 always report AMP.
// Otherwise the work mode of the first card in the card list is queried: mode
// 0 maps to AMP, anything else to SMP. An empty string is returned when the
// card list is empty or a driver call fails.
func (d *DeviceManager) GetNpuWorkMode() string {
	ampOnly := d.DevType == api.Ascend910B || d.DevType == api.Ascend910A3 || d.DevType == api.Ascend910A5
	if ampOnly {
		hwlog.RunLog.Warnf("only AMP mode is available on %s", utils.MaskDevType(d.DevType))
		return common.AMPMode
	}
	_, cards, listErr := d.DcMgr.DcGetCardList()
	if listErr != nil {
		hwlog.RunLog.Error(listErr)
		return ""
	}
	if len(cards) == 0 {
		return ""
	}
	mode, modeErr := d.DcMgr.DcGetNpuWorkMode(cards[0])
	if modeErr != nil {
		hwlog.RunLog.Error(modeErr)
		return ""
	}
	if mode == 0 {
		return common.AMPMode
	}
	return common.SMPMode
}
// SetDeviceReset resets the chip identified by logicID through the driver.
// When the package-level isContainAtlas300IDuo flag is set, the reset is
// always issued with device ID 0 of the resolved card.
func (d *DeviceManager) SetDeviceReset(logicID int32) error {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return fmt.Errorf("failed to get cardID in set device reset by logicID(%d)", logicID)
	}
	if isContainAtlas300IDuo {
		hwlog.RunLog.Infof("isContainAtlas300IDuo is true, cardID(%d) and deviceID(%d)", card, chip)
		chip = 0
	}
	resetErr := d.DcMgr.DcSetDeviceReset(card, chip)
	return resetErr
}
// GetBrotherCardID returns the value reported by DcMgr.DcGetBrotherCardID for
// the chip identified by logicID. A failed ID lookup is logged and reported
// as common.RetError with a generic error.
func (d *DeviceManager) GetBrotherCardID(logicID int32) (int32, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr == nil {
		return d.DcMgr.DcGetBrotherCardID(card, chip)
	}
	hwlog.RunLog.Error(idErr)
	return common.RetError, fmt.Errorf("failed to get cardID in get brother card id by logicID(%d)",
		logicID)
}
// GetOutBandChannelState returns the result of
// DcMgr.DcGetOutBandChannelState for the chip identified by logicID. A failed
// ID lookup is logged and reported as a generic error.
func (d *DeviceManager) GetOutBandChannelState(logicID int32) error {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr == nil {
		return d.DcMgr.DcGetOutBandChannelState(card, chip)
	}
	hwlog.RunLog.Error(idErr)
	return fmt.Errorf("failed to get cardID in get out band channel state by logicID(%d)", logicID)
}
// PreResetSoc calls DcMgr.DcPreResetSoc for the chip identified by logicID. A
// failed ID lookup is logged and reported as a generic error.
func (d *DeviceManager) PreResetSoc(logicID int32) error {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr == nil {
		return d.DcMgr.DcPreResetSoc(card, chip)
	}
	hwlog.RunLog.Error(idErr)
	return fmt.Errorf("failed to get cardID in pre reset soc by logicID(%d)", logicID)
}
// SetDeviceResetOutBand calls DcMgr.DcSetDeviceResetOutBand for the chip
// identified by logicID. A failed ID lookup is logged and reported as a
// generic error.
func (d *DeviceManager) SetDeviceResetOutBand(logicID int32) error {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr == nil {
		return d.DcMgr.DcSetDeviceResetOutBand(card, chip)
	}
	hwlog.RunLog.Error(idErr)
	return fmt.Errorf("failed to get cardID in set device reset out band by logicID(%d)", logicID)
}
// RescanSoc calls DcMgr.DcRescanSoc for the chip identified by logicID. A
// failed ID lookup is logged and reported as a generic error.
func (d *DeviceManager) RescanSoc(logicID int32) error {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr == nil {
		return d.DcMgr.DcRescanSoc(card, chip)
	}
	hwlog.RunLog.Error(idErr)
	return fmt.Errorf("failed to get cardID in rescan soc by logicID(%d)", logicID)
}
// GetDeviceBootStatus returns the value that DcMgr.DcGetDeviceBootStatus
// reports for logicID.
func (d *DeviceManager) GetDeviceBootStatus(logicID int32) (int, error) {
	status, err := d.DcMgr.DcGetDeviceBootStatus(logicID)
	return status, err
}
// GetDeviceAllErrorCode returns the error count and error code list reported
// by DcMgr.DcGetDeviceAllErrorCode for the chip identified by logicID. On
// failure the underlying error is logged and (common.RetError, nil) is
// returned with a generic error.
func (d *DeviceManager) GetDeviceAllErrorCode(logicID int32) (int32, []int64, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return common.RetError, nil, fmt.Errorf("failed to get cardID in get device error code by logicID(%d)",
			logicID)
	}
	count, codes, readErr := d.DcMgr.DcGetDeviceAllErrorCode(card, chip)
	if readErr != nil {
		hwlog.RunLog.Error(readErr)
		return common.RetError, nil, fmt.Errorf("failed to get device error code by logicID(%d)", logicID)
	}
	return count, codes, nil
}
// GetDeviceAllErrorCodeWithTimeOut is the timeout-taking variant of
// GetDeviceAllErrorCode: it forwards timeout to
// DcMgr.DcGetDeviceAllErrorCodeWithTimeout. On failure the underlying error is
// logged and (common.RetError, nil) is returned with a generic error.
func (d *DeviceManager) GetDeviceAllErrorCodeWithTimeOut(logicID int32, timeout time.Duration) (int32, []int64, error) {
	card, chip, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return common.RetError, nil, fmt.Errorf(
			"failed to get cardID and deviceID in get device error code by logicID(%d)", logicID)
	}
	count, codes, readErr := d.DcMgr.DcGetDeviceAllErrorCodeWithTimeout(card, chip, timeout)
	if readErr != nil {
		hwlog.RunLog.Error(readErr)
		return common.RetError, nil, fmt.Errorf("failed to get device error code by logicID(%d)", logicID)
	}
	return count, codes, nil
}
// SubscribeDeviceFaultEvent subscribes to fault events through the driver.
// When logicID equals common.SubscribeAllDevice, that same value is passed as
// both card ID and device ID; otherwise logicID is resolved to its card/device
// pair first. Failures are logged and reported as generic errors.
func (d *DeviceManager) SubscribeDeviceFaultEvent(logicID int32) error {
	card, chip := int32(common.SubscribeAllDevice), int32(common.SubscribeAllDevice)
	if logicID != common.SubscribeAllDevice {
		var idErr error
		card, chip, idErr = d.getCardIdAndDeviceId(logicID)
		if idErr != nil {
			hwlog.RunLog.Error(idErr)
			return fmt.Errorf("failed to get cardID in subscribe device error code by logicID(%d)", logicID)
		}
	}
	subErr := d.DcMgr.DcSubscribeDeviceFaultEvent(card, chip)
	if subErr != nil {
		hwlog.RunLog.Error(subErr)
		return fmt.Errorf("failed to subscribe device error code by logicID(%d)", logicID)
	}
	return nil
}
// SetFaultEventCallFunc registers businessFunc with the driver as the fault
// event callback. A nil function is rejected with an error and nothing is
// registered.
func (d *DeviceManager) SetFaultEventCallFunc(businessFunc func(common.DevFaultInfo)) error {
	if businessFunc != nil {
		d.DcMgr.DcSetFaultEventCallFunc(businessFunc)
		return nil
	}
	return errors.New("business func can't be nil")
}
// GetDieID returns the die ID of kind dcmiDieType for the chip identified by
// logicID. A failed ID lookup is logged and reported as a generic error.
//
// The returned error text previously said "get device error code", copied
// from another method; it now names this operation.
func (d *DeviceManager) GetDieID(logicID int32, dcmiDieType dcmi.DieType) (string, error) {
	cardID, deviceID, err := d.getCardIdAndDeviceId(logicID)
	if err != nil {
		hwlog.RunLog.Error(err)
		return "", fmt.Errorf("failed to get cardID in get die id by logicID(%d)", logicID)
	}
	return d.DcMgr.DcGetDieID(cardID, deviceID, dcmiDieType)
}
// GetDevProcessInfo returns the process info reported by
// DcMgr.DcGetDevProcessInfo for the chip identified by logicID. A failed ID
// lookup is logged and reported as a generic error.
//
// The returned error text previously said "get device error code", copied
// from another method; it now names this operation.
func (d *DeviceManager) GetDevProcessInfo(logicID int32) (*common.DevProcessInfo, error) {
	cardID, deviceID, err := d.getCardIdAndDeviceId(logicID)
	if err != nil {
		hwlog.RunLog.Error(err)
		return nil, fmt.Errorf("failed to get cardID in get device process info by logicID(%d)", logicID)
	}
	return d.DcMgr.DcGetDevProcessInfo(cardID, deviceID)
}
// GetPCIeBusInfo returns the PCIe bus info string reported by
// DcMgr.DcGetPCIeBusInfo for the chip identified by logicID. A failed ID
// lookup is logged and reported as a generic error.
//
// The returned error text previously said "get device error code", copied
// from another method; it now names this operation.
func (d *DeviceManager) GetPCIeBusInfo(logicID int32) (string, error) {
	cardID, deviceID, err := d.getCardIdAndDeviceId(logicID)
	if err != nil {
		hwlog.RunLog.Error(err)
		return "", fmt.Errorf("failed to get cardID in get pcie bus info by logicID(%d)", logicID)
	}
	return d.DcMgr.DcGetPCIeBusInfo(cardID, deviceID)
}
// GetBoardInfo returns the board info reported by DcMgr.DcGetDeviceBoardInfo
// for the chip identified by logicID. A failed ID lookup is logged and
// reported as a generic error with an empty BoardInfo.
//
// The returned error text previously said "get device error code", copied
// from another method; it now names this operation.
func (d *DeviceManager) GetBoardInfo(logicID int32) (common.BoardInfo, error) {
	cardID, deviceID, err := d.getCardIdAndDeviceId(logicID)
	if err != nil {
		hwlog.RunLog.Error(err)
		return common.BoardInfo{}, fmt.Errorf("failed to get cardID in get board info by logicID(%d)", logicID)
	}
	return d.DcMgr.DcGetDeviceBoardInfo(cardID, deviceID)
}
// GetCardElabelV2 returns the result of DcMgr.DcGetCardElabelV2 for the given cardID.
// The cardID is passed through as-is; no logicID translation is performed here.
func (d *DeviceManager) GetCardElabelV2(cardID int32) (common.ElabelInfo, error) {
return d.DcMgr.DcGetCardElabelV2(cardID)
}
// GetPCIEBandwidth resolves logicID to its cardID/deviceID pair and returns the PCIe bandwidth
// statistics reported by DcMgr.DcGetPCIEBandwidth for the given profilingTime.
//
// A zero-value PCIEBwStat is returned alongside any error. An ID resolution failure is logged
// and replaced by a new error; a query failure is returned unchanged.
func (d *DeviceManager) GetPCIEBandwidth(logicID int32, profilingTime int) (common.PCIEBwStat, error) {
	cardID, deviceID, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		hwlog.RunLog.Error(idErr)
		return common.PCIEBwStat{}, fmt.Errorf("get cardID(deviceID) failed, error by logicID(%d)", logicID)
	}
	bwStat, bwErr := d.DcMgr.DcGetPCIEBandwidth(cardID, deviceID, profilingTime)
	if bwErr == nil {
		return bwStat, nil
	}
	return common.PCIEBwStat{}, bwErr
}
// SetIsTrainingCard decides whether the managed devices are training cards and stores the
// answer in d.isTrainingCard (read back through IsTrainingCard).
//
// Decision rules:
//   - a device type prefixed with api.Ascend310 is not a training card;
//   - an api.Ascend910B device whose board ID equals common.A300IA2BoardId,
//     common.A300IA2GB64BoardId or common.Atlas200LA2ZQBoardId is not a training card;
//   - everything else is a training card.
//
// It returns the error from GetCardList, or an explicit error when the card list is empty
// (this case used to be logged as a failure while nil was returned). Failures to read
// device counts or board info are only logged; if no board info can be read at all, the
// board ID check is made against a zero-value BoardInfo.
func (d *DeviceManager) SetIsTrainingCard() error {
	devType := d.GetDevType()
	if strings.HasPrefix(devType, api.Ascend310) {
		d.isTrainingCard = false
		return nil
	}
	cardNum, cardList, err := d.GetCardList()
	if err != nil {
		hwlog.RunLog.Errorf("failed to get card list when set 'IsTrainingCard' err: %v", err)
		return err
	}
	if cardNum == 0 {
		// Report the empty card list to the caller instead of returning a nil error.
		err = errors.New("no card found when set 'IsTrainingCard'")
		hwlog.RunLog.Errorf("failed to get card list when set 'IsTrainingCard' err: %v", err)
		return err
	}
	boardInfo := d.firstReadableBoardInfo(cardList)
	isInferBoard := boardInfo.BoardId == common.A300IA2BoardId ||
		boardInfo.BoardId == common.A300IA2GB64BoardId ||
		boardInfo.BoardId == common.Atlas200LA2ZQBoardId
	d.isTrainingCard = !(devType == api.Ascend910B && isInferBoard)
	return nil
}

// firstReadableBoardInfo walks cardList in order and returns the board info of the first
// device whose board info can be read. Cards or devices that fail are logged and skipped.
// A zero-value BoardInfo is returned when nothing could be read.
func (d *DeviceManager) firstReadableBoardInfo(cardList []int32) common.BoardInfo {
	for _, cardID := range cardList {
		devNum, numErr := d.GetDeviceNumInCard(cardID)
		if numErr != nil {
			hwlog.RunLog.Warnf("get device num by cardID(%d) failed when set 'IsTrainingCard', error: %v",
				cardID, numErr)
			continue
		}
		if devNum == 0 {
			hwlog.RunLog.Warnf("not found device on card %d when set 'IsTrainingCard'", cardID)
			continue
		}
		for devID := int32(0); devID < devNum; devID++ {
			boardInfo, infoErr := d.DcMgr.DcGetDeviceBoardInfo(cardID, devID)
			if infoErr != nil {
				hwlog.RunLog.Warnf("get board info by card %d deviceID %d failed, err: %v", cardID, devID, infoErr)
				continue
			}
			return boardInfo
		}
	}
	return common.BoardInfo{}
}
// IsTrainingCard returns the isTrainingCard flag stored on the manager.
// The flag is written by SetIsTrainingCard; this method performs no detection itself.
func (d *DeviceManager) IsTrainingCard() bool {
return d.isTrainingCard
}
// GetDcmiVersion returns the dcmiVersion string stored on the manager.
func (d *DeviceManager) GetDcmiVersion() string {
return d.dcmiVersion
}
// GetMainBoardId returns the mainBoardId value stored on the manager.
func (d *DeviceManager) GetMainBoardId() uint32 {
return d.mainBoardId
}
// GetDeviceEccInfo resolves logicID to its cardID/deviceID pair and returns the result of
// DcMgr.DcGetDeviceEccInfo for the requested dcmiDeviceType.
//
// If the ID resolution fails, the failure is logged and the resolution error is returned
// unchanged with a nil result.
func (d *DeviceManager) GetDeviceEccInfo(logicID int32, dcmiDeviceType common.DcmiDeviceType) (*common.ECCInfo, error) {
	cardID, deviceID, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr == nil {
		return d.DcMgr.DcGetDeviceEccInfo(cardID, deviceID, dcmiDeviceType)
	}
	hwlog.RunLog.Errorf("get cardID and deviceID by logicID(%d) failed, error: %v", logicID, idErr)
	return nil, idErr
}
// GetSuperPodInfo validates logicID, resolves it to its cardID/deviceID pair and returns the
// super pod info reported by DcMgr.DcGetSuperPodInfo.
//
// Every failure path returns a zero-value CgoSuperPodInfo and a newly formatted error that
// embeds the logicID (and the underlying error text where one exists).
func (d *DeviceManager) GetSuperPodInfo(logicID int32) (common.CgoSuperPodInfo, error) {
	emptyInfo := common.CgoSuperPodInfo{}
	if !common.IsValidLogicIDOrPhyID(logicID) {
		return emptyInfo, fmt.Errorf("input invalid logicID: %d", logicID)
	}
	cardID, deviceID, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		return emptyInfo, fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+
			"when get super pod info, error: %v", logicID, idErr)
	}
	podInfo, queryErr := d.DcMgr.DcGetSuperPodInfo(cardID, deviceID)
	if queryErr != nil {
		return emptyInfo, fmt.Errorf("failed to get super pod info by logicID(%d), error: %v",
			logicID, queryErr)
	}
	return podInfo, nil
}
// GetSioInfo validates logicID, resolves it to its cardID/deviceID pair and returns a pointer
// to the SIO CRC error statistics reported by DcMgr.DcGetSioInfo.
//
// The result is nil whenever an error is returned. Validation and ID resolution failures
// produce a newly formatted error; a query failure is returned unchanged.
func (d *DeviceManager) GetSioInfo(logicID int32) (*common.SioCrcErrStatisticInfo, error) {
	if !common.IsValidLogicIDOrPhyID(logicID) {
		return nil, fmt.Errorf("input invalid logicID when get sio info: %d", logicID)
	}
	cardID, deviceID, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		return nil, fmt.Errorf("failed to get cardID and deviceID by logicID(%d) when get sio info , error: %v",
			logicID, idErr)
	}
	sioInfo, queryErr := d.DcMgr.DcGetSioInfo(cardID, deviceID)
	if queryErr == nil {
		return &sioInfo, nil
	}
	return nil, queryErr
}
// GetHccsStatisticInfo validates logicID, resolves it to its cardID/deviceID pair and returns
// a pointer to the statistics reported by DcMgr.DcGetHccsStatisticInfo.
//
// On every failure path the returned pointer is the placeholder built by buildFailedHccsInfo
// (never nil) and the error is non-nil. A query failure is returned unchanged.
func (d *DeviceManager) GetHccsStatisticInfo(logicID int32) (*common.HccsStatisticInfo, error) {
	if !common.IsValidLogicIDOrPhyID(logicID) {
		return buildFailedHccsInfo(), fmt.Errorf("input invalid logicID when get hccs statistic info: %d", logicID)
	}
	cardID, deviceID, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		return buildFailedHccsInfo(), fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+
			"when get hccs statistic info, error: %v", logicID, idErr)
	}
	statistic, queryErr := d.DcMgr.DcGetHccsStatisticInfo(cardID, deviceID)
	if queryErr == nil {
		return &statistic, nil
	}
	return buildFailedHccsInfo(), queryErr
}
// GetHccsStatisticInfoInU64 validates logicID, resolves it to its cardID/deviceID pair and
// returns a pointer to the statistics reported by DcMgr.DcGetHccsStatisticInfoU64.
//
// On every failure path the returned pointer is the placeholder built by buildFailedHccsInfo
// (never nil) and the error is non-nil. A query failure is returned unchanged.
func (d *DeviceManager) GetHccsStatisticInfoInU64(logicID int32) (*common.HccsStatisticInfo, error) {
	if !common.IsValidLogicIDOrPhyID(logicID) {
		return buildFailedHccsInfo(), fmt.Errorf("input invalid logicID when get hccs statistic info: %d", logicID)
	}
	cardID, deviceID, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		return buildFailedHccsInfo(), fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+
			"when get hccs statistic info, error: %v", logicID, idErr)
	}
	statistic, queryErr := d.DcMgr.DcGetHccsStatisticInfoU64(cardID, deviceID)
	if queryErr == nil {
		return &statistic, nil
	}
	return buildFailedHccsInfo(), queryErr
}
// GetHccsBandwidthInfo validates logicID, resolves it to its cardID/deviceID pair and returns
// a pointer to the bandwidth info reported by DcMgr.DcGetHccsBandwidthInfo, which is called
// with common.HccsBWProfilingTime.
//
// On every failure path the returned pointer is the placeholder built by
// buildFailedHccsBWInfo (never nil) and the error is a newly formatted one.
func (d *DeviceManager) GetHccsBandwidthInfo(logicID int32) (*common.HccsBandwidthInfo, error) {
	if !common.IsValidLogicIDOrPhyID(logicID) {
		return buildFailedHccsBWInfo(), fmt.Errorf("input invalid logicID when get hccs bandwidth info: %d", logicID)
	}
	cardID, deviceID, idErr := d.getCardIdAndDeviceId(logicID)
	if idErr != nil {
		return buildFailedHccsBWInfo(), fmt.Errorf("failed to get cardID and deviceID by logicID(%d) "+
			"when get hccs bandwidth info, error: %v", logicID, idErr)
	}
	bandwidth, queryErr := d.DcMgr.DcGetHccsBandwidthInfo(cardID, deviceID, common.HccsBWProfilingTime)
	if queryErr == nil {
		return &bandwidth, nil
	}
	return buildFailedHccsBWInfo(), fmt.Errorf(
		"failed to get hccs bandwidth info by cardId(%d) deviceID(%d), error: %v", cardID, deviceID, queryErr)
}
// buildFailedHccsInfo returns a freshly allocated HccsStatisticInfo whose TxCnt, RxCnt and
// CrcErrCnt slices each hold 8 entries, all set to common.FailedValue. It is the placeholder
// result handed back by the HCCS statistic getters when they fail.
func buildFailedHccsInfo() *common.HccsStatisticInfo {
	const entryCount = 8
	// Each field gets its own backing slice, so callers may modify them independently.
	failedCounters := func() []uint64 {
		counters := make([]uint64, entryCount)
		for idx := range counters {
			counters[idx] = common.FailedValue
		}
		return counters
	}
	return &common.HccsStatisticInfo{
		TxCnt:     failedCounters(),
		RxCnt:     failedCounters(),
		CrcErrCnt: failedCounters(),
	}
}
// buildFailedHccsBWInfo returns a freshly allocated HccsBandwidthInfo used as the placeholder
// result when reading HCCS bandwidth fails: ProfilingTime is common.HccsBWProfilingTime, and
// both totals plus all 8 entries of TxBandwidth and RxBandwidth are common.FailedValue.
func buildFailedHccsBWInfo() *common.HccsBandwidthInfo {
	const entryCount = 8
	// Each field gets its own backing slice, so callers may modify them independently.
	failedBandwidth := func() []float64 {
		values := make([]float64, entryCount)
		for idx := range values {
			values[idx] = common.FailedValue
		}
		return values
	}
	return &common.HccsBandwidthInfo{
		ProfilingTime: uint32(common.HccsBWProfilingTime),
		TotalTxbw:     common.FailedValue,
		TotalRxbw:     common.FailedValue,
		TxBandwidth:   failedBandwidth(),
		RxBandwidth:   failedBandwidth(),
	}
}
// getCardIdAndDeviceId translates logicID into its (cardID, deviceID) pair.
//
// An invalid logicID yields (common.RetError, common.RetError, error). Otherwise the pair is
// served from idCache when an npuIdMapping entry exists for logicID; on a cache miss, or when
// the cached value has an unexpected type (the entry is deleted first), the lookup is
// delegated to doGetCardIDAndDeviceID.
func (d *DeviceManager) getCardIdAndDeviceId(logicID int32) (int32, int32, error) {
	if !common.IsValidLogicIDOrPhyID(logicID) {
		return common.RetError, common.RetError, fmt.Errorf("input invalid logicID: %d", logicID)
	}
	if cached, found := idCache.Load(logicID); found {
		if mapping, isMapping := cached.(npuIdMapping); isMapping {
			hwlog.RunLog.Debugf("get cardId and deviceId by logicID(%d) from cache, cardId:%v, deviceId:%v",
				logicID, mapping.cardId, mapping.deviceId)
			return mapping.cardId, mapping.deviceId, nil
		}
		// Entry of an unexpected type: remove it so the lookup below can replace it.
		idCache.Delete(logicID)
	}
	return d.doGetCardIDAndDeviceID(logicID)
}
// doGetCardIDAndDeviceID asks DcMgr.DcGetCardIDDeviceID for the (cardID, deviceID) pair of
// logicID, bypassing idCache on the read side.
//
// On failure it logs through the rate-limited logger and returns
// (common.RetError, common.RetError, err). On success it resets the limited-log counter for
// this logicID, stores the mapping in idCache and returns the pair.
func (d *DeviceManager) doGetCardIDAndDeviceID(logicID int32) (int32, int32, error) {
	resolvedCard, resolvedDevice, lookupErr := d.DcMgr.DcGetCardIDDeviceID(logicID)
	if lookupErr != nil {
		hwlog.RunLog.ErrorfWithLimit(common.DomainForLogicIdErr, logicID,
			"failed to get cardId and deviceId by logicID(%d), error: %v", logicID, lookupErr)
		return common.RetError, common.RetError, lookupErr
	}
	hwlog.ResetErrCnt(common.DomainForLogicIdErr, logicID)
	hwlog.RunLog.Debugf("get cardId and deviceId by logicID(%d) from dcmi, cardId:%v, deviceId:%v",
		logicID, resolvedCard, resolvedDevice)
	mapping := npuIdMapping{logicId: logicID, cardId: resolvedCard, deviceId: resolvedDevice}
	idCache.Store(logicID, mapping)
	return resolvedCard, resolvedDevice, nil
}
// GetChipBaseInfos enumerates every device of every card returned by DcMgr.DcGetCardList and
// returns one ChipBaseInfo (physic ID, logic ID, card ID, device ID) per device, in card-list
// order and ascending device ID within a card.
//
// The first failing DCMI call aborts the enumeration: nil and a formatted error are returned.
// With no devices at all the result is an empty, non-nil slice.
func (d *DeviceManager) GetChipBaseInfos() ([]*common.ChipBaseInfo, error) {
	_, cardIDs, listErr := d.DcMgr.DcGetCardList()
	if listErr != nil {
		return nil, fmt.Errorf("get card list failed, error: %v", listErr)
	}
	chipInfos := make([]*common.ChipBaseInfo, 0)
	for _, cardID := range cardIDs {
		devCount, countErr := d.DcMgr.DcGetDeviceNumInCard(cardID)
		if countErr != nil {
			return nil, fmt.Errorf("get device num by cardID: %d failed, error: %v", cardID, countErr)
		}
		for devID := int32(0); devID < devCount; devID++ {
			logicID, logicErr := d.DcMgr.DcGetDeviceLogicID(cardID, devID)
			if logicErr != nil {
				return nil, fmt.Errorf("get device (cardID: %d, deviceID: %d) logic id failed, error: %v",
					cardID, devID, logicErr)
			}
			physicID, physicErr := d.DcMgr.DcGetPhysicIDFromLogicID(logicID)
			if physicErr != nil {
				return nil, fmt.Errorf("get device (cardID: %d, deviceID: %d) physic id failed, error: %v",
					cardID, devID, physicErr)
			}
			hwlog.RunLog.Infof("get chip base info, cardID: %d, deviceID: %d, logicID: %d, physicID: %d",
				cardID, devID, logicID, physicID)
			chip := &common.ChipBaseInfo{
				CardID:   cardID,
				DeviceID: devID,
				LogicID:  logicID,
				PhysicID: physicID,
			}
			chipInfos = append(chipInfos, chip)
		}
	}
	return chipInfos, nil
}
// StartHccsPingMesh resolves logicID through DcMgr.DcGetCardIDDeviceID (not through idCache)
// and starts a ping mesh task on that device.
//
// When the device type is common.Ascend910A5 the call goes to DcStartUbPingMesh and portID is
// not used; for every other device type it goes to DcStartHccsPingMesh with portID.
// A resolution failure is logged with the rate-limited logger and returned unchanged.
func (d *DeviceManager) StartHccsPingMesh(logicID int32, portID int, operate common.HccspingMeshOperate) error {
	cardID, deviceID, idErr := d.DcMgr.DcGetCardIDDeviceID(logicID)
	if idErr != nil {
		hwlog.RunLog.ErrorfWithLimit(common.DomainForLogicIdErr, logicID,
			"failed to get cardId and deviceId by logicID(%d), error: %v", logicID, idErr)
		return idErr
	}
	if d.GetDevType() != common.Ascend910A5 {
		return d.DcMgr.DcStartHccsPingMesh(cardID, deviceID, portID, operate)
	}
	return d.DcMgr.DcStartUbPingMesh(cardID, deviceID, operate)
}
// StopHccsPingMesh resolves logicID through DcMgr.DcGetCardIDDeviceID (not through idCache)
// and stops the ping mesh task taskID on that device.
//
// When the device type is common.Ascend910A5 the call goes to DcStopUbPingMesh and portID is
// not used; for every other device type it goes to DcStopHccsPingMesh with portID.
// A resolution failure is logged with the rate-limited logger and returned unchanged.
func (d *DeviceManager) StopHccsPingMesh(logicID int32, portID int, taskID uint) error {
	cardID, deviceID, idErr := d.DcMgr.DcGetCardIDDeviceID(logicID)
	if idErr != nil {
		hwlog.RunLog.ErrorfWithLimit(common.DomainForLogicIdErr, logicID,
			"failed to get cardId and deviceId by logicID(%d), error: %v", logicID, idErr)
		return idErr
	}
	if d.GetDevType() != common.Ascend910A5 {
		return d.DcMgr.DcStopHccsPingMesh(cardID, deviceID, portID, taskID)
	}
	return d.DcMgr.DcStopUbPingMesh(cardID, deviceID, taskID)
}
// GetHccsPingMeshInfo resolves logicID through DcMgr.DcGetCardIDDeviceID (not through idCache)
// and returns the ping mesh info of task taskID on that device.
//
// When the device type is common.Ascend910A5 the call goes to DcGetUbPingMeshInfo with
// common.UbPingMeshMaxNum and portID is not used; for every other device type it goes to
// DcGetHccsPingMeshInfo with portID. A resolution failure is logged with the rate-limited
// logger and returned unchanged with a nil result.
func (d *DeviceManager) GetHccsPingMeshInfo(logicID int32, portID int, taskID uint) (*common.HccspingMeshInfo, error) {
	cardID, deviceID, idErr := d.DcMgr.DcGetCardIDDeviceID(logicID)
	if idErr != nil {
		hwlog.RunLog.ErrorfWithLimit(common.DomainForLogicIdErr, logicID,
			"failed to get cardId and deviceId by logicID(%d), error: %v", logicID, idErr)
		return nil, idErr
	}
	if d.GetDevType() != common.Ascend910A5 {
		return d.DcMgr.DcGetHccsPingMeshInfo(cardID, deviceID, portID, taskID)
	}
	return d.DcMgr.DcGetUbPingMeshInfo(cardID, deviceID, taskID, common.UbPingMeshMaxNum)
}
// GetHccsPingMeshState resolves logicID through DcMgr.DcGetCardIDDeviceID (not through
// idCache) and returns the state of ping mesh task taskID on that device.
//
// When the device type is common.Ascend910A5 the call goes to DcGetUbPingMeshState and portID
// is not used; for every other device type it goes to DcGetHccsPingMeshState with portID.
// A resolution failure is logged with the rate-limited logger and returned unchanged together
// with common.RetError.
func (d *DeviceManager) GetHccsPingMeshState(logicID int32, portID int, taskID uint) (int, error) {
	cardID, deviceID, idErr := d.DcMgr.DcGetCardIDDeviceID(logicID)
	if idErr != nil {
		hwlog.RunLog.ErrorfWithLimit(common.DomainForLogicIdErr, logicID,
			"failed to get cardId and deviceId by logicID(%d), error: %v", logicID, idErr)
		return common.RetError, idErr
	}
	if d.GetDevType() != common.Ascend910A5 {
		return d.DcMgr.DcGetHccsPingMeshState(cardID, deviceID, portID, taskID)
	}
	return d.DcMgr.DcGetUbPingMeshState(cardID, deviceID, taskID)
}
// GetSuperPodStatus resolves logicID through DcMgr.DcGetCardIDDeviceID (not through idCache)
// and queries the super pod status for sdid, trying up to maxRetries times.
//
// The first successful attempt ends the loop. Each failed attempt is logged with its attempt
// index; after the last attempt the most recent status and error are returned as-is.
// A resolution failure is logged with the rate-limited logger and returned with
// common.RetError.
func (d *DeviceManager) GetSuperPodStatus(logicID int32, sdid uint32) (int, error) {
	cardID, deviceID, err := d.DcMgr.DcGetCardIDDeviceID(logicID)
	if err != nil {
		hwlog.RunLog.ErrorfWithLimit(common.DomainForLogicIdErr, logicID,
			"failed to get cardId and deviceId by logicID(%d), error: %v", logicID, err)
		return common.RetError, err
	}
	var podStatus int
	for attempt := 0; attempt < maxRetries; attempt++ {
		podStatus, err = d.DcMgr.DcGetSuperPodStatus(cardID, deviceID, sdid)
		if err == nil {
			break
		}
		hwlog.RunLog.Errorf("get super pod status failed, retry %d, cardID: %d, deviceID: %d, "+
			"sdid: %d, error: %v", attempt, cardID, deviceID, sdid, err)
	}
	return podStatus, err
}
// SetSuperPodStatus resolves logicID through DcMgr.DcGetCardIDDeviceID (not through idCache)
// and writes status for sdid, trying up to maxRetries times.
//
// The first successful attempt ends the loop. Each failed attempt is logged with its attempt
// index; after the last attempt the most recent error (nil on success) is returned.
// A resolution failure is logged with the rate-limited logger and returned unchanged.
func (d *DeviceManager) SetSuperPodStatus(logicID int32, sdid, status uint32) error {
	cardID, deviceID, err := d.DcMgr.DcGetCardIDDeviceID(logicID)
	if err != nil {
		hwlog.RunLog.ErrorfWithLimit(common.DomainForLogicIdErr, logicID,
			"failed to get cardId and deviceId by logicID(%d), error: %v", logicID, err)
		return err
	}
	for attempt := 0; attempt < maxRetries; attempt++ {
		err = d.DcMgr.DcSetSuperPodStatus(cardID, deviceID, sdid, status)
		if err == nil {
			break
		}
		hwlog.RunLog.Errorf("set super pod status failed, retry %d, cardID: %d, deviceID: %d, "+
			"sdid: %d, status: %d, error: %v", attempt, cardID, deviceID, sdid, status, err)
	}
	return err
}
// GetUrmaDeviceCount always returns 0 and a nil error; logicID is not used.
// NOTE(review): this looks like a placeholder implementation — confirm before relying on it.
func (d *DeviceManager) GetUrmaDeviceCount(logicID int32) (int32, error) {
return 0, nil
}
// GetUrmaDevEidList always returns a pointer to a zero-value UrmaDeviceInfo and a nil error;
// neither logicID nor index is used.
// NOTE(review): this looks like a placeholder implementation — confirm before relying on it.
func (d *DeviceManager) GetUrmaDevEidList(logicID int32, index int32) (*common.UrmaDeviceInfo, error) {
return &common.UrmaDeviceInfo{}, nil
}
// GetUrmaDevEidListAll always returns a fixed one-element list and a nil error: a single
// UrmaDeviceInfo with EidCount 1 whose only EID has all-zero raw bytes. logicID is not used.
// NOTE(review): this looks like a placeholder implementation — confirm before relying on it.
func (d *DeviceManager) GetUrmaDevEidListAll(logicID int32) ([]common.UrmaDeviceInfo, error) {
	zeroEid := common.Eid{Raw: [common.EidByteSize]byte{}}
	device := common.UrmaDeviceInfo{
		EidCount: 1,
		EidInfos: []common.UrmaEidInfo{{Eid: zeroEid}},
	}
	return []common.UrmaDeviceInfo{device}, nil
}
// GetMultiDiePolicy returns the result of DcMgr.DcGetMultiDiePolicy unchanged.
func (d *DeviceManager) GetMultiDiePolicy() (dcmi.DiePolicyType, error) {
return d.DcMgr.DcGetMultiDiePolicy()
}
// SetMultiDiePolicy passes policy to DcMgr.DcSetMultiDiePolicy and returns its error unchanged.
func (d *DeviceManager) SetMultiDiePolicy(policy dcmi.DiePolicyType) error {
return d.DcMgr.DcSetMultiDiePolicy(policy)
}