文件最后提交记录最后更新时间
1 个月前
5 个月前
5 个月前
20 天前
5 个月前
README.md

自定义插件开发说明

用户可参考提供的demo,或将代码拷贝到plugins目录下,重新编译部署,下面对demo中各文件进行说明

  • dcmi.go、dcmi_interface_api.h:用户自定义NPU指标的接口声明与cgo实现,用于对接驱动dcmi接口,具体可参考demo实现,全部dcmi接口需参考驱动的dcmi接口文档。
  • custom_metrics.go:实现MetricsCollector接口,用于指标采集与上报,需要实现下面的接口,具体可参考demo实现:
    • Describe:prometheus上报指标前,需要先定义指标,该接口用于prometheus的指标定义
    • CollectToCache: 指标采集方法,每个采集周期都会执行,从外部获取数据,并传入到内部缓存中
    • UpdatePrometheus: 按照prometheus的格式,将缓存中的数据返回
    • UpdateTelegraf:按照telegraf的格式,将缓存中的数据返回。
    • IsSupported:检测当前环境,判断是否支持当前设备的检测。
    • PreCollect:正式开始采集前执行一次,可用于设备初始化。可以为空。
    • PostCollect:采集结束后执行一次,可用于数据的回收。可以为空。
  • register.go:提供插件注册函数,在npu-exporter启动时完成插件注册并完成dcmi接口初始化。RegisterPlugin函数签名不要修改,在RegisterPlugin函数中通过registerPlugin(插件名称, &插件类{})完成注册,插件名称需要与pluginConfiguration.json中的指标组名称保持一致。

对于插件指标组内定义的指标名称,不要与现有代码中已定义的插件指标(当前NPU指标、插件指标)重名

自定义插件单次采集时间超过10s时,npu-exporter会打印日志提示该插件采集时间过长,并继续执行下一个插件的采集。

编译部署

插件开发完成后,执行npu-exporter代码目录下的build/build.sh完成编译,需要提前准备go开发环境。

编译完成后,会在output目录下生成新的二进制文件与相关配置文件,根据需要打开或关闭相应开关,再按照安装部署章节的安装指导,重新制作镜像并部署即可。

dcmi.go

/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package plugins this for dcmi interface
package plugins

// #cgo LDFLAGS: -ldl
/*
   #include <stddef.h>
   #include <dlfcn.h>
   #include <stdlib.h>
   #include <stdio.h>

   #include "dcmi_interface_api.h"

   // Handle of the dcmi shared library; assigned from dlopen in dcmiLoad_dl.
   static void *dcmiHandle;
   // Return codes used by the helpers in this preamble.
   // SO_NOT_FOUND: dcmiLoad_dl received a NULL path or dlopen failed.
   #define SO_NOT_FOUND  -99999
   // FUNCTION_NOT_FOUND: CALL_FUNC found the resolved function pointer to be NULL.
   #define FUNCTION_NOT_FOUND  -99998
   #define SUCCESS  0
   // ERROR_UNKNOWN: returned by dcmiShutDown when dlclose reports a failure.
   #define ERROR_UNKNOWN  -99997
   // CALL_FUNC(name, args...) forwards the call to the name##_func pointer, or
   // returns FUNCTION_NOT_FOUND when that pointer is NULL.
   #define	CALL_FUNC(name,...) if(name##_func==NULL){return FUNCTION_NOT_FOUND;}return name##_func(__VA_ARGS__);

   // Pointer to the driver's dcmi_get_device_health symbol; assigned by dlsym in dcmiLoad_dl.
   static int (*dcmi_get_device_health_func)(int card_id, int device_id, unsigned int *health);
   // Wrapper called from Go: forwards to the resolved driver symbol, or returns
   // FUNCTION_NOT_FOUND when the symbol pointer is NULL.
   int dcmi_get_device_health(int card_id, int device_id, unsigned int *health){
   	CALL_FUNC(dcmi_get_device_health,card_id,device_id,health)
   }

   // dcmiLoad_dl opens the shared library at dcmiLibPath and resolves the dcmi
   // symbols used by the wrappers above.
   // Returns SO_NOT_FOUND when the path is NULL or dlopen fails (the reason is
   // printed to stderr), otherwise SUCCESS.
   // A failed dlsym is not reported here: the pointer stays NULL and the wrapper
   // returns FUNCTION_NOT_FOUND when it is called.
   static int dcmiLoad_dl(const char* dcmiLibPath){
   	if (dcmiLibPath == NULL) {
   	   	fprintf (stderr,"lib path is null\n");
   	   	return SO_NOT_FOUND;
   	}
   	dcmiHandle = dlopen(dcmiLibPath,RTLD_LAZY | RTLD_GLOBAL);
   	if (dcmiHandle == NULL){
   		fprintf (stderr,"%s\n",dlerror());
   		return SO_NOT_FOUND;
   	}

	// resolve every dcmi symbol that has a wrapper in this preamble
	dcmi_get_device_health_func = dlsym(dcmiHandle,"dcmi_get_device_health");

   	return SUCCESS;
   }

   static int dcmiShutDown(void){
   	if (dcmiHandle == NULL) {
   		return SUCCESS;
   	}
   	return (dlclose(dcmiHandle) ? ERROR_UNKNOWN : SUCCESS);
   }
*/
import "C"
import (
	"fmt"

	"unsafe"

	"ascend-common/common-utils/utils"
	"ascend-common/devmanager/common"
)

// dcmiLibraryName is the file name of the driver library that DcLoad looks up and loads.
const dcmiLibraryName = "libdcmi.so"

// DcLoad looks up the driver path of the dcmi library and loads it together with
// the dcmi symbols wrapped by the cgo preamble of this file.
// It returns the lookup error unchanged, or an error carrying the C return code
// when loading fails.
func DcLoad() error {
	libPath, err := utils.GetDriverLibPath(dcmiLibraryName)
	if err != nil {
		return err
	}
	// the C string is only needed for the duration of the load call
	cLibPath := C.CString(libPath)
	defer C.free(unsafe.Pointer(cLibPath))
	retCode := C.dcmiLoad_dl(cLibPath)
	if retCode == C.SUCCESS {
		return nil
	}
	return fmt.Errorf("dcmi lib load failed, error code: %d", int32(retCode))
}

// DcShutDown releases the dynamically loaded dcmi library.
// It returns an error carrying the C return code when the shutdown fails.
func DcShutDown() error {
	retCode := C.dcmiShutDown()
	if retCode == C.SUCCESS {
		return nil
	}
	return fmt.Errorf("dcmi shut down failed, error code: %d", int32(retCode))
}

// DcGetDeviceHealth queries the health code of the device identified by cardID and deviceID
// through the dcmi_get_device_health wrapper.
// It returns common.RetError and an error when the IDs are rejected by
// common.IsValidCardIDAndDeviceID, when the dcmi call does not return common.Success, or when
// common.IsGreaterThanOrEqualInt32 flags the reported value (presumably because it would not
// fit into an int32 - confirm against the helper).
func DcGetDeviceHealth(cardID, deviceID int32) (int32, error) {
	if !common.IsValidCardIDAndDeviceID(cardID, deviceID) {
		return common.RetError, fmt.Errorf("cardID(%d) or deviceID(%d) is invalid", cardID, deviceID)
	}
	var cHealth C.uint
	retCode := int32(C.dcmi_get_device_health(C.int(cardID), C.int(deviceID), &cHealth))
	// widen before range checking so the unsigned C value cannot wrap
	healthCode := int64(cHealth)
	if retCode != common.Success {
		return common.RetError, fmt.Errorf("get device (cardID: %d, deviceID: %d) health state failed, "+
			"ret code: %d, health code: %d", cardID, deviceID, retCode, healthCode)
	}
	if common.IsGreaterThanOrEqualInt32(healthCode) {
		return common.RetError, fmt.Errorf("get wrong health state , device (cardID: %d, "+
			"deviceID: %d) health: %d", cardID, deviceID, healthCode)
	}
	return int32(cHealth), nil
}

dcmi_interface_api.h

/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

#ifndef __DCMI_INTERFACE_API_H__
#define __DCMI_INTERFACE_API_H__

#ifdef __cplusplus
#if __cplusplus
extern "C" {
#endif
#endif /* __cplusplus */

/* DCMIDLLEXPORT expands to static, so every prototype declared with it has
   internal linkage in the translation unit that includes this header. */
#define DCMIDLLEXPORT static

/* Health query for the device addressed by card_id/device_id. health is an
   out-parameter read by the caller after the call returns; the meaning of the
   return code and of the health value is defined by the driver's dcmi
   interface documentation. */
DCMIDLLEXPORT int dcmi_get_device_health(int card_id, int device_id, unsigned int *health);

#ifdef __cplusplus
#if __cplusplus
}
#endif
#endif /* __cplusplus */

#endif /* __DCMI_INTERFACE_API_H__ */

custom_metrics.go

/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package plugins for custom metrics
package plugins

import (
	"sync"
	"time"

	"github.com/prometheus/client_golang/prometheus"

	"huawei.com/npu-exporter/v6/collector/common"
	"huawei.com/npu-exporter/v6/collector/container"
	"huawei.com/npu-exporter/v6/utils"
	"huawei.com/npu-exporter/v6/utils/logger"
)

var (
	// PluginInfoDesc describes the "plugin_info" metric, which carries one variable
	// label named "plugin_label".
	PluginInfoDesc = prometheus.NewDesc("plugin_info", "exporter custom plugin info",
		[]string{"plugin_label"}, nil)

	// PluginNpuInfoDesc describes the "npu_plugin_info" metric, which carries one
	// variable label named "npu_plugin_label".
	PluginNpuInfoDesc = prometheus.NewDesc("npu_plugin_info", "exporter custom npu plugin info",
		[]string{"npu_plugin_label"}, nil)
)

// Demo constants used by PluginInfoCollector:
//   - pluginInfoKey / npuPluginInfoKey are the cache keys written by CollectToCache.
//   - pluginInfoValue is the fixed sample value stored under pluginInfoKey.
//   - pluginLabel / npuPluginLabel are the label values attached to the two metrics.
//     NOTE(review): npuPluginLabel has the same text as npuPluginInfoKey; possibly
//     "npuPluginLabel" was intended - confirm before relying on the label value.
//   - pluginName has the same value as the plugin name literal passed to
//     registerPlugin in register.go.
const (
	pluginInfoKey    = "pluginInfoKey"
	pluginInfoValue  = 1.11111
	pluginLabel      = "pluginLabel"
	npuPluginLabel   = "npuPluginInfoKey"
	npuPluginInfoKey = "npuPluginInfoKey"
	pluginName       = "myPlugin"
)

// PluginInfoCollector is the demo collector for custom plugin metrics.
// CollectToCache writes values into Cache; UpdatePrometheus and UpdateTelegraf read them back.
type PluginInfoCollector struct {
	// embedded adapter from the exporter's collector package
	// (presumably supplies default MetricsCollector behaviour - confirm)
	common.MetricsCollectorAdapter
	// Cache holds the collected values under pluginInfoKey and npuPluginInfoKey
	Cache sync.Map
}

// Describe sends the descriptors of every metric this collector can report to ch.
func (c *PluginInfoCollector) Describe(ch chan<- *prometheus.Desc) {
	logger.Debug("PluginInfoCollector Describe")
	// add new descriptors to this list when the plugin gains metrics
	descs := []*prometheus.Desc{PluginInfoDesc, PluginNpuInfoDesc}
	for _, desc := range descs {
		ch <- desc
	}
}

// CollectToCache gathers the plugin values and stores them in c.Cache.
// The fixed sample value is always stored; the health of card 0 / device 0 is stored only
// when DcGetDeviceHealth succeeds, otherwise the error is logged and the cache entry is
// left untouched. n and chipList are not used by this demo collector.
func (c *PluginInfoCollector) CollectToCache(n *common.NpuCollector, chipList []common.HuaWeiAIChip) {
	logger.Debug("PluginInfoCollector CollectToCache")
	// fixed sample value
	c.Cache.Store(pluginInfoKey, pluginInfoValue)
	// value read from the driver through dcmi
	if health, err := DcGetDeviceHealth(0, 0); err != nil {
		logger.Error(err)
	} else {
		c.Cache.Store(npuPluginInfoKey, health)
	}
}

// UpdatePrometheus sends the cached plugin values to ch as timestamped gauge metrics.
// A metric is skipped when its value is absent from the cache or does not have the type
// CollectToCache stores (float64 for pluginInfoKey, int32 for npuPluginInfoKey). This is the
// case, for example, when DcGetDeviceHealth failed and no health value was ever cached.
// n, containerMap and chips are not used by this demo collector.
func (c *PluginInfoCollector) UpdatePrometheus(ch chan<- prometheus.Metric, n *common.NpuCollector,
	containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) {
	logger.Debug("PluginInfoCollector UpdatePrometheus")
	// update plugin info
	// The presence flag is ignored on purpose: a missing entry yields nil, which the
	// comma-ok assertion below rejects without panicking.
	pluginCache, _ := c.Cache.Load(pluginInfoKey)
	if value, ok := pluginCache.(float64); ok {
		ch <- prometheus.NewMetricWithTimestamp(time.Now(),
			prometheus.MustNewConstMetric(PluginInfoDesc, prometheus.GaugeValue, value, pluginLabel))
	} else {
		logger.Debug("PluginInfoCollector UpdatePrometheus: no cached plugin info, skip")
	}
	// update npu plugin info
	npuPluginCache, _ := c.Cache.Load(npuPluginInfoKey)
	if health, ok := npuPluginCache.(int32); ok {
		ch <- prometheus.NewMetricWithTimestamp(time.Now(),
			prometheus.MustNewConstMetric(PluginNpuInfoDesc, prometheus.GaugeValue, float64(health), npuPluginLabel))
	} else {
		logger.Debug("PluginInfoCollector UpdatePrometheus: no cached npu plugin info, skip")
	}
}

// UpdateTelegraf writes the cached plugin values into fieldsMap and returns it.
// The plugin info value goes into the group keyed by common.GeneralDevTagKey and the npu
// plugin value into the group keyed by "1" (the demo's hard-coded NPU logic ID); a group is
// created when it does not exist yet. A nil fieldsMap is replaced by a new map.
// A value is skipped when it is absent from the cache or does not have the type
// CollectToCache stores, e.g. when DcGetDeviceHealth failed and no health value was cached.
// n, containerMap and chips are not used by this demo collector.
func (c *PluginInfoCollector) UpdateTelegraf(fieldsMap map[string]map[string]interface{}, n *common.NpuCollector,
	containerMap map[int32]container.DevicesInfo, chips []common.HuaWeiAIChip) map[string]map[string]interface{} {
	logger.Debug("PluginInfoCollector UpdateTelegraf")
	if fieldsMap == nil {
		// writing into a nil map would panic
		fieldsMap = make(map[string]map[string]interface{})
	}
	// update plugin info
	// The presence flag is ignored on purpose: a missing entry yields nil, which the
	// comma-ok assertion below rejects without panicking.
	pluginCache, _ := c.Cache.Load(pluginInfoKey)
	if value, ok := pluginCache.(float64); ok {
		if fieldsMap[common.GeneralDevTagKey] == nil {
			fieldsMap[common.GeneralDevTagKey] = make(map[string]interface{})
		}
		utils.DoUpdateTelegraf(fieldsMap[common.GeneralDevTagKey], PluginInfoDesc, value, "")
	} else {
		logger.Debug("PluginInfoCollector UpdateTelegraf: no cached plugin info, skip")
	}
	// update npu plugin info
	const npuLogicID = "1"
	npuPluginCache, _ := c.Cache.Load(npuPluginInfoKey)
	if health, ok := npuPluginCache.(int32); ok {
		if fieldsMap[npuLogicID] == nil {
			fieldsMap[npuLogicID] = make(map[string]interface{})
		}
		utils.DoUpdateTelegraf(fieldsMap[npuLogicID], PluginNpuInfoDesc, float64(health), "")
	} else {
		logger.Debug("PluginInfoCollector UpdateTelegraf: no cached npu plugin info, skip")
	}
	return fieldsMap
}

// PreCollect is the hook that runs before collection starts; this demo only logs a debug
// message. Device initialisation for a custom plugin can be placed here.
func (c *PluginInfoCollector) PreCollect(n *common.NpuCollector, chipList []common.HuaWeiAIChip) {
	logger.Debug("PluginInfoCollector PreCollect")
}

// PostCollect is the hook that runs after collection ends; this demo only logs a debug
// message. Resource clean-up for a custom plugin can be placed here.
func (c *PluginInfoCollector) PostCollect(n *common.NpuCollector) {
	logger.Debug("PluginInfoCollector PostCollect")
}

// IsSupported reports whether the current hardware supports this metric.
// The demo performs no hardware check and always returns true.
func (c *PluginInfoCollector) IsSupported(n *common.NpuCollector) bool {
	logger.Debug("PluginInfoCollector IsSupported")
	return true
}

register.go

/* Copyright(C) 2025. Huawei Technologies Co.,Ltd. All rights reserved.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

// Package plugins for custom metrics
package plugins

import (
  "huawei.com/npu-exporter/v6/collector/common"
  "huawei.com/npu-exporter/v6/collector/config"
  "huawei.com/npu-exporter/v6/utils/logger"
)

// RegisterPlugin registers the plugin collectors of this package.
// The "text" plugin is always registered. The dcmi based demo plugin is registered only
// after DcLoad succeeds; when DcLoad fails the error is logged and the demo plugin is skipped.
func RegisterPlugin() {
  registerPlugin("text", &TextMetricsInfoCollector{})
  // Add custom plugins below.
  // DcLoad is only required by plugins that read npu info through dcmi.
  if err := DcLoad(); err != nil {
    logger.Errorf("dcmi init failed: %v\n", err)
    return
  }
  // the plugin name must be consistent with the name in pluginConfiguration.json
  registerPlugin("myPlugin", &PluginInfoCollector{})
}

// registerPlugin adds collector c under pluginName through config.AddPluginCollector
// and logs the error when the registration is rejected.
func registerPlugin(pluginName string, c common.MetricsCollector) {
  if err := config.AddPluginCollector(pluginName, c); err != nil {
    logger.Errorf("%v", err)
  }
}