// Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.

// Package api defines common constants.
package api

// Environment variable names.
const (
	// NodeNameEnv is the "NODE_NAME" environment variable name.
	NodeNameEnv = "NODE_NAME"

	// PtLocalRankEnv is the PyTorch variable for the logic id list of the NPUs used by the pod.
	PtLocalRankEnv = "LOCAL_RANK"
	// PtLocalWorldSizeEnv is the PyTorch variable for the number of NPUs used per pod.
	PtLocalWorldSizeEnv = "LOCAL_WORLD_SIZE"
	// PtWorldSizeEnv is the PyTorch variable for the total number of NPUs used by the task.
	PtWorldSizeEnv = "WORLD_SIZE"

	// TfLocalWorkerEnv is the TensorFlow variable for the number of NPUs used per pod.
	TfLocalWorkerEnv = "CM_LOCAL_WORKER"
	// TfWorkerSizeEnv is the TensorFlow variable for the total number of NPUs used by the task.
	TfWorkerSizeEnv = "CM_WORKER_SIZE"

	// MsLocalWorkerEnv is the MindSpore variable for the number of NPUs used per pod.
	MsLocalWorkerEnv = "MS_LOCAL_WORKER"
	// MsWorkerNumEnv is the MindSpore variable for the total number of NPUs used by the task.
	MsWorkerNumEnv = "MS_WORKER_NUM"
)

// Namespace names.
const (
	// KubeNS is the "kube-system" namespace name.
	KubeNS = "kube-system"
	// ClusterNS is the "cluster-system" namespace name.
	ClusterNS = "cluster-system"
	// DLNamespace is the "mindx-dl" namespace name.
	DLNamespace = "mindx-dl"
)

// Node labels and annotations.
const (
	// NPUChipMemoryLabel is the label whose value is the NPU chip memory.
	NPUChipMemoryLabel = "mind-cluster/npu-chip-memory"

	// BaseDevInfoAnno is the annotation whose value is the device base info.
	BaseDevInfoAnno = "baseDeviceInfos"
	// NodeSNAnnotation is the annotation whose value is the node SN.
	NodeSNAnnotation = "product-serial-number"

	// AcceleratorLabelKey is the node label key of the accelerator.
	AcceleratorLabelKey = "accelerator"
	// AcceleratorTypeKey is the node label key of the accelerator type.
	AcceleratorTypeKey = "accelerator-type"
	// AcceleratorTypeModule910A3x8SuperPod is the accelerator type for 8-NPU 910A3-SuperPod hardware.
	AcceleratorTypeModule910A3x8SuperPod = "module-a3-8-super-pod"
	// AcceleratorTypeModule910A3x16SuperPod is the accelerator type for 16-NPU 910A3-SuperPod hardware.
	AcceleratorTypeModule910A3x16SuperPod = "module-a3-16-super-pod"
)

// Pod annotations and related values.
const (
	// PodRankIndexAnno is the annotation whose value is the rank index of the pod.
	PodRankIndexAnno = "hccl/rankIndex"
	// PodUsedHardwareTypeAnno is the annotation whose value is the hardware type really used in the pod.
	PodUsedHardwareTypeAnno = "mind-cluster/hardware-type"
	// SuperPodIDAnno is the annotation key of the super pod id.
	SuperPodIDAnno = "super-pod-id"

	// Hot switch annotations.

	// InHotSwitchFlowKey is the key of the in-hot-switch-flow annotation.
	InHotSwitchFlowKey = "inHotSwitchFlow"
	// InHotSwitchFlowValue is the "true" value of the in-hot-switch-flow annotation.
	InHotSwitchFlowValue = "true"
	// BackupSourcePodNameKey is the key for the backup source pod name.
	BackupSourcePodNameKey = "backupSourcePodName"
	// BackupNewPodNameKey is the key for the backup new pod name.
	BackupNewPodNameKey = "backupNewPodName"
	// NeedVolcanoOpeKey is the need-volcano-ope key ("ope" presumably abbreviates operation).
	NeedVolcanoOpeKey = "needVolcanoOpe"
	// NeedOperatorOpeKey is the need-operator-ope key ("ope" presumably abbreviates operation).
	NeedOperatorOpeKey = "needOperatorOpe"
	// OpeTypeCreate is the "create" ope type.
	OpeTypeCreate = "create"
	// OpeTypeDelete is the "delete" ope type.
	OpeTypeDelete = "delete"
	// PodTypeKey is the pod type key.
	PodTypeKey = "podType"
	// PodTypeBackup is the "backup" pod type.
	PodTypeBackup = "backup"
	// MasterPodRank is the rank of the master pod.
	MasterPodRank = "0"
	// DefaultRetryTimes is the default number of retry times.
	DefaultRetryTimes = 3
)

// AtlasTaskLabel is the label whose value is the task kind, e.g. ascend-910, ascend-{xxx}b.
const AtlasTaskLabel = "ring-controller.atlas"

// ConfigMap label keys and data keys.
const (
	// CIMCMLabelKey is the cm label key that tells who uses these cms.
	CIMCMLabelKey = "mx-consumer-cim"
	// PubFaultCMLabelKey is the public fault cm label key.
	PubFaultCMLabelKey = "mc-consumer-publicfault"

	// DeviceInfoCMDataKey is the device-info-cm data key that records device info.
	DeviceInfoCMDataKey = "DeviceInfoCfg"
	// SwitchInfoCMDataKey is the device-info-cm data key that records switch info.
	SwitchInfoCMDataKey = "SwitchInfoCfg"
	// NodeInfoCMDataKey is the node-info-cm data key that records node info.
	NodeInfoCMDataKey = "NodeInfo"
	// PubFaultCMDataKey is the public fault cm data key that records public fault info.
	PubFaultCMDataKey = "PublicFault"
)

// FaultJobCmName is the name of the fault job cm.
const FaultJobCmName = "fault-job-info"

// Schedule labels and the recover strategy annotation key.
const (
	// ProcessScheduleLabel is the process schedule label.
	ProcessScheduleLabel = "process-recover-enable"
	// PodScheduleLabel is the pod schedule label.
	PodScheduleLabel = "pod-rescheduling"
	// RecoverStrategyKey is the recover strategy key in the job annotation.
	RecoverStrategyKey = "recover-strategy"
)

// Process schedule strategy names.
const (
	// RetryStrategy is the retry strategy.
	RetryStrategy = "retry"
	// RecoverStrategy is the recover strategy.
	RecoverStrategy = "recover"
	// InPlaceStrategy is the recover-in-place strategy.
	InPlaceStrategy = "recover-in-place"
	// ElasticTraining is the elastic-training strategy.
	ElasticTraining = "elastic-training"
	// DumpStrategy is the dump strategy.
	DumpStrategy = "dump"
	// ExitStrategy is the exit strategy.
	ExitStrategy = "exit"
)

// Environment variable names common to process scheduling.
const (
	// EnableRestartEnv is the enable-restart environment variable name.
	EnableRestartEnv = "ENABLE_RESTART_FAULT_PROCESS"
	// ElasticRecoverEnv is the elastic process recover environment variable name.
	ElasticRecoverEnv = "ELASTIC_PROCESS_RECOVER_ENABLE"
	// ProcessRecoverEnv is the process recover environment variable name.
	ProcessRecoverEnv = "PROCESS_RECOVER"
)

// Environment variables used by process scheduling for PyTorch.
const (
	// PtCloseWatchDogKey is the PyTorch close-watchdog variable name.
	PtCloseWatchDogKey = "HCCL_ASYNC_ERROR_HANDLING"
	// PtCloseWatchDogValue is the PyTorch close-watchdog variable value.
	PtCloseWatchDogValue = "0"
	// HighAvailableEnv is the high available environment variable name.
	HighAvailableEnv = "HIGH_AVAILABILITY"
)

// Environment variables and strategy values used by process scheduling for MindSpore (ms).
const (
	// EnableMS is the enable-ms name.
	EnableMS = "MINDIO_FOR_MINDSPORE"
	// MsRecoverEnv is the ms recover environment variable name.
	MsRecoverEnv = "MS_ENABLE_TFT"

	// MsArfStrategy is the ms arf strategy.
	MsArfStrategy = "ARF:1"
	// MsDumpStrategy is the ms dump strategy.
	MsDumpStrategy = "TTP:1"
	// MsHcceStrategy is the ms hcce strategy.
	MsHcceStrategy = "HCCE:1"
	// MsRscStrategy is the ms rsc strategy.
	MsRscStrategy = "RSC:1"
	// MsUceStrategy is the ms uce strategy.
	MsUceStrategy = "UCE:1"

	// MsCloseWatchDogKey is the ms close-watchdog variable name.
	MsCloseWatchDogKey = "MS_ENABLE_THM"
	// MsCloseWatchDogValue is the ms close-watchdog variable value.
	MsCloseWatchDogValue = "{HCCL_WATCHDOG:0}"
)

// Enable values and framework names.
const (
	// EnableFlag is the "1" enable flag.
	EnableFlag = "1"
	// EnableFunc is the "on" value that enables a func.
	EnableFunc = "on"
	// MindSporeFramework is the mindspore framework name.
	MindSporeFramework = "mindspore"
	// PytorchFramework is the pytorch framework name.
	PytorchFramework = "pytorch"
)

// Reschedule-in-place key/value pair.
const (
	// RescheduleInPlaceValue is the reschedule-in-place value.
	RescheduleInPlaceValue = "true"
	// RescheduleInPlaceKey is the reschedule-in-place key.
	RescheduleInPlaceKey = "reschedule-in-place"
)

// Device reset timeout name and its bounds in seconds.
const (
	// DeviceResetTimeout is the "deviceResetTimeout" name of the device reset timeout.
	DeviceResetTimeout = "deviceResetTimeout"
	// MinDeviceResetTimeout is the minimum device reset timeout, 10 seconds.
	MinDeviceResetTimeout = 10
	// DefaultDeviceResetTimeout is the default device reset timeout, 60 seconds.
	DefaultDeviceResetTimeout = 60
	// MaxDeviceResetTimeout is the maximum device reset timeout, 600 seconds.
	MaxDeviceResetTimeout = 600
)

// SubHealthy fault strategy configuration.
const (
	// SubHealthyHotSwitch is the strategy name of hot switch.
	SubHealthyHotSwitch = "hotSwitch"
	// SubHealthyStrategy is the pod group label that configures the subHealthy fault strategy.
	SubHealthyStrategy = "subHealthyStrategy"
)

// Scheduling-related keys under the "huawei.com/" prefix.
const (
	// AffinityConfigAnnoKey is the annotation key for the multilevel schedule policy.
	AffinityConfigAnnoKey = "huawei.com/affinity-config"
	// SchedulePolicyAnnoKey is the annotation key for the schedule policy.
	SchedulePolicyAnnoKey = "huawei.com/schedule_policy"
	// MinAvailableKey is the key that decides minAvailable of the task.
	MinAvailableKey = "huawei.com/schedule_minAvailable"
)

// Suffixes used for cm.
const (
	// CmFaultListSuffix is the FaultList suffix.
	CmFaultListSuffix = "-Fault"
	// CmRecoveringSuffix is the Recovering suffix.
	CmRecoveringSuffix = "-Recovering"
	// CmCardUnhealthySuffix is the CardUnhealthy suffix.
	CmCardUnhealthySuffix = "-Unhealthy"
	// CmCardNetworkUnhealthySuffix is the NetworkUnhealthy suffix.
	CmCardNetworkUnhealthySuffix = "-NetworkUnhealthy"
)

// Suffixes used for node labels.
const (
	// NodeLabelNetworkRecoverSuffix is the "-NetworkRecover" suffix.
	NodeLabelNetworkRecoverSuffix = "-NetworkRecover"
	// NodeLabelRecoverSuffix is the "-Recover" suffix.
	NodeLabelRecoverSuffix = "-Recover"
)

// Soft share device settings: limits, config file location, scheduler keys,
// config field names, file permissions and scheduling policies.
const (
	// SoftShareDeviceMaxAICoreQuota is the max aicore quota of a soft share device.
	SoftShareDeviceMaxAICoreQuota = 100
	// SoftShareDeviceCount is the number of soft share devices.
	SoftShareDeviceCount = 100
	// SoftShareDeviceConfigFileName is the name of the soft share device config file.
	SoftShareDeviceConfigFileName = "npu_info.config"
	// SoftShareDeviceConfigDir is the directory that contains the soft share device config file.
	SoftShareDeviceConfigDir = "/etc/enpu/"

	// SchedulerSoftShareDevPolicyKey is the key for the policy of a soft share device task.
	SchedulerSoftShareDevPolicyKey = "huawei.com/scheduler.softShareDev.policy"
	// SchedulerSoftShareDevAicoreQuotaKey is the key for the aicore quota of a soft share device task.
	SchedulerSoftShareDevAicoreQuotaKey = "huawei.com/scheduler.softShareDev.aicoreQuota"
	// SchedulerSoftShareDevHbmQuotaKey is the key for the hbm quota of a soft share device task.
	SchedulerSoftShareDevHbmQuotaKey = "huawei.com/scheduler.softShareDev.hbmQuota"
	// SchedulerSoftShareDevVNPUIdKey is the key for the vnpu id of a soft share device task.
	SchedulerSoftShareDevVNPUIdKey = "huawei.com/scheduler.softShareDev.vnpuId"

	// SoftShareDeviceConfigVirtualNPUId is the soft share device config field for the virtual npu id.
	SoftShareDeviceConfigVirtualNPUId = "virtual-npu-id"
	// SoftShareDeviceConfigPhysicalNPUId is the soft share device config field for the physical npu id.
	SoftShareDeviceConfigPhysicalNPUId = "physical-npu-id"
	// SoftShareDeviceConfigHbmQuota is the soft share device config field for the hbm quota.
	SoftShareDeviceConfigHbmQuota = "memory-quota"
	// SoftShareDeviceConfigAICoreQuota is the soft share device config field for the aicore quota.
	SoftShareDeviceConfigAICoreQuota = "aicore-quota"
	// SoftShareDeviceConfigSchedulingPolicy is the soft share device config field for the scheduling policy.
	SoftShareDeviceConfigSchedulingPolicy = "scheduling-policy"
	// SoftShareDeviceConfigShmId is the soft share device config field for the shm id.
	SoftShareDeviceConfigShmId = "shm-id"

	// DefaultSoftShareDeviceConfigDirPerm is the default perm for the soft share device config dir.
	DefaultSoftShareDeviceConfigDirPerm = 0755
	// DefaultSoftShareDeviceConfigPerm is the default perm for the soft share device config.
	DefaultSoftShareDeviceConfigPerm = 0644

	// SoftShareDeviceSchedulingPolicyFixedShare is the fixed share scheduling policy.
	SoftShareDeviceSchedulingPolicyFixedShare = "fixed-share"
	// SoftShareDeviceSchedulingPolicyFixedShareInt is the fixed share scheduling policy as a numeric string.
	SoftShareDeviceSchedulingPolicyFixedShareInt = "1"
	// SoftShareDeviceSchedulingPolicyElastic is the elastic share scheduling policy.
	SoftShareDeviceSchedulingPolicyElastic = "elastic"
	// SoftShareDeviceSchedulingPolicyElasticInt is the elastic share scheduling policy as a numeric string.
	SoftShareDeviceSchedulingPolicyElasticInt = "2"
	// SoftShareDeviceSchedulingPolicyBestEffort is the best effort scheduling policy.
	SoftShareDeviceSchedulingPolicyBestEffort = "best-effort"
	// SoftShareDeviceSchedulingPolicyBestEffortInt is the best effort scheduling policy as a numeric string.
	SoftShareDeviceSchedulingPolicyBestEffortInt = "3"
)

// Special device IP values and their effect on ranktable generation.
const (
	// DeviceIPDefaultCodeStr is the default device IP; it does not affect ranktable generation.
	DeviceIPDefaultCodeStr = "127.0.0.1"
	// DeviceIPEmptyCodeStr is the empty device IP; it does not affect ranktable generation.
	DeviceIPEmptyCodeStr = ""
	// DeviceIPErrorCodeStr is the error device IP; it leads to a ranktable generator error
	// in versions 1.0 and 1.2.
	DeviceIPErrorCodeStr = "-1"
)