# Device-info entry for node1; CmName repeats the entry's own key.
# DeviceList maps one key to a literal block scalar (`|`) whose text is a JSON
# array of fault records, so a YAML parser yields it as a single string.
# This entry holds four records (Ascend910-0 .. Ascend910-3), all with
# fault_code 80E01801 and fault_level RestartBusiness; only fault_time differs.
# NOTE(review): the unit of fault_time is not stated in this file - confirm.
mindx-dl-deviceinfo-node1:
  CmName: "mindx-dl-deviceinfo-node1"
  DeviceList:
    # Block-scalar content must be indented deeper than its key.
    huawei.com/Ascend910-Fault: |
      [
        { "fault_type": "uce fault type", "npu_name": "Ascend910-0", "fault_code": "80E01801",
          "fault_time_and_level_map": { "80E01801": { "fault_time": 90000, "fault_level": "RestartBusiness" } } },
        { "fault_type": "uce fault type", "npu_name": "Ascend910-1", "fault_code": "80E01801",
          "fault_time_and_level_map": { "80E01801": { "fault_time": 89000, "fault_level": "RestartBusiness" } } },
        { "fault_type": "uce fault type", "npu_name": "Ascend910-2", "fault_code": "80E01801",
          "fault_time_and_level_map": { "80E01801": { "fault_time": 89000, "fault_level": "RestartBusiness" } } },
        { "fault_type": "uce fault type", "npu_name": "Ascend910-3", "fault_code": "80E01801",
          "fault_time_and_level_map": { "80E01801": { "fault_time": 50000, "fault_level": "RestartBusiness" } } }
      ]
# Device-info entry for node2; CmName repeats the entry's own key.
# DeviceList maps one key to a literal block scalar (`|`) whose text is a JSON
# array of fault records, so a YAML parser yields it as a single string.
# This entry holds four records (Ascend910-0 .. Ascend910-3), all with
# fault_code 80E01801 and fault_level RestartBusiness; only fault_time differs.
# NOTE(review): the unit of fault_time is not stated in this file - confirm.
mindx-dl-deviceinfo-node2:
  CmName: "mindx-dl-deviceinfo-node2"
  DeviceList:
    # Block-scalar content must be indented deeper than its key.
    huawei.com/Ascend910-Fault: |
      [
        { "fault_type": "uce fault type", "npu_name": "Ascend910-0", "fault_code": "80E01801",
          "fault_time_and_level_map": { "80E01801": { "fault_time": 100000, "fault_level": "RestartBusiness" } } },
        { "fault_type": "uce fault type", "npu_name": "Ascend910-1", "fault_code": "80E01801",
          "fault_time_and_level_map": { "80E01801": { "fault_time": 70000, "fault_level": "RestartBusiness" } } },
        { "fault_type": "uce fault type", "npu_name": "Ascend910-2", "fault_code": "80E01801",
          "fault_time_and_level_map": { "80E01801": { "fault_time": 84000, "fault_level": "RestartBusiness" } } },
        { "fault_type": "uce fault type", "npu_name": "Ascend910-3", "fault_code": "80E01801",
          "fault_time_and_level_map": { "80E01801": { "fault_time": 40000, "fault_level": "RestartBusiness" } } }
      ]
---
# Second document's device-info entry for node1: same keys as the node1 entry
# of the first document, but the JSON array holds a single fault record
# (Ascend910-1, fault_time 89000).
# NOTE(review): presumably the expected device info after the first document
# has been processed - confirm against the consuming test.
mindx-dl-deviceinfo-node1:
  CmName: "mindx-dl-deviceinfo-node1"
  DeviceList:
    # Block-scalar content must be indented deeper than its key.
    huawei.com/Ascend910-Fault: |
      [
        { "fault_type": "uce fault type", "npu_name": "Ascend910-1", "fault_code": "80E01801",
          "fault_time_and_level_map": { "80E01801": { "fault_time": 89000, "fault_level": "RestartBusiness" } } }
      ]
# Second document's device-info entry for node2: same keys as the node2 entry
# of the first document, but the JSON array holds three fault records
# (Ascend910-1 .. Ascend910-3); there is no Ascend910-0 record here.
# NOTE(review): presumably the expected device info after the first document
# has been processed - confirm against the consuming test.
mindx-dl-deviceinfo-node2:
  CmName: "mindx-dl-deviceinfo-node2"
  DeviceList:
    # Block-scalar content must be indented deeper than its key.
    huawei.com/Ascend910-Fault: |
      [
        { "fault_type": "uce fault type", "npu_name": "Ascend910-1", "fault_code": "80E01801",
          "fault_time_and_level_map": { "80E01801": { "fault_time": 70000, "fault_level": "RestartBusiness" } } },
        { "fault_type": "uce fault type", "npu_name": "Ascend910-2", "fault_code": "80E01801",
          "fault_time_and_level_map": { "80E01801": { "fault_time": 84000, "fault_level": "RestartBusiness" } } },
        { "fault_type": "uce fault type", "npu_name": "Ascend910-3", "fault_code": "80E01801",
          "fault_time_and_level_map": { "80E01801": { "fault_time": 40000, "fault_level": "RestartBusiness" } } }
      ]
---
# job1: one entry per node, each with a one-element `device` list
# (device_id 0) and a `server_name` equal to the node key.
# NOTE(review): `server_name` is kept beside `device` rather than inside the
# list item, which is how the flat original parses - confirm with the consumer.
# NOTE(review): device_id presumably matches the numeric suffix of the
# Ascend910-N names used elsewhere in this file - confirm.
job1:
  node1:
    device:
      - device_id: 0
    server_name: node1
  node2:
    device:
      - device_id: 0
    server_name: node2
# job2: one entry per node, each with a one-element `device` list
# (device_id 1) and a `server_name` equal to the node key.
# NOTE(review): `server_name` is kept beside `device` rather than inside the
# list item, which is how the flat original parses - confirm with the consumer.
# NOTE(review): device_id presumably matches the numeric suffix of the
# Ascend910-N names used elsewhere in this file - confirm.
job2:
  node1:
    device:
      - device_id: 1
    server_name: node1
  node2:
    device:
      - device_id: 1
    server_name: node2
# job3: one entry per node, each with a one-element `device` list
# (device_id 2) and a `server_name` equal to the node key.
# NOTE(review): `server_name` is kept beside `device` rather than inside the
# list item, which is how the flat original parses - confirm with the consumer.
# NOTE(review): device_id presumably matches the numeric suffix of the
# Ascend910-N names used elsewhere in this file - confirm.
job3:
  node1:
    device:
      - device_id: 2
    server_name: node1
  node2:
    device:
      - device_id: 2
    server_name: node2
# job4: one entry per node, each with a one-element `device` list
# (device_id 3) and a `server_name` equal to the node key.
# NOTE(review): `server_name` is kept beside `device` rather than inside the
# list item, which is how the flat original parses - confirm with the consumer.
# NOTE(review): device_id presumably matches the numeric suffix of the
# Ascend910-N names used elsewhere in this file - confirm.
job4:
  node1:
    device:
      - device_id: 3
    server_name: node1
  node2:
    device:
      - device_id: 3
    server_name: node2
---
# Flat mapping of job name to a boolean; every listed job is `true`.
# NOTE(review): the meaning of the flag is not stated in this file - confirm
# with the consumer.
# NOTE(review): job5 has no entry in the job/device document of this file,
# which stops at job4 - presumably deliberate; confirm.
job1: true
job2: true
job3: true
job4: true
job5: true
---
# RetryMap: job -> node -> device name -> timing fields.
# Every ReportTime is 9223372036854775807, the largest signed 64-bit integer
# (2^63 - 1). A parser that reads integers as IEEE-754 doubles cannot hold it
# exactly.
# NOTE(review): presumably a "not reported yet" sentinel - confirm.
# NOTE(review): the unit of the *Time fields is not stated in this file - confirm.
RetryMap:
  # The job3 entries carry no CompleteTime key.
  job3:
    node1:
      Ascend910-2:
        RecoverTime: 95000
        ReportTime: 9223372036854775807
    node2:
      Ascend910-2:
        RecoverTime: 95000
        ReportTime: 9223372036854775807
  # NOTE(review): job4 also lists Ascend910-1 here, although the job/device
  # document of this file gives job4 only device_id 3 - confirm intentional.
  job4:
    node1:
      Ascend910-3:
        RecoverTime: 60000
        CompleteTime: 90000
        ReportTime: 9223372036854775807
      Ascend910-1:
        RecoverTime: 101000
        CompleteTime: 102000
        ReportTime: 9223372036854775807
    node2:
      Ascend910-3:
        RecoverTime: 60000
        CompleteTime: 90000
        ReportTime: 9223372036854775807
      Ascend910-1:
        RecoverTime: 101000
        CompleteTime: 102000
        ReportTime: 9223372036854775807