{
"communication": {
"bandwidth": "带宽",
"transitSize": "传输大小",
"transportType": "链路方式",
"transitTime": "传输时长",
"operatorName": "算子名称",
"bandwidth in Comparison": "比对带宽",
"transitSize in Comparison": "比对传输大小",
"transportType in Comparison": "比对链路方式",
"transitTime in Comparison": "比对传输时长",
"operatorName in Comparison": "比对算子名称",
"bandwidth in Baseline": "基线带宽",
"transitSize in Baseline": "基线传输大小",
"transportType in Baseline": "基线链路方式",
"transitTime in Baseline": "基线传输时长",
"operatorName in Baseline": "基线算子名称",
"Packet Number": "包数量",
"Bandwidth(GB/s)": "带宽(GB/s)",
"Packet Size(MB)": "包大小(MB)",
"Find in Timeline": "跳转至时间线视图",
"Find in Communication": "跳转至通信算子缩略图",
"Show in Timeline": "在时间线中显示",
"Show in Communication": "在缩略图中显示",
"Align according to selected operator": "根据选中算子对齐",
"Restore default state": "恢复默认状态",
"sessionTitle": {
"Communication": "通信算子缩略图",
"Potential Slow Rank List": "可能的慢卡列表",
"MatrixModel": "通信矩阵",
"VisualizedCommunicationTime": "通信时长",
"DataAnalysisCommunicationTime": "通信时长数据分析",
"PacketDistribution": "通信包分析",
"BandwidthAnalysis": "带宽分析"
},
"searchCriteria": {
"Cluster": "集群",
"Step": "迭代ID",
"BaselineStep": "基线迭代ID",
"CommunicationGroup": "通信域",
"OperatorName": "算子名称",
"CommunicationMatrix": "通信矩阵",
"CommunicationDurationAnalysis": "通信耗时分析",
"CommunicationMatrixType": "通信矩阵类型",
"ShowInnerCommunication": "显示卡内通信",
"VisibleRange": "筛选范围",
"Bandwidth": "带宽",
"TransitSize": "传输大小",
"TransportType": "链路方式",
"TransitTime": "传输时长"
},
"tableHead": {
"Operator Name": "算子名称",
"Rank ID": "卡序号",
"Start Time": "开始时间",
"Elapse Time": "总时间",
"Transit Time": "传输时间",
"Synchronization Time": "同步时间",
"Wait Time": "等待时间",
"Synchronization Time Ratio": "同步时间占比",
"Wait Time Ratio": "等待时间占比",
"Operator Name in Comparison": "比对算子名称",
"Start Time in Comparison": "比对开始时间",
"Elapse Time in Comparison": "比对总时间",
"Transit Time in Comparison": "比对传输时间",
"Synchronization Time in Comparison": "比对同步时间",
"Wait Time in Comparison": "比对等待时间",
"Synchronization Time Ratio in Comparison": "比对同步时间占比",
"Wait Time Ratio in Comparison": "比对等待时间占比",
"Operator Name in Baseline": "基线算子名称",
"Start Time in Baseline": "基线开始时间",
"Elapse Time in Baseline": "基线总时间",
"Transit Time in Baseline": "基线传输时间",
"Synchronization Time in Baseline": "基线同步时间",
"Synchronization Time Ratio in Baseline": "基线同步时间占比",
"Wait Time Ratio in Baseline": "基线等待时间占比",
"Wait Time in Baseline": "基线等待时间",
"Idle Time": "空闲时间",
"SDMABW": "SDMA带宽",
"RDMABW": "RDMA带宽",
"Bandwidth Analysis": "带宽分析",
"Large Packet Ratio": "大通信包占比",
"Communication Operators Details": "通信算子详情",
"see more": "查看更多",
"Details": "详情",
"Source": "来源",
"Category": "类别",
"Issue": "异常",
"Small Size Duration(ms)": "小包持续时长(毫秒)",
"Small Size Proportion(%)": "小包占比(%)",
"Small Size Proportion Standard(%)": "小包占比正常值(%)",
"Small Size Standard(MB)": "小包标准(MB)",
"Abnormal Operator Count": "异常算子数量",
"Small Size(Byte)": "数据量小值(Byte)",
"bandwidth(GB/s)": "带宽(GB/s)",
"duration(us)": "持续时间(微秒)",
"name": "算子名称",
"group name": "通信域名称",
"rankId": "卡序号",
"Operation": "操作"
},
"tableHeadTooltip": {
"Elapse Time": "通信算子所有事件消耗时间之和。公式:总时间 = 通信Events时间之和",
"Transit Time": "统计SDMA链路(server内通信)和RDMA链路(server间通信)的通信算子总耗时。公式:传输时间 = ReduceInline + Memcpy + RDMA通信时间",
"Synchronization Time": "卡间第一次通信前的同步时间,用来区分等待时间过长是慢节点还是慢链路造成的。公式:同步时间 = 第一次SDMA或者RDMA通信前的NotifyWait耗时之和",
"Wait Time": "卡间进行通信前,首先会进行同步,确保通信的两张卡同步完成,再进行通信。等待时间计算方式为统计所有Notify Wait算子总耗时并减去RDMA链路通信时间中的Notify Wait算子耗时。公式:等待时间 = NotifyWait – RDMA通信时间中的NotifyWait",
"Synchronization Time Ratio": "同步时间占比 = (同步时间)/(传输时间 + 同步时间)",
"Wait Time Ratio": "等待时间占比 = (等待时间)/(传输时间 + 等待时间)",
"Idle Time": "空闲时间 = 总时间 – 传输时间 – 等待时间"
},
"Advice": "专家建议",
"AdviceTip": "此处带宽为通信数据大小与通信时间的比值,非物理带宽",
"Overall": "总览",
"OverallDuration": "{{type}}总耗时为{{time}}毫秒。",
"MoreFocus": "请关注{{type}}。",
"CommunicationAdvice": "{{type}}带宽(GB/秒):平均值为{{avg}}GB/秒,最大值为{{max}}GB/秒,最小值为{{min}}GB/秒,差值为{{diff}}GB/秒。{{issue}},请关注上方表格带宽分析列。",
"BandwidthIssue": "带宽是瓶颈",
"CommunicationIssue": "通信效率低",
"Back": "返回",
"Tooltip": "导入集群数据中若已包含cluster_analysis_output文件夹,则不会对其进行覆盖;如果出现集群数据和单卡数据不对应,请删除原有cluster_analysis_output文件夹及cluster.db文件并重新导入。",
"Difference": "差值",
"Same": "相同",
"Different": "不同",
"Packet Analysis": "小包分析",
"Byte Alignment Analysis": "字节对齐分析",
"Bandwidth Contention Analysis": "带宽抢占分析",
"Communication Retransmission Analysis": "通信重传分析",
"title": {
"Advice": "建议",
"Data": "数据",
"Data Parallelism": "数据并行建议:",
"Memory Optimization": "检查内存优化策略:",
"Adopt": "适配亲和优化器或融合算子:",
"Byte Alignment Analysis": "调整数据大小:",
"Bandwidth Contention": "基线数据对比:",
"RDMA Transmission Time": "检查RDMA传输时长:",
"Network Configuration": "检查网络配置:"
},
"index": {
"No problematic operators": "暂未发现问题算子",
"Data Parallelism": "如果异常通信集中在数据并行域,1.增加批量大小;2.增加梯度累积。",
"Memory Optimization": "如果内存优化策略为Zero3,建议在内存条件允许的情况下将其设置为Zero2/Zero1。",
"Adopt": "使用亲和优化器或融合算子可以减少通信算子的数量。",
"Byte Alignment Analysis": "请调整数据大小,对齐通信算子的数据量。",
"Bandwidth Contention": "比较启用融合功能之前和之后的性能数据,以评估带宽抢占的影响是否超过了融合的收益。",
"RDMA Transmission Time": "检查疑似发生重传的RDMA算子的传输时间是否正常。",
"Network Configuration": "检查交换机和计算节点服务器的网络配置。"
},
"tooltip": {
"Packet Analysis": "通信小包是指传输大小小于小包标准的通信算子,小包占比是指小包通信算子数量与所有通信算子数量的比值,小包持续时长是指所有小包通信算子的持续时长之和。",
"Byte Alignment Analysis": "如果一个通信大算子含有的某个Memcpy类或者Reduce_Inline类通信小算子传输方式为SDMA,传输大小大于512字节且不能被512字节整除,称该通信大算子存在字节对齐问题。",
"Bandwidth Contention Analysis": "如果通信算子的[开始时间, 结束时间]和某个matmul算子的[开始时间, 结束时间]有重叠,且它的SDMA带宽小于14.4GB/s,则认为该通信算子存在带宽抢占。",
"Communication Retransmission Analysis": "通信重传的对象是通信域,不考虑Total Op算子,如果通信域存在某个通信算子,则对所有的rank取最小持续时间和最大RDMA传输时间,如果最小持续时间大于4000ms且最大RDMA传输时间大于4000ms,则认定存在通信重传问题。"
},
"OperatorNameTooltip": [
"为方便查看,会对通信算子进行聚类统计,例如 allreduce-total 等分类:",
"total:表示该类算子的平均带宽水平(某类通信算子的总传输量 / 总传输时间),推荐优先查看",
"top:带宽最高的通信算子,topN 即为前 N 高",
"middle:带宽处于中位数的通信算子",
"bottom:带宽最低的通信算子,bottomN 即为前 N 低",
"在分类不为 total 时,将鼠标悬停在热力图上可查看具体通信算子名称"
],
"slowRankList": {
"Advice": "专家建议",
"TitleTooltip": [
"“快慢卡”是相对的概念,快卡即集群中率先完成计算任务的卡。快慢卡不同步,一般表现为快卡存在耗时较长的通信算子,且等待时间占比较高。",
"若想进一步定位造成以上快慢卡差异的原因,可在上方的通信算子缩略图中右键相应通信算子,点击“跳转至时间线视图”,通过对比快慢卡,确认差异根源。"
],
"SlowRankDesc": "在当前通信域内,存在较明显快慢卡问题。请关注与快卡通信时长差异较大的卡与通信算子,其差值可能存在一定优化空间。最快卡ID:{{fastRankId}},最快卡耗时:{{fastTotalElapseTime}}ms",
"NoSlowRankDesc": "当前通信域各卡通信时间较为均衡,未发现明显慢卡。",
"Index": "排序",
"Rank ID": "卡号",
"Action": "操作",
"Operator Name": "算子名称",
"Elapse Time(ms)": "耗时(ms)",
"Elapse Time Difference(ms)": "耗时差值(ms)",
"Elapse Time on Current Rank(ms)": "在当前卡中的耗时(ms)",
"Elapse Time on Fastest Rank(ms)": "在最快卡中的耗时(ms)",
"Find in Communication": "在图中高亮",
"RankTable": {
"tooltip":{
"ElapseTimeDifference": "当前卡与最快卡通信算子总耗时的差值(= 最快卡耗时 - 当前卡耗时),可一定程度反映可减少的通信时间",
"ElapseTime": "卡内的通信算子总耗时",
"FastestRankElapseTime": "最快卡内的通信算子总耗时"
}
},
"ExpandedTable": {
"tooltip":{
"ElapseTimeDifference": "该算子在当前卡与最快卡上的耗时差值(= 该算子在最快卡耗时 - 该算子在当前卡耗时),可一定程度反映可减少的通信时间(假设通信均衡)",
"ElapseTimeOnCurrentRank": "该算子在当前卡内的耗时",
"ElapseTimeOnFastestRank": "该算子在最快卡内的耗时"
}
}
}
}
}