dispatch
combine
dispatch_doubleplane
combine_doubleplane
scripts/run.sh
expand_x
assist_info_for_combine
ep_recv_count
expert_token_nums
x_out
[2/4/8]
[int32_t/float16_t]
[bs/topk/h/expertPerPe]
pes
bs
h
topk
expertPerPe
--perf
full_op
comm_only
examples/shmem_perftest/udma_perftest/
mte_perftest
udma_perftest
aclshmemx_udma_put_nbi
aclshmemx_udma_get_nbi
aclshmemx_udma_put_signal_nbi
put / bi_put / get / bi_get / put_signal
main.cpp
udma_perftest_kernel.cpp
run.sh
CMakeLists.txt
README.md
mte_perftest -> shmem_perftest
inner -> mte_perftest
shmem_perftest
DEVICE_SIDE
HOST_SIDE
block_dim=1
-b/--block-size
--block-range
--metric bw
prof_start → loop(*_nbi) → quiet → prof_end
quiet
--metric lat
prof_start → loop(put_nbi) → prof_end → quiet
SHMEMI_PROF_START/END
loop_count
pipe_barrier
--batch
--batch 0
--batch <loop_count>
--batch 1
*_nbi
--batch N
1 < N < loop_count
loop_count % N != 0
prof_end
put_signal
put_signal_nbi
signal_base + warmup + loop_count - 1
write_notify
src/device/gm2gm/engine/shmem_device_udma.hpp
bash scripts/build.sh -examples -soc_type Ascend950
./run.sh -t put -d float --exponent-range 8 17 --loop-count 1000
-t bi_put / get / bi_get / put_signal
--metric lat -t put
--batch 1 / --batch 16 / --batch 1000
bw
--batch -1
--batch abc
-t get
aclshmemi_kernel_abort
examples/shmem_perftest/udma_perftest/README.md
examples/shmem_perftest/README.md
mte_perftest/README.md
ascendc_perftest/README.md