; Name patterns used to group operators/kernels into categories.
; Every value below is a single-line, comma-separated list of lowercase
; tokens with no spaces around the commas.
; NOTE(review): how the tokens are matched (substring vs. exact, case
; handling) is decided by the code that reads this file - confirm there
; before adding or editing entries.
[OP_MASK]
; Attention-operator name variants (flash / fusion / xformers / efficient).
FA_MASK = flash_attention,fusion_attention,flashattn,xformers_flash,efficient_attention,flash2attn
; Convolution operators; the single token looks like an "aten::"-namespaced
; op-name prefix - TODO confirm it is matched as a prefix/substring.
CONV_MASK = aten::conv
; Matrix-multiplication operators in the "aten::" namespace.
MATMUL_MASK = aten::addmm,aten::bmm,aten::mm,aten::matmul
; Kernel-name tokens for GEMM/convolution style kernels.
; NOTE(review): "CUBE" presumably refers to the matrix-compute unit these
; kernels run on - verify against the consumer.
CUBE_MASK = gemm,conv,cutlass,wgrad,gemvx
; Data-conversion / layout-change kernels (cast, transdata, transpose).
TRANS_MASK = cast,transdata,transpose
; Kernel names that combine a collective (all-gather, reduce-scatter,
; all-reduce) with a matmul. Note this key is named *_KERNEL, not *_MASK;
; the name is kept because the reader looks it up by this exact key.
MC2_KERNEL = allgathermatmul,matmulreducescatter,matmulallreduce