# Sharding, mixed-precision and recompute settings that target the
# Qwen2MoTDecoderLayer class.
# NOTE(review): the key names mirror PyTorch FSDP2 options (`fully_shard`,
# `MixedPrecisionPolicy`); confirm against the loader that reads this file.

# Presumably the number of ranks in one shard group -- TODO confirm.
sharding_size: 8

# Dotted import paths of the module classes to wrap as individual units.
sub_modules_to_wrap:
  - mindspeed_mm.models.omni.mllms.bagel_qwen2_mot.Qwen2MoTDecoderLayer

# Presumably frees the gathered parameters after each forward pass and
# re-gathers them for backward -- verify against the consumer.
reshard_after_forward: true

# Mixed-precision settings; both dtypes are set to bf16.
# NOTE(review): param_dtype looks like the compute/all-gather dtype and
# reduce_dtype the gradient-reduction dtype -- confirm.
param_dtype: bf16
reduce_dtype: bf16
# Presumably casts floating-point forward inputs to param_dtype -- confirm.
cast_forward_inputs: true

# Dotted import paths of the module classes selected for recompute
# (presumably activation checkpointing -- confirm). The entry is the same
# class that is listed under sub_modules_to_wrap.
recompute_modules:
  - mindspeed_mm.models.omni.mllms.bagel_qwen2_mot.Qwen2MoTDecoderLayer