# Shared path constants. Each anchored value (&NAME) is reused elsewhere in
# this file through the alias of the same name (*NAME).

# Aliased by `model_name_or_path` and `init_from_hf_path`.
HF_MODEL_LOAD_PATH: &HF_MODEL_LOAD_PATH ./ckpt/hf_path/Qwen3-VL-30B-A3B-Instruct
# Aliased by `load`.
MM_MODEL_LOAD_PATH: &MM_MODEL_LOAD_PATH ./ckpt/mm_path/Qwen3-VL-30B-A3B-Instruct
# Aliased by `dataset`.
DATASET_PATH: &DATASET_PATH ./data/mllm_format_llava_instruct_data.json
# Aliased by `fsdp2_config_path`.
FSDP2_PATH: &FSDP2_PATH ./examples/qwen3vl/fsdp2_config.yaml
# Aliased by `save`.
SAVE_PATH: &SAVE_PATH save_dir
# Unlike the entries above, this one defines no anchor and no alias in the
# visible file refers to it.
# NOTE(review): presumably read directly by key name by the launcher - confirm.
MM_TOOL_PATH: ./mindspeed_mm/tools/tools.json
# Trainer arguments. All entries below are children of `gpt_args`; they must
# stay indented under it, otherwise `gpt_args` parses as null and the
# settings become unrelated top-level keys.
gpt_args:
  # Parallelism / FSDP2
  context_parallel_size: 1
  context_parallel_algo: ulysses_cp_algo
  use_torch_fsdp2: true
  fsdp2_config_path: *FSDP2_PATH
  ckpt_format: torch_dcp

  # Batching
  micro_batch_size: 1
  global_batch_size: 8

  # Optimizer and learning-rate schedule
  # NOTE(review): clip_grad is 0.0 while `clip_grad_async: true` is set in
  # the `patch` section - presumably 0.0 disables clipping; confirm intent.
  clip_grad: 0.0
  optimizer_selection: fused_torch_adamw
  lr: 1.0e-5
  weight_decay: 0.0
  adam_beta1: 0.9
  adam_beta2: 0.999
  lr_decay_style: cosine
  lr_warmup_fraction: 0.1
  train_iters: 10000

  # Checkpoint state handling: optimizer and RNG state are flagged as
  # neither loaded nor saved.
  no_load_optim: true
  no_load_rng: true
  no_save_optim: true
  no_save_rng: true
  seed: 42

  # Logging, evaluation and checkpoint cadence.
  # save_interval and eval_interval both equal train_iters (10000).
  log_interval: 1
  save_interval: 10000
  eval_interval: 10000
  eval_iters: 5000
  save: *SAVE_PATH
  log_tps: true
  load: *MM_MODEL_LOAD_PATH

  # Model initialisation
  use_cpu_initialization: true
  init_model_with_meta_device: true

  # Kernel / fusion switches
  normalization: RMSNorm
  use_fused_rmsnorm: true
  swiglu: true
  use_fused_swiglu: true
  use_flash_attn: true
  no_gradient_accumulation_fusion: true
  no_masked_softmax_fusion: true

  # Vocabulary, sequence length and tokenizer
  vocab_size: 152064
  seq_length: 1024
  make_vocab_size_divisible_by: 1
  tokenizer_type: NullTokenizer
  untie_embeddings_and_output_weights: true

  # Data loading
  num_workers: 8
# LoRA adapter settings. The entries below are children of `lora_args` and
# must stay indented under it.
# NOTE(review): `lora_args` is kept as a top-level section (sibling of
# `gpt_args`); confirm against the config loader that it is not expected
# inside `gpt_args`.
lora_args:
  lora_r: 8
  lora_alpha: 16
  lora_dropout: 0.0
  # Comma-separated names in a single plain string (not a YAML sequence).
  lora_target_modules: v_proj,o_proj,k_proj,q_proj
  lora_register_forward_hook: embed_tokens
# Dataset and dataloader configuration.
# Layout: data -> dataset_param   -> attr / preprocess_parameters /
#                                     basic_parameters
#              -> dataloader_param -> collate_param
data:
  dataset_param:
    dataset_type: huggingface

    # Names of the fields and role tags used in each dataset record.
    # Entries set to null are left unconfigured.
    attr:
      system: null
      images: images
      videos: null
      messages: messages
      role_tag: role
      content_tag: content
      user_tag: user
      assistant_tag: assistant
      observation_tag: null
      function_tag: null
      system_tag: null

    # Tokenizer / processor options and image/video limits.
    preprocess_parameters:
      model_name_or_path: *HF_MODEL_LOAD_PATH
      use_fast_tokenizer: true
      split_special_tokens: false
      image_max_pixels: 262144
      image_min_pixels: 1024
      video_max_pixels: 16384
      video_min_pixels: 0
      video_fps: 2.0
      video_maxlen: 64

    # Chat template, dataset location and preprocessing cache.
    basic_parameters:
      template: qwen3_vl_nothink
      enable_thinking: false
      train_on_prompt: false
      mask_history: false
      tool_format: null
      dataset_dir: ./data
      dataset: *DATASET_PATH
      cache_dir: ./data/cache_dir
      overwrite_cache: false
      preprocessing_batch_size: 1000
      preprocessing_num_workers: 16
      max_samples: null

  dataloader_param:
    pin_memory: true
    shuffle: true
    dataloader_mode: sampler
    drop_last: true
    sampler_type: BaseRandomBatchSampler
    collate_param:
      model_name: qwen3vl
      ignore_pad_token_for_loss: true
# Model definition. Each sub-model has its own mapping so that repeated keys
# (model_id, num_layers, hidden_size, freeze, attn_*, ...) do not collide at
# a single level.
# Layout: model -> image_encoder -> vision_encoder / vision_projector
#               -> text_decoder
#               -> loss_cfg
#               -> patch
model:
  model_id: qwen3_vl_moe
  init_from_hf_path: *HF_MODEL_LOAD_PATH

  image_encoder:
    # Vision encoder; frozen (freeze: true).
    vision_encoder:
      model_id: qwen3vit
      num_layers: 27
      hidden_size: 1152
      num_attention_heads: 16
      freeze: true
      attn_implementation: flash_attention_2
      attn_layout: TND
      synchronize_per_layer: true
      recompute_granularity: full

    # Vision projector; frozen (freeze: true).
    vision_projector:
      model_id: lnmlp
      num_layers: 1
      freeze: true

  # Text decoder; the only sub-model with freeze: false.
  text_decoder:
    model_id: qwen3lm
    num_layers: 48
    hidden_size: 2048
    num_attention_heads: 32
    max_position_embeddings: 262144
    freeze: false
    use_npu_fused_moe: false
    attn_implementation: flash_attention_2
    attn_layout: TND
    # NOTE(review): is_causal is false for the decoder - presumably causal
    # masking is applied elsewhere for the TND layout; confirm.
    is_causal: false
    activation_offload: false
    synchronize_per_layer: true
    recompute_granularity: full

  # NOTE(review): all four loss-related keys are grouped under `loss_cfg`;
  # confirm `router_aux_loss_coef` and `loss_type` are read from here.
  loss_cfg:
    compute_mode: default
    chunk_size: 1024
    router_aux_loss_coef: 0.0
    loss_type: default

  # NOTE(review): `patch` is placed under `model`; confirm the patch loader
  # reads it from the model config rather than from the document root.
  patch:
    clip_grad_async: true
    bridge_patch: false