# Parallelism settings: FSDP sharding plan, Ulysses size and expert-parallel plan.
# NOTE(review): no nesting indentation is visible in this view, so repeated
# keys such as `apply_modules` cannot be told apart by depth — confirm the
# intended hierarchy against the config loader's schema before editing.
parallel:
# `auto` is a plain string, not a number; presumably the loader derives the
# shard-group size itself — TODO confirm.
fully_shard_parallel_size: auto
fsdp_plan:
# Module paths listed for the FSDP plan. `{*}` presumably expands over the
# numbered children (encoder blocks / layers) — verify against the plan parser.
apply_modules:
- vision_tower
- vision_tower.encoder.blocks.{*}
- mm_projector
- language_model.model
- language_model.model.embed_tokens
- language_model.model.layers.{*}
- language_model.model.layers.{*}.mlp.experts
- language_model.lm_head
# Only `language_model.model.layers.{*}` is listed as a hook module.
hook_modules:
- language_model.model.layers.{*}
# Both prefetch counts are 1; presumably the number of modules prefetched
# ahead in the forward and backward pass — TODO confirm.
num_to_forward_prefetch: 1
num_to_backward_prefetch: 1
# bf16 for parameters, fp32 for reductions (presumably the FSDP
# mixed-precision policy — confirm against the consumer).
param_dtype: bf16
reduce_dtype: fp32
ulysses_parallel_size: 1
# NOTE(review): presumably the launch world size must be compatible with an
# expert-parallel group of 128 — confirm before running on a smaller cluster.
expert_parallel_size: 128
ep_plan:
# Same experts path that also appears in the FSDP module list above.
apply_modules:
- language_model.model.layers.{*}.mlp.experts
# Dataset, preprocessing and dataloader settings.
data:
dataset_param:
dataset_type: huggingface
# Field/tag names; every key here maps to an identically named value except
# the `*_tag` entries. Presumably these name columns and role/content markers
# in the dataset records — TODO confirm against the dataset reader.
attr:
images: images
messages: messages
role_tag: role
content_tag: content
user_tag: user
assistant_tag: assistant
preprocess_parameters:
# Defines the YAML anchor `HF_MODEL_LOAD_PATH`; the same value is reused
# later in this file through the alias `*HF_MODEL_LOAD_PATH`.
model_name_or_path: &HF_MODEL_LOAD_PATH mindspeed_mm/fsdp/models/kimik2_5
trust_remote_code: true
use_fast_tokenizer: true
split_special_tokens: false
# Pixel budgets: 262144 = 512 * 512, 1024 = 32 * 32, 16384 = 128 * 128.
image_max_pixels: 262144
image_min_pixels: 1024
video_max_pixels: 16384
video_min_pixels: 0
# Presumably the video sampling rate and the maximum number of sampled
# frames — TODO confirm units with the preprocessor.
video_fps: 2.0
video_maxlen: 64
basic_parameters:
# Presumably the maximum token length per sample — TODO confirm.
cutoff_len: 1024
template: kimi_k25
enable_thinking: false
train_on_prompt: false
mask_history: false
dataset_dir: /data
# Defines the anchor `DATASET_PATH`; no `*DATASET_PATH` alias appears in the
# visible lines of this file. The path sits under `dataset_dir` above.
dataset: &DATASET_PATH /data/mllm_format_llava_instruct_data.json
cache_dir: ./cache_dir/
overwrite_cache: false
preprocessing_batch_size: 1000
preprocessing_num_workers: 16
# Explicit null; presumably means "no cap on sample count" — TODO confirm.
max_samples: null
dataloader_param:
pin_memory: true
shuffle: true
dataloader_mode: sampler
drop_last: true
sampler_type: BaseRandomBatchSampler
num_workers: 8
collate_param:
# NOTE(review): the collator is named `qwen3vl` while the template and model
# id elsewhere in this file are `kimi_k25` — confirm the reuse is intentional.
model_name: qwen3vl
ignore_pad_token_for_loss: true
enable_preload: false
# Model identity plus optional features (chunked loss, recompute, activation
# offload, micro-batch chunking).
model:
# Same identifier as the `template` value in the data settings.
model_id: kimi_k25
# Alias of the anchor defined on the preprocessing `model_name_or_path`, so
# tokenizer/processor and model load from the same path.
model_name_or_path: *HF_MODEL_LOAD_PATH
trust_remote_code: true
features:
loss_cfg:
loss_type: default
# Chunked loss is switched on and targets `language_model.lm_head` with a
# chunk size of 1024 (presumably tokens per chunk — TODO confirm).
enable_chunk_loss: true
chunkloss_plan:
apply_module: language_model.lm_head
chunk_size: 1024
# Recompute is switched on for the vision encoder blocks and the language
# model layers listed below.
recompute: true
recompute_plan:
apply_modules:
- vision_tower.encoder.blocks.{*}
- language_model.model.layers.{*}
# Activation offload is switched off; its plan lists the same two module
# patterns as the recompute plan.
enable_activation_offload: false
activation_offload_plan:
apply_modules:
- vision_tower.encoder.blocks.{*}
- language_model.model.layers.{*}
# Micro-batch chunking is switched off; the plan below is kept for when it is
# enabled.
enable_chunk_mbs: false
chunkmbs_plan:
apply_modules:
- language_model.model.layers.{*}
chunk_mbs: 1
batch_dim: 0
# NOTE(review): key is spelled `chunk_arg_indexs`; keep as-is unless the
# consumer's schema uses a different spelling.
# Presumably positional argument 0 and the named keyword arguments are the
# inputs split along `batch_dim` — TODO confirm with the chunking code.
chunk_arg_indexs: [ 0 ]
chunk_kwarg_names: [ "position_embeddings", "position_ids", "rope_deltas", "attention_mask" ]
# Optimizer, schedule, checkpointing and plugin settings for the training run.
training:
micro_batch_size: 1
gradient_accumulation_steps: 1
seed: 42
# Keep the `1.0e-5` form (decimal point plus signed exponent): some YAML 1.1
# loaders resolve shorter forms such as `1e-5` as a string, not a float.
lr: 1.0e-5
lr_decay_style: cosine
# 0.1 of `train_iters` (10000) would be 1000 iterations, assuming the ratio is
# applied to `train_iters` — TODO confirm.
lr_warmup_ratio: 0.1
weight_decay: 0
train_iters: 10000
# NOTE(review): presumably 0.0 disables gradient clipping rather than
# clipping to zero — confirm with the trainer.
clip_grad: 0.0
init_model_with_meta_device: true
optimizer: adamw
adam_fused: true
# Equal to `train_iters`, so presumably a checkpoint is written only at the
# final iteration — TODO confirm.
save_interval: 10000
use_deter_comp: false
load_rank0_and_broadcast: false
# Plugin paths; the first is the same path used for the model/tokenizer
# `model_name_or_path` anchor elsewhere in this file.
plugin:
- mindspeed_mm/fsdp/models/kimik2_5
- mindspeed_mm/fsdp/data/datasets/huggingface
# Diagnostic tooling: performance profiler and memory snapshot settings.
# Both `enable` flags below are false, so the remaining values only matter
# once a tool is switched on.
tools:
profile:
enable: false
profile_type: static
# Only rank 0 is listed.
ranks: [0]
static_param:
level: level1
with_stack: false
with_memory: false
record_shapes: false
with_cpu: true
save_path: ./profiling
# Window of steps 10 to 11; whether `end_step` is inclusive is not visible
# here — TODO confirm.
start_step: 10
end_step: 11
data_simplification: false
aic_metrics_type: PipeUtilization
memory_profile:
enable: false
# Window of steps 1 to 2; inclusivity of `end_step` not visible here.
start_step: 1
end_step: 2
save_path: ./memory_snapshot
dump_ranks: [0]
stacks: all
# Explicit null; presumably means "no limit on recorded entries" — TODO
# confirm.
max_entries: null
mem_info: false