# ---- Top-level run settings ----
# Seed value for the run.
seed: 0
# Directory that receives the run's output files.
output_dir: './output'
# Left empty here: no checkpoint path is supplied through this key.
load_checkpoint: ''
use_parallel: True
# This configuration is set up for the 'predict' run mode.
run_mode: 'predict'
use_legacy: False
# Checkpoint file format to load.
load_ckpt_format: 'safetensors'
# Trainer selection. The two settings below are nested under `trainer`;
# with every key at column 0, `trainer` parsed as null and `type` /
# `model_name` became unrelated top-level keys.
trainer:
  type: CausalLanguageModelingTrainer
  model_name: deepseek_v3
# Parallel layout. The degrees are nested under `parallel_config` so the
# section is a mapping rather than null.
# NOTE(review): 1 x 32 presumably targets a 32-device launch -- confirm
# against the launch script.
parallel_config:
  data_parallel: 1
  model_parallel: 32
# Placeholder path; replace with the real pretrained model directory.
# NOTE(review): "hf_dir" suggests a Hugging Face-format directory -- confirm.
pretrained_model_dir: '/path/hf_dir'
# Model settings. Nesting restored: `model` contains `model_config`, which
# contains the individual settings. With every key at column 0, both
# `model` and `model_config` parsed as null.
model:
  model_config:
    # Numeric precision per component; softmax is the only one kept in
    # float32, everything else uses bfloat16.
    compute_dtype: "bfloat16"
    layernorm_compute_dtype: "bfloat16"
    softmax_compute_dtype: "float32"
    rotary_dtype: "bfloat16"
    params_dtype: "bfloat16"
    moe_router_fusion: True
    # NOTE(review): block_size / num_blocks presumably size a block-based
    # KV cache -- confirm against the model implementation.
    block_size: 128
    num_blocks: 512
    use_fused_mla: False
# Runtime context settings. Children are nested under `context` so the
# section is a mapping rather than null.
context:
  # 0 selects graph mode in MindSpore's context API (1 is PyNative mode).
  mode: 0
  max_device_memory: "59GB"
  device_id: 0
  device_target: "Ascend"
  # NOTE(review): a plain `None` is loaded by YAML as the string "None",
  # not as null. Left unchanged because the consumer may depend on this
  # spelling -- confirm, and switch to `null` if a real null is intended.
  affinity_cpu_list: None
# Parallel context settings. Children are nested under `parallel` so the
# section is a mapping rather than null.
parallel:
  parallel_mode: "MANUAL_PARALLEL"
  full_batch: False