# Top-level run settings.
seed: 0
# Path to save checkpoint/strategy.
output_dir: './output'
load_checkpoint: ''
src_strategy_path_or_dir: ''
# If true, load_checkpoint is automatically transformed so that it can be
# loaded into the distributed model.
auto_trans_ckpt: False
only_save_strategy: False
resume_training: False
run_mode: 'finetune'



# Trainer configuration.
trainer:
  type: CausalLanguageModelingTrainer
  model_name: 'baichuan2_13b'
# If True, evaluate during the training process; if False, do nothing.
# Note: the task trainer must support the _evaluate_in_training function.
do_eval: False



# Runner configuration: epoch count, batch size and sink settings.
runner_config:
  epochs: 1
  batch_size: 1
  sink_mode: True
  sink_size: 2



# Optimizer configuration.
optimizer:
  type: FP32StateAdamWeightDecay
  beta1: 0.9
  beta2: 0.95
  eps: 1.e-8



# Learning-rate schedule.
lr_schedule:
  type: CosineWithWarmUpLR
  # pretrain: 3.e-4
  learning_rate: 2.e-5
  # pretrain: 3.e-5
  lr_end: 1.e-6
  warmup_ratio: 0.03
  # -1 means the total steps of the dataset will be loaded.
  total_steps: -1



# Training dataset. The &train_dataset anchor is reused by
# train_dataset_task.dataset_config below.
train_dataset: &train_dataset
  data_loader:
    type: MindDataset
    dataset_dir: ''
    shuffle: True
  # Columns 'input_ids' and 'labels'; labels are used in instruction finetune.
  input_columns:
    - 'input_ids'
    - 'labels'
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: True
  repeat: 1
  numa_enable: False
  prefetch_size: 1
train_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *train_dataset



# Evaluation dataset. The &eval_dataset anchor is reused by
# eval_dataset_task.dataset_config below.
eval_dataset: &eval_dataset
  data_loader:
    type: MindDataset
    dataset_dir: ''
    shuffle: False
  input_columns:
    - 'input_ids'
    - 'labels'
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: False
  repeat: 1
  numa_enable: False
  prefetch_size: 1
eval_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *eval_dataset



use_parallel: True
# Parallel context configuration.
parallel:
  # 0: data parallel, 1: semi-auto parallel, 2: auto parallel,
  # 3: hybrid parallel
  parallel_mode: 1
  gradients_mean: False
  enable_alltoall: False
  full_batch: True
  search_mode: 'sharding_propagation'
  enable_parallel_optimizer: True
  strategy_ckpt_save_file: './ckpt_strategy.ckpt'
  parallel_optimizer_config:
    gradient_accumulation_shard: False
    parallel_optimizer_threshold: 64

# Default parallel layout for device num = 8 on Atlas 800T A2.
parallel_config:
  data_parallel: 8
  model_parallel: 1
  pipeline_stage: 1
  use_seq_parallel: False
  micro_batch_num: 1
  vocab_emb_dp: True
  gradient_aggregation_group: 4
# When model_parallel is greater than 1, micro_batch_interleave_num can be
# set to 2, which may accelerate the training process.
micro_batch_interleave_num: 1



# Recompute configuration.
recompute_config:
  recompute: True
  select_recompute: False
  parallel_optimizer_comm_recompute: False
  mp_comm_recompute: True
  recompute_slice_activation: True



# Training callbacks, listed in their original order.
callbacks:
  - type: MFLossMonitor
  # NOTE(review): 'CheckpointMointor' looks like a misspelling of
  # 'CheckpointMonitor'; presumably it has to match the class name registered
  # by the framework, so verify before renaming.
  - type: CheckpointMointor
    prefix: 'baichuan2_13b'
    save_checkpoint_steps: 1000
    keep_checkpoint_max: 5
    integrated_save: False
    async_save: False
  - type: ObsMonitor



# MindSpore context initialization configuration.
context:
  # 0: Graph Mode; 1: Pynative Mode
  mode: 0
  device_target: 'Ascend'
  enable_graph_kernel: False
  # Folded scalar: the lines below are joined with single spaces into one
  # flag string, with no trailing newline.
  graph_kernel_flags: >-
    --disable_expand_ops=Softmax,Dropout
    --enable_parallel_fusion=true
    --reduce_fuse_depth=8
    --enable_auto_tensor_inplace=true
  max_call_depth: 10000
  max_device_memory: '60GB'
  save_graphs: False
  save_graphs_path: './graph'
  device_id: 0



# Model configuration: hyper-parameters under model_config, architecture
# class under arch.
model:
  model_config:
    type: LlamaConfig
    batch_size: 1  # added for incremental predict
    seq_length: 4096
    hidden_size: 5120
    num_layers: 40
    num_heads: 40
    vocab_size: 125696
    multiple_of: 128
    rms_norm_eps: 1.0e-6
    bos_token_id: 1
    eos_token_id: 2
    pad_token_id: 0
    ignore_token_id: -100
    compute_dtype: 'float16'
    layernorm_compute_type: 'float32'
    softmax_compute_type: 'float16'
    param_init_type: 'float16'
    # NOTE(review): use_past is enabled while run_mode at the top of this
    # file is 'finetune'; confirm this combination is intended.
    use_past: True
    # Sequence length of the pretrain checkpoint:
    # 2048 for llama and 4096 for llama2.
    pretrain_seqlen: 2048
    # Supported values: 'None', 'PI', 'NTK'.
    extend_method: 'None'
    compute_in_2d: True
    use_flash_attention: True
    block_size: 16
    num_blocks: 512
    is_dynamic: False
    offset: 0
    use_past_shard: False
    checkpoint_name_or_path: 'path/to/baichuan2-13B-Chat.ckpt'
    repetition_penalty: 1
    temperature: 1.0
    max_decode_length: 512
    top_k: 3
    top_p: 1
    do_sample: False
  arch:
    type: Baichuan13BV2ForCausalLM



# Processor and tokenizer configuration.
processor:
  return_tensors: ms
  tokenizer:
    vocab_file: 'path/to/tokenizer.model'
    unk_token: '<unk>'
    bos_token: '<s>'
    eos_token: '</s>'
    # pad_token uses the same '<unk>' string as unk_token.
    pad_token: '<unk>'
    type: Baichuan2Tokenizer
  type: LlamaProcessor



# Metric configuration.
metric:
  type: PerplexityMetric



# Wrapper cell configuration, including the loss-scale update cell.
runner_wrapper:
  type: MFTrainOneStepCell
  scale_sense:
    type: DynamicLossScaleUpdateCell
    loss_scale_value: 65536
    scale_factor: 2
    scale_window: 1000
  use_clip_grad: True



# Callbacks listed for evaluation.
eval_callbacks:
  - type: ObsMonitor



# Auto-tune settings.
auto_tune: False
filepath_prefix: './autotune'
autotune_per_step: 10



# Profiling settings.
profile: False
profile_start_step: 1
profile_stop_step: 10
init_start_profile: False
profile_communication: False
profile_memory: True

# Layer-scale / layer-decay settings.
layer_scale: False
layer_decay: 0.65
lr_scale_factor: 256



# AICC: the value below is a placeholder asking for an OBS URL on the AICC
# platform.
remote_save_url: 'Please input obs url on AICC platform.'