# global run settings
seed: 0
# path to save checkpoint/strategy
output_dir: './output'
# required: must be filled in before running
load_checkpoint: ''
src_strategy_path_or_dir: ''
# if True, automatically transform load_checkpoint so that it can be loaded
# by the distributed model
auto_trans_ckpt: False
only_save_strategy: False
resume_training: False
run_mode: 'predict'



# trainer: which trainer class to build and the model name it is bound to
trainer:
  type: CausalLanguageModelingTrainer
  model_name: 'yi_34b'



# runner: epoch count, batch size and data-sink settings
runner_config:
  epochs: 2
  batch_size: 32
  sink_mode: True
  sink_size: 2



# optimizer settings
# NOTE: keep the '1.e-8' / '5.e-5' spellings. YAML 1.1 loaders such as PyYAML
# only resolve an exponent literal as a float when it contains a '.' and a
# signed exponent; a bare '1e-8' would be loaded as a string.
optimizer:
  type: AdamWeightDecayX
  beta1: 0.9
  beta2: 0.95
  eps: 1.e-8  # i.e. 1e-8
  learning_rate: 5.e-5



# learning-rate schedule
lr_schedule:
  type: CosineWithWarmUpLR
  learning_rate: 5.e-5
  lr_end: 0
  warmup_ratio: 0
  # -1 means the total step count is taken from the dataset
  total_steps: -1



# training dataset (anchored so that train_dataset_task can alias it)
train_dataset: &train_dataset
  data_loader:
    type: MindDataset
    dataset_dir: ''
    shuffle: True
  # columns: "input_ids", "labels"; labels are used in instruction finetune
  input_columns:
    - 'input_ids'
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: True
  batch_size: 4
  repeat: 1
  numa_enable: False
  prefetch_size: 1

# dataset task wrapping the train_dataset mapping (via its YAML alias)
train_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *train_dataset

# Evaluate during training when True; when False nothing is done.
# The task trainer must support the _evaluate_in_training function.
do_eval: False
# number of steps between evaluations; -1 disables step-end evaluation
eval_step_interval: -1
# number of epochs between evaluations; 1 evaluates at every epoch end
eval_epoch_interval: 50



# evaluation dataset (anchored so that eval_dataset_task can alias it)
eval_dataset: &eval_dataset
  data_loader:
    type: MindDataset
    dataset_dir: ''
    shuffle: False
  input_columns:
    - 'input_ids'
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: False
  repeat: 1
  numa_enable: False
  prefetch_size: 1

# dataset task wrapping the eval_dataset mapping (via its YAML alias)
eval_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *eval_dataset



use_parallel: True
# parallel context config
parallel:
  # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel
  parallel_mode: 1
  gradients_mean: False
  enable_alltoall: False
  full_batch: True
  search_mode: 'sharding_propagation'
  enable_parallel_optimizer: False
  strategy_ckpt_save_file: './ckpt_strategy.ckpt'
  parallel_optimizer_config:
    gradient_accumulation_shard: False
    parallel_optimizer_threshold: 64

# default parallel layout for Atlas 800
# NOTE(review): data_parallel * model_parallel * pipeline_stage is
# 1 * 4 * 1 = 4 here, not 8 -- confirm the intended device count.
parallel_config:
  data_parallel: 1
  model_parallel: 4
  pipeline_stage: 1
  use_seq_parallel: False
  micro_batch_num: 4
  vocab_emb_dp: True
  gradient_aggregation_group: 4
# When model_parallel is greater than 1, micro_batch_interleave_num can be set
# to 2, which may speed up training.
micro_batch_interleave_num: 1



# recompute switches
recompute_config:
  recompute: False
  select_recompute: False
  parallel_optimizer_comm_recompute: False
  mp_comm_recompute: True
  recompute_slice_activation: True



# callbacks, listed in the order they are declared
callbacks:
  - type: MFLossMonitor
  - type: CheckpointMonitor
    prefix: 'yi_34b'
    save_checkpoint_steps: 100
    integrated_save: False
    async_save: False
  - type: ObsMonitor



# mindspore context init config
context:
  # 0 -- Graph Mode; 1 -- Pynative Mode
  mode: 0
  device_target: 'Ascend'
  enable_graph_kernel: False
  max_call_depth: 10000
  max_device_memory: '58GB'
  save_graphs: False
  save_graphs_path: './graph'



# model config
model:
  model_config:
    type: LlamaConfig
    # add for increase predict
    batch_size: 32
    # NOTE(review): seq_length (16384) is larger than max_position_embedding
    # (4096) and pretrain_seqlen (4096) while extend_method is "None" --
    # confirm this combination is intended.
    seq_length: 16384
    hidden_size: 7168
    num_layers: 60
    num_heads: 56
    max_position_embedding: 4096
    vocab_size: 64000
    intermediate_size: 20480
    multiple_of: 256
    n_kv_heads: 8
    rms_norm_eps: 1.0e-5
    bos_token_id: 6
    eos_token_id: 7
    pad_token_id: 0
    ignore_token_id: -100
    theta: 5000000.0
    compute_dtype: 'bfloat16'
    layernorm_compute_type: 'float32'
    softmax_compute_type: 'float32'
    rotary_dtype: 'bfloat16'
    param_init_type: 'bfloat16'
    use_past: True
    # seqlen of the pretrain checkpoint: 2048 for llama and 4096 for llama2
    pretrain_seqlen: 4096
    # supported values: "None", "PI", "NTK"
    extend_method: 'None'
    compute_in_2d: False
    # FA can accelerate training or finetune
    use_flash_attention: True
    block_size: 128
    num_blocks: 2048
    offset: 0
    use_past_shard: False
    checkpoint_name_or_path: ''
    repetition_penalty: 1.0
    max_decode_length: 4074
    top_k: 40
    top_p: 0.8
    temperature: 0.6
    do_sample: False
    is_dynamic: True
  arch:
    type: LlamaForCausalLM



# processor and tokenizer settings
processor:
  return_tensors: ms
  tokenizer:
    additional_special_tokens:
      - '<|im_start|>'
      - '<|im_end|>'
      - '<|im_sep|>'
    unk_token: '<unk>'
    bos_token: '<|startoftext|>'
    eos_token: '<|endoftext|>'
    pad_token: '<unk>'
    type: LlamaTokenizer
    # required: must be filled in before running
    vocab_file: ''
    add_bos_token: False
    add_eos_token: False
    # Kept double-quoted on purpose so that YAML processes the \n escape
    # sequences inside the template.
    chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
  type: LlamaProcessor



# evaluation metric
metric:
  type: PerplexityMetric



# wrapper cell config: train-one-step cell with a dynamic loss-scale updater
runner_wrapper:
  type: MFTrainOneStepCell
  scale_sense:
    type: DynamicLossScaleUpdateCell
    loss_scale_value: 65536
    scale_factor: 2
    scale_window: 1000
  use_clip_grad: True



# callbacks used for evaluation
eval_callbacks:
  - type: ObsMonitor



# auto-tune options
auto_tune: False
filepath_prefix: './autotune'
autotune_per_step: 10



# profile options
profile: False
profile_start_step: 1
profile_stop_step: 10
init_start_profile: False
profile_communication: False
profile_memory: True
# layer scale options
layer_scale: False
layer_decay: 0.65
lr_scale_factor: 256



# AICC: placeholder value, to be replaced with the OBS url on the AICC platform
remote_save_url: 'Please input obs url on AICC platform.'