# Top-level run settings.
seed: 0
# Path to save checkpoint/strategy.
output_dir: './output'
load_checkpoint: './internlm.ckpt'
src_strategy_path_or_dir: ''
# If true, load_checkpoint is automatically transformed so that it can be
# loaded into the distributed model.
auto_trans_ckpt: false
only_save_strategy: false
resume_training: false
use_parallel: true
run_mode: 'finetune'



# trainer config: trainer type and model name
trainer:
  type: CausalLanguageModelingTrainer
  model_name: 'internlm_7b_lora'



# runner config: epoch count, batch size and sink settings
runner_config:
  epochs: 1
  batch_size: 4
  sink_mode: true
  sink_size: 2



# optimizer: type plus its beta/eps/weight-decay hyperparameters
optimizer:
  type: FP32StateAdamWeightDecay
  beta1: 0.9
  beta2: 0.999
  # NOTE: keep the decimal point in the exponent form. YAML 1.1 loaders such
  # as PyYAML resolve a dot-less literal like 1e-8 as a string, not a float.
  eps: 1.e-8
  weight_decay: 0.01



# lr schedule

lr_schedule:
  type: CosineWithWarmUpLR
  learning_rate: 5.e-5
  warmup_ratio: 0.03
  # -1 means the total number of steps is loaded from the dataset.
  total_steps: -1



# dataset: training data pipeline. The &train_dataset anchor defined here is
# reused below as train_dataset_task.dataset_config.
train_dataset: &train_dataset
  data_loader:
    type: MindDataset
    # NOTE(review): left empty here; presumably must be pointed at the
    # training data before running. Confirm against the consumer.
    dataset_dir: ''
    shuffle: true
  # "labels" is used in instruction finetuning.
  input_columns:
    - 'input_ids'
    - 'labels'
  num_parallel_workers: 8
  python_multiprocessing: false
  drop_remainder: true
  repeat: 1
  numa_enable: false
  prefetch_size: 1
train_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *train_dataset

# If true, run evaluation during training; if false, no evaluation is run.

# Note that the task trainer must support the _evaluate_in_training function.

do_eval: false
# Number of steps between evaluations; -1 means no step-end evaluation.
eval_step_interval: -1
# Number of epochs between evaluations; 1 means evaluate at every epoch end.
eval_epoch_interval: 50



# eval dataset: evaluation data pipeline. The &eval_dataset anchor defined
# here is reused below as eval_dataset_task.dataset_config.
eval_dataset: &eval_dataset
  data_loader:
    type: MindDataset
    # NOTE(review): left empty here; presumably must be pointed at the
    # evaluation data before running. Confirm against the consumer.
    dataset_dir: ''
    shuffle: false
  input_columns:
    - 'input_ids'
  num_parallel_workers: 8
  python_multiprocessing: false
  drop_remainder: false
  repeat: 1
  numa_enable: false
  prefetch_size: 1
eval_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *eval_dataset



# Default parallel settings for 8 devices (Atlas 800T A2)

parallel_config:
  # data_parallel x model_parallel x pipeline_stage = 8 x 1 x 1 = 8
  # (presumably this product must equal the device count; confirm).
  data_parallel: 8
  model_parallel: 1
  pipeline_stage: 1
  micro_batch_num: 1
  vocab_emb_dp: true
  gradient_aggregation_group: 4

# When model_parallel is greater than 1, micro_batch_interleave_num can be set
# to 2, which may speed up training.

micro_batch_interleave_num: 1



# recompute config: recomputation switches
recompute_config:
  recompute: true
  parallel_optimizer_comm_recompute: false
  mp_comm_recompute: true
  recompute_slice_activation: true



# callbacks: three entries, each selected by its `type`
callbacks:
  - type: MFLossMonitor
  # NOTE(review): the spelling "CheckpointMointor" presumably matches the name
  # registered by the framework; verify before "correcting" it.
  - type: CheckpointMointor
    prefix: 'internlm_7b_lora'
    save_checkpoint_steps: 500
    # Whether to save fine-tuned weights additionally.
    save_trainable_params: false
    integrated_save: false
    async_save: false
    keep_checkpoint_max: 3
  - type: ObsMonitor



# mindspore context init config
context:
  # 0: Graph Mode; 1: Pynative Mode
  mode: 0
  device_target: 'Ascend'
  enable_graph_kernel: false
  # Folded scalar: the lines below are joined with single spaces into one
  # flag string, with no trailing newline.
  graph_kernel_flags: >-
    --disable_expand_ops=Softmax,Dropout
    --enable_parallel_fusion=true
    --reduce_fuse_depth=8
    --enable_auto_tensor_inplace=true
  max_call_depth: 10000
  max_device_memory: '59GB'
  save_graphs: false
  save_graphs_path: './graph'
  device_id: 0



# parallel context config
parallel:
  # 0: dataset, 1: semi, 2: auto, 3: hybrid
  parallel_mode: 1
  gradients_mean: false
  enable_alltoall: false
  full_batch: true
  search_mode: 'sharding_propagation'
  enable_parallel_optimizer: true
  strategy_ckpt_config:
    save_file: './ckpt_strategy.ckpt'
    only_trainable_params: false
  parallel_optimizer_config:
    gradient_accumulation_shard: false
    parallel_optimizer_threshold: 64



# model config: model hyperparameters (model_config) and architecture (arch)
model:
  model_config:
    type: InternLMConfig
    # Added for incremental prediction.
    batch_size: 1
    seq_length: 2048
    hidden_size: 4096
    num_layers: 32
    num_heads: 32
    vocab_size: 103168
    multiple_of: 256
    rms_norm_eps: 1.0e-6
    bos_token_id: 1
    eos_token_id: 2
    # Same id as eos_token_id.
    pad_token_id: 2
    ignore_token_id: -100
    compute_dtype: 'float16'
    layernorm_compute_type: 'float32'
    softmax_compute_type: 'float32'
    rotary_dtype: 'float16'
    param_init_type: 'float16'
    has_bias: true
    use_past: false
    scaling_factor: 1.0
    # The quoted string 'None', not a YAML null.
    extend_method: 'None'
    use_flash_attention: false
    offset: 0
    checkpoint_name_or_path: 'internlm_7b_lora'
    repetition_penalty: 1.02
    max_decode_length: 100
    top_k: 50
    top_p: 0.9
    do_sample: true
    pet_config:
      pet_type: lora
      # configuration of lora
      lora_rank: 8
      lora_alpha: 16
      lora_dropout: 0.05
      target_modules: '.*wq|.*wk|.*wv|.*wo'
  arch:
    type: InternLMForCausalLM



# processor: processor type and its tokenizer settings
processor:
  return_tensors: ms
  tokenizer:
    unk_token: '<unk>'
    bos_token: '<s>'
    eos_token: '</s>'
    # Same token string as eos_token.
    pad_token: '</s>'
    type: InternLMTokenizer
    vocab_file: './internlm-chat/tokenizer.model'
  type: LlamaProcessor



# metric: metric type
metric:
  type: PerplexityMetric



# wrapper cell config: train-step wrapper type and loss-scale settings
runner_wrapper:
  type: MFTrainOneStepCell
  scale_sense:
    type: DynamicLossScaleUpdateCell
    loss_scale_value: 16384
    scale_factor: 2
    scale_window: 1000
  use_clip_grad: true



# eval callbacks: a single ObsMonitor entry
eval_callbacks:
  - type: ObsMonitor



# auto-tune settings
auto_tune: false
filepath_prefix: './autotune'
autotune_per_step: 10



# profiling settings
profile: false
profile_start_step: 1
profile_stop_step: 10
init_start_profile: false
profile_communication: false
profile_memory: true

# layer scale / learning-rate scale settings
layer_scale: false
layer_decay: 0.65
lr_scale_factor: 256



# aicc: the value below is a placeholder asking for an OBS URL on the AICC
# platform.
remote_save_url: 'Please input obs url on AICC platform.'