---
# Flat key/value settings, one entry per line so the document parses as a
# single top-level mapping.
# NOTE(review): the per-key notes below are inferred from the key names only;
# confirm against the code that consumes this file.

# Batching
batch_size: 10
gradient_accumulation_steps: 1

# Optimizer / schedule
lr: 1.5e-4
# NOTE(review): 0.01 may be a fraction of total steps rather than a step
# count -- confirm.
warm_up: 0.01
# Keep the decimal point: YAML 1.1 resolvers (e.g. PyYAML) only treat
# exponent notation as a float when it contains a "." and a signed exponent;
# a bare 1e-8 would be loaded as a string.
eps: 1.e-8
bmt_loss_scale: 131072  # 2**17

# Intervals (NOTE(review): presumably counted in steps -- confirm)
save_interval: 1000
log_interval: 10

# Flags -- canonical lowercase booleans
save_optim: true
save_rng: true