# Sharded-training options, one key per line.
# NOTE(review): the key names suggest FSDP-style sharding, mixed-precision,
# prefetch, and CPU-offload settings; confirm against the consuming code.
sharding_size: auto
reshard_after_forward: true

# Dtype names are plain strings; their interpretation is up to the consumer.
param_dtype: bf16
reduce_dtype: fp32
cast_forward_inputs: true

num_to_forward_prefetch: 1
num_to_backward_prefetch: 1

offload_to_cpu: false