{
"pretrain_dataset": [
{
"params": {
"input-dataset": "/data/ci/datasets/origin/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
"test-out-part": "/data/ci/cache/process_dataset/test_merge_subs/",
"base-out-part": "/data/ci/datasets/processed/base_merge_subs/",
"test-out-merge": "/data/ci/cache/process_dataset/test_merge/",
"base-out-merge": "/data/ci/datasets/processed/base_merge/"
}
}
],
"test_pretrain_datasets_part1": [
{
"params": {
"input": "/data/ci/datasets/origin/0001-alpaca.parquet",
"tokenizer-type": "PretrainedFromHF",
"output-prefix": "/data/ci/cache/process_dataset/test_merge_subs/part1",
"tokenizer-name-or-path": "/data/ci/models/llama2/hf/llama-2-7b-hf",
"cache-dir": "/data/ci/cache/process_dataset/tmp/",
"workers": 4,
"log-interval": 1000
}
}
],
"test_pretrain_datasets_part2": [
{
"params": {
"input": "/data/ci/datasets/origin/0002-alpaca.parquet",
"tokenizer-type": "PretrainedFromHF",
"output-prefix": "/data/ci/cache/process_dataset/test_merge_subs/part2",
"tokenizer-name-or-path": "/data/ci/models/llama2/hf/llama-2-7b-hf",
"cache-dir": "/data/ci/cache/process_dataset/tmp/",
"workers": 4,
"log-interval": 1000
}
}
],
"test_merge_pretrain_datasets": [
{
"params": {
"input": "/data/ci/cache/process_dataset/test_merge_subs/",
"output-prefix": "/data/ci/cache/process_dataset/test_merge/merge",
"merge-group-keys": ["text_document"]
}
}
],
"instruction_dataset": [
{
"params": {
"input-dataset": "/data/ci/datasets/origin/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
"test-out-part": "/data/ci/cache/process_dataset/test_ins_subs/",
"base-out-part": "/data/ci/datasets/processed/base_ins_subs/",
"test-out-merge": "/data/ci/cache/process_dataset/test_ins_merge/",
"base-out-merge": "/data/ci/datasets/processed/base_ins_merge/"
}
}
],
"test_instruction_datasets_part1": [
{
"params": {
"input": "/data/ci/datasets/origin/0001-alpaca.parquet",
"tokenizer-type": "PretrainedFromHF",
"handler-name": "GeneralInstructionHandler",
"output-prefix": "/data/ci/cache/process_dataset/test_ins_subs/part1",
"tokenizer-name-or-path": "/data/ci/models/llama2/hf/llama-2-7b-hf",
"cache-dir": "/data/ci/cache/process_dataset/tmp/",
"workers": 4,
"log-interval": 1000,
"append-eod": null
}
}
],
"test_instruction_datasets_part2": [
{
"params": {
"input": "/data/ci/datasets/origin/0002-alpaca.parquet",
"tokenizer-type": "PretrainedFromHF",
"handler-name": "GeneralInstructionHandler",
"output-prefix": "/data/ci/cache/process_dataset/test_ins_subs/part2",
"tokenizer-name-or-path": "/data/ci/models/llama2/hf/llama-2-7b-hf",
"cache-dir": "/data/ci/cache/process_dataset/tmp/",
"workers": 4,
"log-interval": 1000,
"append-eod": null
}
}
],
"test_merge_instrction_datasets": [
{
"params": {
"input": "/data/ci/cache/process_dataset/test_ins_subs/",
"output-prefix": "/data/ci/cache/process_dataset/test_ins_merge/merge",
"merge-group-keys": ["packed_attention_mask_document", "packed_input_ids_document", "packed_labels_document"]
}
}
]
}