# Directory the script was launched from, captured before the `cd` below.
# (Not referenced in the visible part of this script.)
script_path=$(pwd)

# Exported as PYTHONPATH for the python process launched further down; also
# used as the base path for locating the preprocessing tool.
export PYTHONPATH=/home/ldwang/WorkSpace/FlagAI.ftgreat

# Preprocessing entry point plus the input and output locations.
PREPROCESS_DATA_TOOL="$PYTHONPATH/script/preprocess_data_flagai_args.py"
INPUT_DIR=/share/project/bowen/pretrain_dataset_batch3
OUTPUT_DIR=/share/project/ldwang/data/indexed_dataset/batch1_tok100k

# Handed to the tool as --model-dir / --model-name.
TOKENIZER_DIR=/home/ldwang/WorkSpace/FlagAI.ftgreat/examples/gpt3_pretrain
TOKENIZER_NAME=gpt2_new_100k

# The tool is given a bare --output-prefix, so its outputs land in the current
# working directory, while the "already processed" check looks in OUTPUT_DIR.
# If this cd failed silently the two would disagree, so stop right here.
cd "$OUTPUT_DIR" || { echo "cannot cd to $OUTPUT_DIR" >&2; exit 1; }

## Maximum number of preprocessing jobs started before the script blocks
## and waits for the running ones to finish.
degree=2

# Number of jobs launched so far (skipped inputs are not counted).
i=0
# Expand the path directly instead of parsing `ls` output, so paths containing
# whitespace or glob characters are handled as a single word.
for file in "$INPUT_DIR"/cn/wudao_base.jsonl
do
	# A pattern/path that matches nothing is left as-is by the shell; skip it.
	if [ ! -f "$file" ]; then
		echo "No such input file: $file" >&2
		continue
	fi
	echo "$file"

	# part = "<parent directory name>_<file name up to its first dot>",
	# e.g. .../cn/wudao_base.jsonl -> cn_wudao_base.
	# Built with parameter expansion rather than by splicing the path into an
	# awk program, which would break on quotes or backslashes in the path.
	parent_dir=${file%/*}
	file_name=${file##*/}
	part="${parent_dir##*/}_${file_name%%.*}"

	# An existing .idx file marks this input as already processed.
	if [ -f "$OUTPUT_DIR/${part}_text_document.idx" ]; then
		echo "PassBy, $OUTPUT_DIR/${part}_text_document.idx"
		continue
	fi
	echo "Processing, $part"
	python "$PREPROCESS_DATA_TOOL" --input "$file" --output-prefix "$part" --workers 16 --chunk-size 256 --model-name "$TOKENIZER_NAME" --model-dir "$TOKENIZER_DIR" &
	i=$((i + 1))
	echo "$i"
	# After every $degree launches, block until all background jobs finish.
	if [ $((i % degree)) -eq 0 ]; then
		wait
	fi
done

# Reap jobs from a final, partially filled batch so the script does not exit
# while preprocessing is still running.
wait