# Pull the variables defined by the set_env.sh under /usr/local/Ascend/cann
# into the current shell before launching Python.
source /usr/local/Ascend/cann/set_env.sh

# Run ./preprocess_data.py (resolved relative to the current working
# directory) on a single parquet file.
#   --input / --output-prefix : source parquet file and the prefix passed for
#                               the outputs, both under
#                               /data/mindspeed_test/datasets
#   --tokenizer-*             : points at the Qwen3-0.6B directory under
#                               /data/mindspeed_test/model_weights
#   --json-keys text          : presumably the record field to tokenize --
#                               confirm against preprocess_data.py
# One option per line; the argument list handed to python is unchanged.
python ./preprocess_data.py \
  --input /data/mindspeed_test/datasets/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
  --tokenizer-name-or-path /data/mindspeed_test/model_weights/Qwen3-0.6B/ \
  --tokenizer-type PretrainedFromHF \
  --handler-name GeneralPretrainHandler \
  --output-prefix /data/mindspeed_test/datasets/alpaca \
  --json-keys text \
  --workers 4 \
  --log-interval 1000