#! /bin/bash
#
# Preprocesses the downloaded Criteo data and converts the result into the
# binary dataset layout.
#
# Positional arguments:
#   $1  forwarded unchanged as the last argument of run_NVTabular.sh or
#       run_spark.sh
#   $2  forwarded as the first argument of run_spark.sh (not used on the
#       NVTabular path)
#   $3  the literal string "NVTabular" selects run_NVTabular.sh; any other
#       value (or no value) selects the Spark path
#
# Environment overrides (default in parentheses):
#   download_dir                 (/data/dlrm/criteo)
#   output_path                  (/data/dlrm/output)
#   conversion_intermediate_dir  (/data/dlrm/intermediate_binary)
#   final_output_dir             (/data/dlrm/binary_dataset)
#   DGX_VERSION                  prefix of the "<DGX_VERSION>_config.sh" file
#                                that is sourced before the final conversion
#
# All expansions are quoted so that paths containing whitespace or glob
# characters reach the tools as single, literal arguments.
set -e
set -x

ls -ltrash

# Start from a clean slate. These are the hard-coded default locations and are
# removed even when the corresponding environment overrides point elsewhere.
# NOTE(review): with the default paths this means the "already carried out" /
# "already done" shortcuts below can normally only trigger when output_path /
# final_output_dir are overridden -- confirm that this is intended.
rm -rf -- /data/dlrm/spark
rm -rf -- /data/dlrm/intermediate_binary
rm -rf -- /data/dlrm/output
rm -rf -- /data/dlrm/criteo_parquet
rm -rf -- /data/dlrm/binary_dataset

download_dir=${download_dir:-'/data/dlrm/criteo'}
./verify_criteo_downloaded.sh "${download_dir}"

output_path=${output_path:-'/data/dlrm/output'}

if [[ "$3" == "NVTabular" ]]; then
  echo "Performing NVTabular preprocessing"
  ./run_NVTabular.sh "${download_dir}" "${output_path}" "$1"
  preprocessing_version=NVTabular
else
  # The _SUCCESS marker files are used as the signal that a previous Spark run
  # finished all three splits.
  if [[ -f "${output_path}/train/_SUCCESS" ]] \
    && [[ -f "${output_path}/validation/_SUCCESS" ]] \
    && [[ -f "${output_path}/test/_SUCCESS" ]]; then
    echo "Spark preprocessing already carried out"
  else
    echo "Performing spark preprocessing"
    # "$2" is quoted so that an empty value keeps its argument slot instead of
    # letting download_dir slide into the first position.
    ./run_spark.sh "$2" "${download_dir}" "${output_path}" "$1"
  fi
  preprocessing_version=Spark
fi

conversion_intermediate_dir=${conversion_intermediate_dir:-'/data/dlrm/intermediate_binary'}
final_output_dir=${final_output_dir:-'/data/dlrm/binary_dataset'}

# Machine-specific settings. TOTAL_CORES is not assigned anywhere in this
# script, so it is presumably provided by this config file (or by the caller's
# environment) -- TODO confirm.
source "${DGX_VERSION}_config.sh"

if [[ -d "${final_output_dir}/train" ]] \
  && [[ -d "${final_output_dir}/validation" ]] \
  && [[ -d "${final_output_dir}/test" ]] \
  && [[ -f "${final_output_dir}/feature_spec.yaml" ]]; then
  echo "Final conversion already done"
else
  echo "Performing final conversion to a custom data format"
  # Abort with a clear message if TOTAL_CORES is missing; otherwise
  # --parallel_jobs would silently take "--src_dir" as its value.
  python parquet_to_binary.py \
    --parallel_jobs "${TOTAL_CORES:?TOTAL_CORES must be set (expected from ${DGX_VERSION}_config.sh)}" \
    --src_dir "${output_path}" \
    --intermediate_dir "${conversion_intermediate_dir}" \
    --dst_dir "${final_output_dir}"

  cp -- "${output_path}/model_size.json" "${final_output_dir}/model_size.json"

  python split_dataset.py --dataset "${final_output_dir}" --output "${final_output_dir}/split"

  # Drop the monolithic files now that split_dataset.py has written its output
  # under split/, then move that output up into the final directory.
  rm -- "${final_output_dir}/train_data.bin"
  rm -- "${final_output_dir}/validation_data.bin"
  rm -- "${final_output_dir}/test_data.bin"
  rm -- "${final_output_dir}/model_size.json"

  # The glob stays outside the quotes so it still expands.
  mv -- "${final_output_dir}/split/"* "${final_output_dir}"
  rm -rf -- "${final_output_dir}/split"
fi

echo "Done preprocessing the Criteo Kaggle Dataset"