""" Convert dataset to HDF5
This script preprocesses a dataset and saves it (images and labels) to
an HDF5 file for improved I/O.
"""
from argparse import ArgumentParser
import h5py as h5
from tqdm import tqdm
import utils
def prepare_parser():
usage = 'Parser for ImageNet HDF5 scripts.'
parser = ArgumentParser(description=usage)
parser.add_argument(
'--dataset', type=str, default='I128',
help='Which Dataset to train on, out of I128, I256, C10, C100;'
'Append "_hdf5" to use the hdf5 version for ISLVRC (default: %(default)s)')
parser.add_argument(
'--data_root', type=str, default='data',
help='Default location where data is stored (default: %(default)s)')
parser.add_argument(
'--batch_size', type=int, default=256,
help='Default overall batchsize (default: %(default)s)')
parser.add_argument(
'--num_workers', type=int, default=16,
help='Number of dataloader workers (default: %(default)s)')
parser.add_argument(
'--chunk_size', type=int, default=500,
help='Default overall batchsize (default: %(default)s)')
parser.add_argument(
'--compression', action='store_true', default=False,
help='Use LZF compression? (default: %(default)s)')
return parser
def run(config):
if 'hdf5' in config['dataset']:
raise ValueError('Reading from an HDF5 file which you will probably be '
'about to overwrite! Override this error only if you know '
'what you''re doing!')
config['image_size'] = utils.imsize_dict[config['dataset']]
config['compression'] = 'lzf' if config['compression'] else None
kwargs = {'num_workers': config['num_workers'], 'pin_memory': False, 'drop_last': False}
train_loader = utils.get_data_loaders(dataset=config['dataset'],
batch_size=config['batch_size'],
shuffle=False,
data_root=config['data_root'],
use_multiepoch_sampler=False,
**kwargs)[0]
print('Starting to load %s into an HDF5 file with chunk size %i and compression %s...' % (
config['dataset'], config['chunk_size'], config['compression']))
for i, (x, y) in enumerate(tqdm(train_loader)):
x = (255 * ((x + 1) / 2.0)).byte().numpy()
y = y.numpy()
if i == 0:
with h5.File(config['data_root'] + '/ILSVRC%i.hdf5' % config['image_size'], 'w') as f:
print('Producing dataset of len %d' % len(train_loader.dataset))
imgs_dset = f.create_dataset('imgs', x.shape, dtype='uint8', maxshape=(
len(train_loader.dataset), 3, config['image_size'], config['image_size']),
chunks=(
config['chunk_size'], 3, config['image_size'], config['image_size']),
compression=config['compression'])
print('Image chunks chosen as ' + str(imgs_dset.chunks))
imgs_dset[...] = x
labels_dset = f.create_dataset('labels', y.shape, dtype='int64', maxshape=(len(train_loader.dataset),),
chunks=(config['chunk_size'],), compression=config['compression'])
print('Label chunks chosen as ' + str(labels_dset.chunks))
labels_dset[...] = y
else:
with h5.File(config['data_root'] + '/ILSVRC%i.hdf5' % config['image_size'], 'a') as f:
f['imgs'].resize(f['imgs'].shape[0] + x.shape[0], axis=0)
f['imgs'][-x.shape[0]:] = x
f['labels'].resize(f['labels'].shape[0] + y.shape[0], axis=0)
f['labels'][-y.shape[0]:] = y
def main():
parser = prepare_parser()
config = vars(parser.parse_args())
print(config)
run(config)
if __name__ == '__main__':
main()