"""
transform alpaca dataset to mindrecord.
"""
import argparse
import json
import os
import numpy as np
from mindspore.mindrecord import FileWriter
from mindformers.tools import logger
from internlm2_tokenizer import InternLM2Tokenizer
np.set_printoptions(threshold=np.inf)
IGNORE_TOKEN_ID = -100
def preprocess(sources, tokenizer, seq_length):
    """Tokenize Alpaca records into fixed-length ``input_ids``/``labels`` arrays.

    Every record is laid out as a single chat turn:

    * prompt: BOS, ``<|im_start|>user`` + newline, the instruction, optionally
      (newline, the input, newline), ``<|im_end|>`` + newline,
      ``<|im_start|>assistant`` + newline;
    * response: the output, ``<|im_end|>`` + newline, EOS.

    ``input_ids`` holds prompt + response, right-padded with 0 to
    ``seq_length``. ``labels`` has the same length and carries the response
    tokens at the positions they occupy in ``input_ids``; prompt and padding
    positions are set to ``IGNORE_TOKEN_ID``.

    A record whose prompt + response is longer than ``seq_length`` is cut to
    ``seq_length - 1`` tokens plus a closing EOS, and all of its labels are set
    to ``IGNORE_TOKEN_ID``.

    Args:
        sources: iterable of dicts with ``instruction``, ``input`` and
            ``output`` string fields.
        tokenizer: object exposing ``encode(text, add_special_tokens=False)``
            that returns a list of token ids.
        seq_length: length of every emitted array.

    Returns:
        dict with keys ``input_ids`` and ``labels``, each a list holding one
        ``np.int32`` array of length ``seq_length`` per record.
    """
    # Hard-coded ids; presumably the InternLM2 ids of <s> / </s> and the
    # padding id expected by the training config -- confirm against the vocab.
    bos_id, eos_id, pad_id = 1, 2, 0

    def encode(text):
        return tokenizer.encode(text, add_special_tokens=False)

    newline_id = encode('\n')[0]
    user_header = encode('<|im_start|>user') + [newline_id]
    assistant_header = encode('<|im_start|>assistant') + [newline_id]
    turn_end = encode('<|im_end|>') + [newline_id]

    input_ids = []
    labels = []
    for source in sources:
        prompt = [bos_id, *user_header, *encode(source['instruction'])]
        if source['input']:
            # The optional input goes on its own line below the instruction.
            prompt += [newline_id, *encode(source['input']), newline_id]
        prompt += [*turn_end, *assistant_header]
        response = [*encode(source['output']), *turn_end, eos_id]
        tokens = prompt + response

        if len(tokens) > seq_length:
            # Over-long record: truncate, close with EOS and mask every label.
            input_id = tokens[:seq_length - 1] + [eos_id]
            label = [IGNORE_TOKEN_ID] * len(input_id)
        else:
            pad_length = seq_length - len(tokens)
            # The response must be part of the model input; padding only the
            # prompt would leave the labels pointing at pad tokens.
            input_id = tokens + [pad_id] * pad_length
            label = ([IGNORE_TOKEN_ID] * len(prompt)
                     + response
                     + [IGNORE_TOKEN_ID] * pad_length)
        input_ids.append(np.array(input_id, dtype=np.int32))
        labels.append(np.array(label, dtype=np.int32))
    return dict(input_ids=input_ids, labels=labels)
class SupervisedDataset:
    """Indexable collection of tokenized samples for supervised fine-tuning."""

    def __init__(self, raw_data, tokenizer, seq_length):
        """Tokenize every record of ``raw_data`` up front via ``preprocess``."""
        super().__init__()
        records = list(raw_data)
        encoded = preprocess(records, tokenizer, seq_length)
        self.input_ids = encoded.get("input_ids")
        self.labels = encoded.get("labels")

    def __len__(self):
        """Return the number of tokenized samples."""
        return len(self.input_ids)

    def __getitem__(self, i):
        """Return sample ``i`` as a dict with ``input_ids`` and ``labels``."""
        return {"input_ids": self.input_ids[i], "labels": self.labels[i]}
def tokenize_qa(tokenizer, file_path, seq_length):
    """Yield tokenized samples for every record of a JSON dataset file.

    Args:
        tokenizer: tokenizer handed through to ``SupervisedDataset``.
        file_path: path of the JSON file to load.
        seq_length: length of every tokenized sample.

    Yields:
        One dict per record with ``input_ids`` and ``labels`` entries.

    Raises:
        Exception: any error raised while opening or parsing ``file_path``
            (e.g. ``FileNotFoundError``, ``UnicodeDecodeError``,
            ``json.JSONDecodeError``) is logged and then re-raised, so a
            broken input is reported as such instead of surfacing later as an
            unrelated failure on missing data.
    """
    try:
        # JSON is read as UTF-8 explicitly so the result does not depend on
        # the platform's default encoding.
        with open(file_path, "r", encoding="utf-8") as file:
            raw_data = json.load(file)
    except Exception as error:
        logger.error(error)
        raise
    dataset_cls = SupervisedDataset(raw_data, tokenizer, seq_length)
    yield from dataset_cls
if __name__ == '__main__':
    # Command-line entry point: tokenize a JSON dataset and write it out as
    # MindRecord file(s).
    parser = argparse.ArgumentParser()
    parser.add_argument("--mindrecord_schema", type=str, default="internlm2_alpaca")
    parser.add_argument("--input_glob", type=str, default="./alpaca_data.json")
    parser.add_argument("--output_file", type=str, default="./alpaca_processed/alpaca.mindrecord")
    parser.add_argument("--model_file", type=str, default="./tokenizer.model")
    parser.add_argument("--file_partition", type=int, default=1)
    parser.add_argument("--seq_length", type=int, default=2048)
    args = parser.parse_args()

    # Validate the tokenizer model first so a wrong path fails before any
    # output directory or MindRecord file has been created.
    if not os.path.exists(args.model_file):
        raise FileNotFoundError(f"file {args.model_file} do not exists.")

    # Create the output directory including missing parents; an already
    # existing directory is fine.
    out_dir = os.path.dirname(os.path.abspath(args.output_file))
    os.makedirs(out_dir, exist_ok=True)

    schema = {'input_ids': {"type": "int32", "shape": [-1]},
              'labels': {"type": "int32", "shape": [-1]}}
    writer = FileWriter(file_name=args.output_file,
                        shard_num=args.file_partition)
    writer.add_schema(schema, args.mindrecord_schema)

    transforms_count = 0
    word_tokenizer = InternLM2Tokenizer(vocab_file=args.model_file)
    # Samples are produced with one token more than --seq_length; presumably
    # the training side shifts inputs and labels by one position -- confirm.
    for x in tokenize_qa(word_tokenizer, args.input_glob, args.seq_length + 1):
        transforms_count += 1
        writer.write_raw_data([x])
    print(f"Transformed {transforms_count} records.")
    writer.commit()

    out_file = args.output_file
    if args.file_partition > 1:
        # With several shards, report the first one; presumably FileWriter
        # appends the shard index to the file name -- confirm.
        out_file += '0'
    print(f"Transform finished, output files refer: {out_file}.")