"""
Fine-tuning the library models for sequence to sequence.
"""
import logging
import os
import sys
import json
import numpy as np
import jieba
import random
import pandas as pd
import datasets
import torch
import torch_npu
import transformers
import deepspeed_npu
from torch_npu.contrib import transfer_to_npu
from arguments import ModelArguments, DataTrainingArguments
from datasets import load_dataset
from rouge_chinese import Rouge
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from trainer_seq2seq import Seq2SeqTrainer
from transformers import (
AutoConfig,
AutoModel,
AutoTokenizer,
DataCollatorForSeq2Seq,
HfArgumentParser,
Seq2SeqTrainingArguments,
set_seed,
)
logger = logging.getLogger(__name__)
def seed_all(seed=1234, mode=True):
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.use_deterministic_algorithms(mode)
torch_npu.npu.manual_seed_all(seed)
torch_npu.npu.manual_seed(seed)
def main():
torch.distributed.default_pg_timeout = 7200
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
if training_args.NPU_VISIBLE_DEVICES:
torch.npu.set_device(torch.device(f"npu:{training_args.NPU_VISIBLE_DEVICES}"))
torch.npu.set_compile_mode(jit_compile=True)
seed_all(training_args.seed)
data_files = {}
if data_args.train_file is not None:
data_files["train"] = data_args.train_file
extension = data_args.train_file.split(".")[-1]
if data_args.validation_file is not None:
data_files["validation"] = data_args.validation_file
extension = data_args.validation_file.split(".")[-1]
if data_args.test_file is not None:
data_files["test"] = data_args.test_file
extension = data_args.test_file.split(".")[-1]
raw_datasets = load_dataset(
extension,
data_files=data_files,
cache_dir=model_args.cache_dir,
use_auth_token=True if model_args.use_auth_token else None,
)
config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
config.pre_seq_len = model_args.pre_seq_len
config.prefix_projection = model_args.prefix_projection
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)
prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
if training_args.do_train:
column_names = raw_datasets["train"].column_names
elif training_args.do_eval:
column_names = raw_datasets["validation"].column_names
elif training_args.do_predict:
column_names = raw_datasets["test"].column_names
else:
logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
return
prompt_column = data_args.prompt_column
response_column = data_args.response_column
history_column = data_args.history_column
def preprocess_function_train(examples):
max_seq_length = data_args.max_source_length + data_args.max_target_length
model_inputs = {
"input_ids": [],
"labels": [],
}
for i in range(len(examples[prompt_column])):
if examples[prompt_column][i] and examples[response_column][i]:
query, answer = examples[prompt_column][i], examples[response_column][i]
if history_column is None:
prompt = query
else:
prompt = ""
history = examples[history_column][i]
for turn_idx, (old_query, response) in enumerate(history):
prompt += "[Round {}]\n问:{}\n答:{}\n".format(turn_idx, old_query, response)
prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
prompt = prefix + prompt
a_ids = tokenizer.encode(text=prompt, add_special_tokens=False)
b_ids = tokenizer.encode(text=answer, add_special_tokens=False)
if len(a_ids) > data_args.max_source_length - 1:
a_ids = a_ids[: data_args.max_source_length - 1]
if len(b_ids) > data_args.max_target_length - 2:
b_ids = b_ids[: data_args.max_target_length - 2]
input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids)
context_length = input_ids.index(tokenizer.bos_token_id)
mask_position = context_length - 1
labels = [-100] * context_length + input_ids[mask_position + 1:]
pad_len = max_seq_length - len(input_ids)
input_ids = input_ids + [tokenizer.pad_token_id] * pad_len
labels = labels + [tokenizer.pad_token_id] * pad_len
if data_args.ignore_pad_token_for_loss:
labels = [(l if l != tokenizer.pad_token_id else -100) for l in labels]
model_inputs["input_ids"].append(input_ids)
model_inputs["labels"].append(labels)
return model_inputs
def preprocess(phase: str):
if phase not in raw_datasets:
raise ValueError("--do_train requires a train dataset")
train_dataset = raw_datasets["train"]
if data_args.max_train_samples is not None:
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
train_dataset = train_dataset.select(range(max_train_samples))
with training_args.main_process_first(desc="train dataset map pre-processing"):
train_dataset = train_dataset.map(
preprocess_function_train,
batched=True,
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
load_from_cache_file=not data_args.overwrite_cache,
desc="Running tokenizer on train dataset",
)
model_inputs_datasets = datasets.Dataset.from_pandas(pd.DataFrame(train_dataset))
model_inputs_datasets.save_to_disk(f"{phase}_datasets")
if training_args.do_train:
preprocess("train")
elif training_args.do_eval:
preprocess("validation")
elif training_args.do_predict:
preprocess("test")
if __name__ == "__main__":
main()