import os
import torch
import torch_npu
from torch_npu.contrib import transfer_to_npu
from torch.utils.data import RandomSampler, DistributedSampler, DataLoader
import apex
from classifier_pytorch import run_classifier
from classifier_pytorch.run_classifier import evaluate
from classifier_pytorch.processors import xlnet_collate_fn, collate_fn
from classifier_pytorch.tools.common import seed_everything
from classifier_pytorch.tools.progressbar import ProgressBar
from classifier_pytorch.transformers import logger, WarmupLinearSchedule
def train(args, train_dataset, model, tokenizer):
""" Train the model """
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
collate_fn=xlnet_collate_fn if args.model_type in ['xlnet'] else collate_fn)
if args.max_steps > 0:
t_total = args.max_steps
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
args.warmup_steps = int(t_total * args.warmup_proportion)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
'weight_decay': args.weight_decay},
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
'weight_decay': 0.0}
]
optimizer = apex.optimizers.NpuFusedAdamW(optimizer_grouped_parameters,
lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
if args.fp16:
try:
from apex import amp
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level, combine_grad=True)
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
output_device=args.local_rank,
find_unused_parameters=True)
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Num Epochs = %d", args.num_train_epochs)
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
args.train_batch_size * args.gradient_accumulation_steps * \
(torch.distributed.get_world_size() if args.local_rank != -1 else 1))
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
logger.info(" Total optimization steps = %d", t_total)
global_step = 0
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
seed_everything(args.seed)
for _ in range(int(args.num_train_epochs)):
pbar = ProgressBar(n_total=len(train_dataloader),desc='Training')
for step, batch in enumerate(train_dataloader):
model.train()
batch = tuple(t.to(args.device) for t in batch)
inputs = {'input_ids': batch[0],
'attention_mask': batch[1],
'labels': batch[3]}
if args.model_type != 'distilbert':
inputs['token_type_ids'] = batch[2] if args.model_type in ['bert','xlnet','albert','roberta']\
else None
outputs = model(**inputs)
loss = outputs[0]
if args.n_gpu > 1:
loss = loss.mean()
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
optimizer.clip_optimizer_grad_norm_fused(args.max_grad_norm)
else:
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
optimizer.step()
scheduler.step()
model.zero_grad()
global_step += 1
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
if args.local_rank == -1 :
results = evaluate(args, model, tokenizer)
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
logger.info("Saving model checkpoint to %s", output_dir)
tokenizer.save_vocabulary(vocab_path=output_dir)
pbar(step,{'loss':loss.item()})
print(" ")
if 'cuda' in str(args.device):
torch.cuda.empty_cache()
return global_step, tr_loss / global_step
run_classifier.train = train
if __name__ == "__main__":
run_classifier.main()