import argparse
import time
import math
import os, sys
import itertools
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from data_utils import get_lm_corpus
from mem_transformer import MemTransformerLM
from utils.exp_utils import create_exp_dir
from utils.data_parallel import BalancedDataParallel
from apex import amp
import torch.distributed as dist
import apex
import warnings
# Command-line interface of the training script. Every option is registered on
# the module-level ``parser``; ``main()`` parses it into the ``args`` namespace.
parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')

# ---- data ----
parser.add_argument('--data', type=str, default='../data/enwik8',
                    help='location of the data corpus')
parser.add_argument('--dataset', type=str, default='enwik8',
                    choices=['wt103', 'lm1b', 'enwik8', 'text8'],
                    help='dataset name')

# ---- model architecture ----
parser.add_argument('--n_layer', type=int, default=12,
                    help='number of total layers')
parser.add_argument('--n_head', type=int, default=8,
                    help='number of heads')
parser.add_argument('--d_head', type=int, default=64,
                    help='head dimension')
# A negative value makes main_worker() fall back to --d_model.
parser.add_argument('--d_embed', type=int, default=-1,
                    help='embedding dimension')
parser.add_argument('--d_model', type=int, default=512,
                    help='model dimension')
parser.add_argument('--d_inner', type=int, default=2048,
                    help='inner dimension in FF')
parser.add_argument('--dropout', type=float, default=0.1,
                    help='global dropout rate')
parser.add_argument('--dropatt', type=float, default=0.0,
                    help='attention probability dropout rate')

# ---- parameter initialisation ----
parser.add_argument('--init', default='normal', type=str,
                    help='parameter initializer to use.')
parser.add_argument('--emb_init', default='normal', type=str,
                    help='parameter initializer to use.')
parser.add_argument('--init_range', type=float, default=0.1,
                    help='parameters initialized by U(-init_range, init_range)')
parser.add_argument('--emb_init_range', type=float, default=0.01,
                    help='parameters initialized by U(-init_range, init_range)')
parser.add_argument('--init_std', type=float, default=0.02,
                    help='parameters initialized by N(0, init_std)')
parser.add_argument('--proj_init_std', type=float, default=0.01,
                    help='parameters initialized by N(0, init_std)')

# ---- optimisation ----
parser.add_argument('--optim', default='adam', type=str,
                    choices=['adam', 'sgd', 'adagrad'],
                    help='optimizer to use.')
parser.add_argument('--lr', type=float, default=0.00025,
                    help='initial learning rate (0.00025|5 for adam|sgd)')
parser.add_argument('--mom', type=float, default=0.0,
                    help='momentum for sgd')
parser.add_argument('--scheduler', default='cosine', type=str,
                    choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'],
                    help='lr scheduler to use.')
# Help text corrected: this is a step count for the warmup phase, not an epoch limit.
parser.add_argument('--warmup_step', type=int, default=0,
                    help='number of learning rate warmup steps')
parser.add_argument('--decay_rate', type=float, default=0.5,
                    help='decay factor when ReduceLROnPlateau is used')
parser.add_argument('--lr_min', type=float, default=0.0,
                    help='minimum learning rate during annealing')
parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')
parser.add_argument('--clip_nonemb', action='store_true',
                    help='only clip the gradient of non-embedding params')
# Help text corrected: training stops on a step count, not an epoch count.
parser.add_argument('--max_step', type=int, default=400000,
                    help='upper training step limit')
parser.add_argument('--batch_size', type=int, default=22,
                    help='batch size')
parser.add_argument('--batch_chunk', type=int, default=1,
                    help='split batch into chunks to save memory')

# ---- sequence / memory lengths ----
parser.add_argument('--tgt_len', type=int, default=512,
                    help='number of tokens to predict')
parser.add_argument('--eval_tgt_len', type=int, default=128,
                    help='number of tokens to predict for evaluation')
parser.add_argument('--ext_len', type=int, default=0,
                    help='length of the extended context')
parser.add_argument('--mem_len', type=int, default=512,
                    help='length of the retained previous heads')

# ---- miscellaneous model / run options ----
parser.add_argument('--not_tied', action='store_true',
                    help='do not tie the word embedding and softmax weights')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--adaptive', action='store_true',
                    help='use adaptive softmax')
parser.add_argument('--div_val', type=int, default=1,
                    help='dividend value for adaptive input and softmax')
parser.add_argument('--pre_lnorm', action='store_true',
                    help='apply LayerNorm to the input instead of the output')
parser.add_argument('--varlen', action='store_true',
                    help='use variable length')
parser.add_argument('--multi_gpu', action='store_true',
                    help='use multiple GPU')
parser.add_argument('--log-interval', type=int, default=200,
                    help='report interval')
parser.add_argument('--eval-interval', type=int, default=4000,
                    help='evaluation interval')
parser.add_argument('--work_dir', default='LM-TFM', type=str,
                    help='experiment directory.')
parser.add_argument('--restart', action='store_true',
                    help='restart training from the saved checkpoint')
parser.add_argument('--restart_dir', type=str, default='',
                    help='restart dir')
parser.add_argument('--debug', action='store_true',
                    help='run in debug mode (do not create exp dir)')
parser.add_argument('--same_length', action='store_true',
                    help='use the same attn length for all tokens')
parser.add_argument('--attn_type', type=int, default=0,
                    help='attention type. 0 for ours, 1 for Shaw et al,'
                         '2 for Vaswani et al, 3 for Al Rfou et al.')
parser.add_argument('--clamp_len', type=int, default=-1,
                    help='use the same pos embeddings after clamp_len')
parser.add_argument('--eta_min', type=float, default=0.0,
                    help='min learning rate for cosine scheduler')
parser.add_argument('--gpu0_bsz', type=int, default=-1,
                    help='batch size on gpu 0')
parser.add_argument('--max_eval_steps', type=int, default=-1,
                    help='max eval steps')
parser.add_argument('--sample_softmax', type=int, default=-1,
                    help='number of samples in sampled softmax')
parser.add_argument('--patience', type=int, default=0,
                    help='patience')
parser.add_argument('--finetune_v2', action='store_true',
                    help='finetune v2')
parser.add_argument('--finetune_v3', action='store_true',
                    help='finetune v3')

# ---- mixed precision ----
parser.add_argument('--static-loss-scale', type=float, default=128.0,
                    help='Static loss scale, positive power of 2 values can '
                         'improve fp16 convergence.')
parser.add_argument('--dynamic-loss-scale', action='store_true',
                    help='Use dynamic loss scaling. If supplied, this argument'
                         ' supersedes --static-loss-scale.')

# ---- distributed / device selection ----
parser.add_argument('--dist-backend', type=str, default='hccl')
parser.add_argument('--world-size', type=int, default=-1)
parser.add_argument('--rank', type=int, default=-1)
parser.add_argument('--local_rank', type=int, default=0)
parser.add_argument('--addr', type=str, default='127.0.0.1')
parser.add_argument('--device_num', type=int, default=-1)
parser.add_argument('--workers', type=int, default=32)
parser.add_argument('--device-list', default='', type=str)
parser.add_argument('--dist-url', type=str, default='tcp://127.0.0.1:50000')
parser.add_argument('--device', type=str, default='npu')
parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')

# Silence every Python warning for the lifetime of the process.
warnings.filterwarnings('ignore')
def main():
    """Parse the command line, prepare the process environment and run a worker.

    Sets ``MASTER_ADDR``, ``MASTER_PORT`` and ``LOCAL_DEVICE_ID`` in
    ``os.environ``, derives ``args.tied``, ``args.distributed`` and the number
    of devices per node, then calls ``main_worker`` directly in this process.
    """
    args = parser.parse_args()
    args.tied = not args.not_tied
    torch.manual_seed(args.seed)

    # Rendezvous settings; the port is fixed rather than configurable.
    for env_key, env_value in (('MASTER_ADDR', args.addr),
                               ('MASTER_PORT', '29888'),
                               ('LOCAL_DEVICE_ID', str(0))):
        os.environ[env_key] = env_value
    print("+++++++++++++++++++++++++++LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID'])

    # With an env:// rendezvous the world size comes from the environment
    # unless it was given explicitly.
    if args.world_size == -1 and args.dist_url == "env://":
        args.world_size = int(os.environ["WORLD_SIZE"])
    args.distributed = args.multiprocessing_distributed or args.world_size > 1

    # Devices per node: explicit list > explicit count > RANK_SIZE (npu) > CUDA count.
    if args.device_list != '':
        devices_per_node = len(args.device_list.split(','))
    elif args.device_num != -1:
        devices_per_node = args.device_num
    elif args.device == 'npu':
        devices_per_node = int(os.environ["RANK_SIZE"])
    else:
        devices_per_node = torch.cuda.device_count()

    if args.multiprocessing_distributed:
        # --world-size counts nodes in this mode; convert it to a process count.
        args.world_size = devices_per_node * args.world_size

    # The worker runs in this process (nothing is spawned here); the device
    # index comes from --local_rank on npu and from --gpu otherwise.
    device_index = args.local_rank if args.device == 'npu' else args.gpu
    main_worker(device_index, devices_per_node, args)
def main_worker(gpu, ngpus_per_node, args):
    """Build the data pipeline, model and optimizer, then run the training loop.

    Args:
        gpu: Device index for this process. When ``args.device_list`` is set it
            is used as a position in that comma-separated list instead.
        ngpus_per_node: Devices per node; used to derive the global rank and to
            divide ``args.workers``.
        args: Parsed command-line namespace. It is mutated in place
            (``d_embed``, ``work_dir``, ``gpu``, ``rank``, ``workers``,
            ``n_token``, ``n_all_param``, ``n_nonemb_param``).

    The module-level globals ``train_step``, ``train_loss``, ``best_val_loss``,
    ``eval_start_time`` and ``log_start_time`` must be initialised before the
    call. The function ends the process with ``sys.exit()`` once
    ``train_step`` reaches ``args.max_step``; a ``KeyboardInterrupt`` is caught
    and logged instead.
    """
    global train_step, train_loss, best_val_loss, eval_start_time, log_start_time

    # ------------------------------------------------------------------
    # Argument post-processing and experiment directory
    # ------------------------------------------------------------------
    if args.d_embed < 0:
        args.d_embed = args.d_model
    assert args.ext_len >= 0, 'extended context length must be non-negative'
    assert args.batch_size % args.batch_chunk == 0

    args.work_dir = '{}-{}'.format(args.work_dir, args.dataset)
    args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S'))
    # ``logging`` is the callable returned by create_exp_dir, not the stdlib module.
    logging = create_exp_dir(args.work_dir,
                             scripts_to_save=['train.py', 'mem_transformer.py'],
                             debug=args.debug)

    # ------------------------------------------------------------------
    # Device selection and process-group initialisation
    # ------------------------------------------------------------------
    if args.device_list != '':
        args.gpu = int(args.device_list.split(',')[gpu])
    else:
        args.gpu = gpu

    print("[npu id:", args.gpu, "]", "++++++++++++++++ before set LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID'])
    os.environ['LOCAL_DEVICE_ID'] = str(args.gpu)
    print("[npu id:", args.gpu, "]", "++++++++++++++++ LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID'])

    if args.gpu is not None:
        print("[npu id:", args.gpu, "]", "Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # Convert the node rank into a global process rank.
            args.rank = args.rank * ngpus_per_node + gpu
        if args.device == 'npu':
            # No init_method here: only backend, world size and rank are passed.
            dist.init_process_group(backend=args.dist_backend,
                                    world_size=args.world_size, rank=args.rank)
        else:
            dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                    world_size=args.world_size, rank=args.rank)

    # The NPU device string is used regardless of args.device.
    loc = 'npu:{}'.format(args.gpu)
    torch.npu.set_device(loc)

    # Ceil-divide the worker count across the devices of this node.
    args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)

    print("[npu id:", args.gpu, "]", "===============main_worker()=================")
    print("[npu id:", args.gpu, "]", args)
    print("[npu id:", args.gpu, "]", "===============main_worker()=================")

    # ------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------
    corpus = get_lm_corpus(args.data, args.dataset)
    ntokens = len(corpus.vocab)
    args.n_token = ntokens

    eval_batch_size = 10
    tr_iter = corpus.get_iterator('train', args.batch_size, args.tgt_len,
                                  device=loc, ext_len=args.ext_len)
    va_iter = corpus.get_iterator('valid', eval_batch_size, args.eval_tgt_len,
                                  device=loc, ext_len=args.ext_len)
    # NOTE(review): the split name 'test.py' looks like a stray rename of
    # 'test'; te_iter is not used below. Confirm against
    # data_utils.get_iterator before changing it.
    te_iter = corpus.get_iterator('test.py', eval_batch_size, args.eval_tgt_len,
                                  device=loc, ext_len=args.ext_len)

    # Adaptive softmax / embedding cutoffs, only for the word-level datasets.
    cutoffs, tie_projs = [], [False]
    if args.adaptive:
        assert args.dataset in ['wt103', 'lm1b']
        if args.dataset == 'wt103':
            cutoffs = [20000, 40000, 200000]
            tie_projs += [True] * len(cutoffs)
        elif args.dataset == 'lm1b':
            cutoffs = [60000, 100000, 640000]
            tie_projs += [False] * len(cutoffs)

    # ------------------------------------------------------------------
    # Initialisation helpers (closures over ``args``)
    # ------------------------------------------------------------------
    def init_weight(weight):
        """Initialise ``weight`` in place according to ``args.init``."""
        if args.init == 'uniform':
            nn.init.uniform_(weight, -args.init_range, args.init_range)
        elif args.init == 'normal':
            nn.init.normal_(weight, 0.0, args.init_std)

    def init_bias(bias):
        """Zero ``bias`` in place."""
        nn.init.constant_(bias, 0.0)

    def weights_init(m):
        """Initialise module ``m`` based on a substring match of its class name.

        Intended for ``Module.apply``. The branch order matters:
        'AdaptiveEmbedding' is tested before the more general 'Embedding'.
        """
        classname = m.__class__.__name__
        if classname.find('Linear') != -1:
            if hasattr(m, 'weight') and m.weight is not None:
                init_weight(m.weight)
            if hasattr(m, 'bias') and m.bias is not None:
                init_bias(m.bias)
        elif classname.find('AdaptiveEmbedding') != -1:
            if hasattr(m, 'emb_projs'):
                for proj in m.emb_projs:
                    if proj is not None:
                        nn.init.normal_(proj, 0.0, args.proj_init_std)
        elif classname.find('Embedding') != -1:
            if hasattr(m, 'weight'):
                init_weight(m.weight)
        elif classname.find('ProjectedAdaptiveLogSoftmax') != -1:
            if hasattr(m, 'cluster_weight') and m.cluster_weight is not None:
                init_weight(m.cluster_weight)
            if hasattr(m, 'cluster_bias') and m.cluster_bias is not None:
                init_bias(m.cluster_bias)
            if hasattr(m, 'out_projs'):
                for proj in m.out_projs:
                    if proj is not None:
                        nn.init.normal_(proj, 0.0, args.proj_init_std)
        elif classname.find('LayerNorm') != -1:
            if hasattr(m, 'weight'):
                nn.init.normal_(m.weight, 1.0, args.init_std)
            if hasattr(m, 'bias') and m.bias is not None:
                init_bias(m.bias)
        elif classname.find('TransformerLM') != -1:
            if hasattr(m, 'r_emb'):
                init_weight(m.r_emb)
            if hasattr(m, 'r_w_bias'):
                init_weight(m.r_w_bias)
            if hasattr(m, 'r_r_bias'):
                init_weight(m.r_r_bias)
            if hasattr(m, 'r_bias'):
                init_bias(m.r_bias)

    def update_dropout(m):
        """Set ``p`` of every module whose class name contains 'Dropout'."""
        classname = m.__class__.__name__
        if classname.find('Dropout') != -1:
            if hasattr(m, 'p'):
                m.p = args.dropout

    def update_dropatt(m):
        """Set the attention dropout rate on modules exposing ``dropatt``."""
        if hasattr(m, 'dropatt'):
            m.dropatt.p = args.dropatt

    # ------------------------------------------------------------------
    # Model
    # ------------------------------------------------------------------
    def build_model():
        """Create a MemTransformerLM from ``args`` and initialise its weights."""
        net = MemTransformerLM(
            ntokens, args.n_layer, args.n_head, args.d_model,
            args.d_head, args.d_inner, args.dropout, args.dropatt,
            tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val,
            tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len,
            ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs,
            same_length=args.same_length, attn_type=args.attn_type,
            clamp_len=args.clamp_len, sample_softmax=args.sample_softmax)
        net.apply(weights_init)
        # The word embedding is initialised once more after the full-model pass.
        net.word_emb.apply(weights_init)
        return net

    if args.restart:
        with open(os.path.join(args.restart_dir, 'model.pt'), 'rb') as f:
            model = build_model().to(loc)
            # NOTE: torch.load unpickles the file; restart only from trusted checkpoints.
            ckpt = torch.load(f, map_location=loc)
            model.load_state_dict(ckpt)
        # Apply the dropout rates from the current command line to the restored model.
        model.apply(update_dropout)
        model.apply(update_dropatt)
    else:
        model = build_model()

    args.n_all_param = sum(p.nelement() for p in model.parameters())
    args.n_nonemb_param = sum(p.nelement() for p in model.layers.parameters())

    # ------------------------------------------------------------------
    # Optimizer
    # ------------------------------------------------------------------
    def split_sparse_dense_params():
        """Split parameters into (dense, sparse) by embedding-weight shape.

        A parameter counts as sparse when its size equals that of
        ``model.word_emb.weight``.
        """
        emb_size = model.word_emb.weight.size()
        dense, sparse = [], []
        for param in model.parameters():
            (sparse if param.size() == emb_size else dense).append(param)
        return dense, sparse

    optim_name = args.optim.lower()
    if optim_name == 'sgd':
        if args.sample_softmax > 0:
            dense_params, sparse_params = split_sparse_dense_params()
            optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2)
            optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom)
        else:
            optimizer = optim.SGD(model.parameters(), lr=args.lr,
                                  momentum=args.mom)
    elif optim_name == 'adam':
        if args.sample_softmax > 0:
            dense_params, sparse_params = split_sparse_dense_params()
            optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr)
            optimizer = optim.Adam(dense_params, lr=args.lr)
        else:
            optimizer = apex.optimizers.NpuFusedAdam(model.parameters(), lr=args.lr)
    elif optim_name == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr=args.lr)

    # ------------------------------------------------------------------
    # Mixed precision and (optional) data parallelism
    # ------------------------------------------------------------------
    model = model.to(loc)
    opt_level = "O2"
    # Honour the loss-scale flags; the default static scale is 128.0.
    loss_scale = 'dynamic' if args.dynamic_loss_scale else args.static_loss_scale
    model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level,
                                      loss_scale=loss_scale, combine_grad=True)

    if args.multi_gpu:
        if args.gpu0_bsz >= 0:
            para_model = BalancedDataParallel(args.gpu0_bsz // args.batch_chunk,
                                              model, dim=1).to(loc)
        else:
            para_model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu], broadcast_buffers=False)
    else:
        para_model = model.to(loc)

    # ------------------------------------------------------------------
    # Learning-rate scheduler
    # ------------------------------------------------------------------
    if args.scheduler == 'cosine':
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.max_step, eta_min=args.eta_min)
        if args.sample_softmax > 0:
            scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(
                optimizer_sparse, args.max_step, eta_min=args.eta_min)
    elif args.scheduler == 'inv_sqrt':
        def lr_lambda(step):
            """LR multiplier: linear ramp up to warmup_step, then 1/sqrt(step)."""
            if step == 0 and args.warmup_step == 0:
                return 1.
            else:
                return 1. / (step ** 0.5) if step > args.warmup_step \
                    else step / (args.warmup_step ** 1.5)
        scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
    elif args.scheduler == 'dev_perf':
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, factor=args.decay_rate, patience=args.patience,
            min_lr=args.lr_min)
        if args.sample_softmax > 0:
            scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer_sparse, factor=args.decay_rate, patience=args.patience,
                min_lr=args.lr_min)
    elif args.scheduler == 'constant':
        pass

    # Restore the optimizer state when restarting, if one was saved.
    if args.restart:
        if os.path.exists(os.path.join(args.restart_dir, 'optimizer.pt')):
            with open(os.path.join(args.restart_dir, 'optimizer.pt'), 'rb') as f:
                opt_state_dict = torch.load(f, map_location=loc)
                optimizer.load_state_dict(opt_state_dict)
        else:
            print('Optimizer was not saved. Start from scratch.')

    logging('=' * 100)
    for k, v in args.__dict__.items():
        logging(' - {} : {}'.format(k, v))
    logging('=' * 100)
    logging('#params = {}'.format(args.n_all_param))
    logging('#non emb params = {}'.format(args.n_nonemb_param))

    # ------------------------------------------------------------------
    # Evaluation
    # ------------------------------------------------------------------
    def evaluate(eval_iter):
        """Return the sequence-length-weighted mean loss of ``model`` on ``eval_iter``.

        The model is switched to eval mode and its lengths are reset for
        ``args.eval_tgt_len``: the difference to ``args.tgt_len`` is added to
        the extended context when ``args.mem_len == 0`` and to the memory
        length otherwise. Training lengths and train mode are restored before
        returning. At most ``args.max_eval_steps`` batches are used when that
        value is positive.
        """
        model.eval()
        if args.mem_len == 0:
            model.reset_length(args.eval_tgt_len,
                               args.ext_len + args.tgt_len - args.eval_tgt_len,
                               args.mem_len)
        else:
            model.reset_length(args.eval_tgt_len,
                               args.ext_len,
                               args.mem_len + args.tgt_len - args.eval_tgt_len)

        total_len, total_loss = 0, 0.
        with torch.no_grad():
            mems = tuple()
            for i, (data, target, seq_len) in enumerate(eval_iter):
                if args.max_eval_steps > 0 and i >= args.max_eval_steps:
                    break
                ret = model(data, target, *mems)
                loss, mems = ret[0], ret[1:]
                loss = loss.mean()
                total_loss += seq_len * loss.float().item()
                total_len += seq_len

        model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
        model.train()
        return total_loss / total_len

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------
    def train():
        """Run one pass over the training iterator.

        Updates the module-level counters, logs every ``args.log_interval``
        steps, evaluates and checkpoints every ``args.eval_interval`` steps,
        and exits the process once ``train_step`` equals ``args.max_step``.
        """
        global train_step, train_loss, best_val_loss, eval_start_time, log_start_time
        model.train()
        # One memory tuple per chunk when the batch is split.
        if args.batch_chunk > 1:
            mems = [tuple() for _ in range(args.batch_chunk)]
        else:
            mems = tuple()
        train_iter = tr_iter.get_varlen_iter() if args.varlen else tr_iter

        for batch, (data, target, seq_len) in enumerate(train_iter):
            model.zero_grad()
            if args.batch_chunk > 1:
                data_chunks = torch.chunk(data, args.batch_chunk, 1)
                target_chunks = torch.chunk(target, args.batch_chunk, 1)
                for i in range(args.batch_chunk):
                    data_i = data_chunks[i].contiguous()
                    target_i = target_chunks[i].contiguous()
                    ret = para_model(data_i, target_i, *mems[i])
                    loss, mems[i] = ret[0], ret[1:]
                    # Scale so the chunk losses sum to the batch mean.
                    loss = loss.float().mean().type_as(loss) / args.batch_chunk
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    with torch.no_grad():
                        # Accumulate the loss value itself (not its truthiness).
                        train_loss += loss.float().item()
            else:
                ret = para_model(data, target, *mems)
                loss, mems = ret[0], ret[1:]
                loss = loss.float().mean().type_as(loss)
                with torch.no_grad():
                    train_loss += loss.float().item()
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()

            optimizer.step()
            if args.sample_softmax > 0:
                optimizer_sparse.step()

            # Step-wise learning-rate schedule.
            train_step += 1
            if args.scheduler in ['cosine', 'constant', 'dev_perf']:
                if train_step < args.warmup_step:
                    # Linear warmup, written directly into parameter group 0.
                    curr_lr = args.lr * train_step / args.warmup_step
                    optimizer.param_groups[0]['lr'] = curr_lr
                    if args.sample_softmax > 0:
                        optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2
                else:
                    if args.scheduler == 'cosine':
                        scheduler.step(train_step)
                        if args.sample_softmax > 0:
                            scheduler_sparse.step(train_step)
            elif args.scheduler == 'inv_sqrt':
                scheduler.step(train_step)

            if train_step % args.log_interval == 0:
                cur_loss = train_loss / args.log_interval
                elapsed = time.time() - log_start_time
                # NOTE(review): the factor 8 in the fps figure is hard-coded
                # (presumably the device count) - confirm.
                log_str = ('| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} '
                           '| ms/batch {:5.2f} | loss {:5.2f} | fps {:.2f}').format(
                    epoch, train_step, batch+1, optimizer.param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss,
                    args.log_interval*args.batch_size*args.tgt_len*8/elapsed)
                if args.dataset in ['enwik8', 'text8']:
                    log_str += ' | bpc {:9.5f}'.format(cur_loss / math.log(2))
                else:
                    log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss))
                logging(log_str)
                train_loss = 0
                log_start_time = time.time()

            if train_step % args.eval_interval == 0:
                print('train_step is :', train_step)
                print('ars.eval_interval is :', args.eval_interval)
                print(train_step % args.eval_interval)
                print('*'*50)
                ts = time.time()
                val_loss = evaluate(va_iter)
                print('evaluation use time {} s'.format(time.time()-ts))
                logging('-' * 100)
                log_str = ('| Eval {:3d} at step {:>8d} | time: {:5.2f}s '
                           '| valid loss {:5.2f}').format(
                    train_step // args.eval_interval, train_step,
                    (time.time() - ts), val_loss)
                if args.dataset in ['enwik8', 'text8']:
                    log_str += ' | bpc {:9.5f}'.format(val_loss / math.log(2))
                else:
                    log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss))
                logging(log_str)
                logging('-' * 100)

                # Checkpoint on the first evaluation and on every improvement.
                if best_val_loss is None or val_loss < best_val_loss:
                    if not args.debug:
                        # NOTE(review): checkpoints go to the current working
                        # directory, not args.work_dir - confirm this is intended.
                        with open('model.pt', 'wb') as f:
                            torch.save(model.state_dict(), f)
                        with open('optimizer.pt', 'wb') as f:
                            torch.save(optimizer.state_dict(), f)
                    best_val_loss = val_loss

                # Validation-driven learning-rate annealing.
                if args.scheduler == 'dev_perf':
                    scheduler.step(val_loss)
                    if args.sample_softmax > 0:
                        scheduler_sparse.step(val_loss)

                eval_start_time = time.time()

            if train_step == args.max_step:
                sys.exit()

    # ------------------------------------------------------------------
    # Epoch loop; Ctrl-C stops training early without a traceback.
    # ------------------------------------------------------------------
    try:
        # ``epoch`` is read by train() through the enclosing scope.
        for epoch in itertools.count(start=1):
            train()
            if train_step == args.max_step:
                logging('-' * 100)
                logging('End of training')
                sys.exit()
    except KeyboardInterrupt:
        logging('-' * 100)
        logging('Exiting from training early')
if __name__ == '__main__':
    # Training state shared with main_worker() and train() through ``global``
    # declarations; these are module-level names, so no ``global`` statement
    # is needed here.
    train_step, train_loss, best_val_loss = 0, 0, None
    log_start_time = time.time()
    eval_start_time = time.time()
    main()