from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import _init_paths
import os
import json
import torch
if torch.__version__ >= "1.8":
import torch_npu
import torch.utils.data
from torchvision.transforms import transforms as T
from opts import opts
from models.model import create_model, load_model, save_model
from models.data_parallel import DataParallel
from logger import Logger
from datasets.dataset_factory import get_dataset
from trains.train_factory import train_factory
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from apex import amp
def train(opt):
    """Train the model inside one process of a distributed (HCCL) job.

    The function seeds torch, joins the process group, builds the dataset
    described by the JSON file at ``opt.data_cfg``, creates the model,
    optimizer and trainer, and then runs the epoch loop.  Only the process
    with ``opt.rank == 0`` creates a logger and writes checkpoints; every
    process updates its own learning rate.

    Args:
        opt: Parsed options object.  Attributes read here include ``seed``,
            ``rank``, ``world_size``, ``use_npu``, ``use_amp``, ``data_cfg``,
            ``dataset``, ``task``, ``arch``, ``heads``, ``head_conv``, ``lr``,
            ``lr_step``, ``batch_size``, ``num_workers``, ``load_model``,
            ``resume``, ``num_epochs``, ``save_all``, ``save_dir`` and
            ``val_intervals``.
    """
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    rank = opt.rank
    print(opt)
    torch.distributed.init_process_group(
        backend='hccl',
        init_method="tcp://127.0.0.1:29688",
        world_size=opt.world_size,
        rank=rank,
    )

    print('Setting up data...')
    # NOTE(review): `loc` is only bound when opt.use_npu is true, yet it is
    # used unconditionally below; running without use_npu raises NameError.
    # Left as-is because the right fallback device is not visible here.
    if opt.use_npu:
        loc = "npu:{}".format(rank)
        torch.npu.set_device(loc)

    Dataset = get_dataset(opt.dataset, opt.task)
    # Context manager so the file is closed even if the JSON is malformed.
    with open(opt.data_cfg) as cfg_file:
        data_config = json.load(cfg_file)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']

    transforms = T.Compose([T.ToTensor()])
    dataset = Dataset(opt, dataset_root, trainset_paths, (1088, 608),
                      augment=True, transforms=transforms)
    opt = opts().update_dataset_info_and_set_heads(opt, dataset)
    print(opt)

    # Logging and checkpointing are restricted to a single process.
    is_main_rank = opt.rank == 0
    if is_main_rank:
        logger = Logger(opt)

    opt.device = torch.device(loc)

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    model = model.to(loc)
    if opt.use_amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1",
                                          loss_scale=4096)
    start_epoch = 0

    train_sampler = DistributedSampler(dataset)
    train_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=opt.batch_size,
        # A sampler is always supplied, so DataLoader-level shuffling stays
        # off; ordering is controlled by the sampler instead.
        shuffle=False,
        num_workers=opt.num_workers,
        sampler=train_sampler,
        pin_memory=True,
        drop_last=True,
    )

    print('Starting training...')
    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(rank, opt)

    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(
            model, opt.load_model, trainer.optimizer, opt.resume, opt.lr,
            opt.lr_step)

    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        # Re-seed the sampler so each epoch uses a different ordering.
        train_sampler.set_epoch(epoch)
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)

        if is_main_rank:
            logger.write('epoch: {} |'.format(epoch))
            for k, v in log_dict_train.items():
                logger.scalar_summary('train_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))

            if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
                save_model(
                    os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                    epoch, model, optimizer)
            else:
                save_model(os.path.join(opt.save_dir, 'model_last.pth'),
                           epoch, model, optimizer)
            logger.write('\n')

        if epoch in opt.lr_step:
            if is_main_rank:
                save_model(
                    os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                    epoch, model, optimizer)
            # The learning-rate drop is applied in every process, not only
            # on the main rank, so all optimizers stay in step.
            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        # Parenthesised on purpose: `and` binds tighter than `or`, so without
        # the parentheses every rank would write the checkpoint whenever
        # epoch % 5 == 0.
        if (epoch % 5 == 0 or epoch >= 25) and is_main_rank:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
                epoch, model, optimizer)

    if is_main_rank:
        logger.close()
def main():
    """Parse the command-line options and hand them to ``train``."""
    train(opts().parse())
# Start training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()