"""
@author: sherlock
@contact: sherlockliao01@gmail.com
"""
import argparse
import os
import sys
import torch
if torch.__version__ >= '1.8':
import torch_npu
from torch.backends import cudnn
import torch.nn as nn
sys.path.append('.')
from config import cfg
from data import make_data_loader_dist
from engine.trainer import do_train, do_train_with_center
from modeling import build_model
from layers import make_loss, make_loss_with_center
from solver import make_optimizer, make_optimizer_with_center, WarmupMultiStepLR
from utils.logger import setup_logger
import torch.distributed as dist
from apex import amp
def _epoch_from_checkpoint_path(pretrain_path):
    """Return the epoch number encoded at the end of a checkpoint file name.

    The last ``/``-separated component of ``pretrain_path`` is taken, cut at
    its first ``.``, and the text after its last ``_`` is parsed as an
    integer (e.g. ``"out/resnet50_model_40.pth"`` -> ``40``).

    Raises:
        ValueError: if that trailing text is not a valid integer.
    """
    stem = pretrain_path.split('/')[-1].split('.')[0]
    # int() instead of eval(): the path comes from configuration and must
    # never be executed as code.
    return int(stem.split('_')[-1])


def _build_scheduler(cfg, optimizer, start_epoch=None):
    """Create a ``WarmupMultiStepLR`` from ``cfg.SOLVER``.

    ``start_epoch`` is forwarded as the trailing positional argument only
    when it is given, so a fresh run keeps the scheduler's own default.
    """
    solver = cfg.SOLVER
    scheduler_args = [
        optimizer,
        solver.STEPS,
        solver.GAMMA,
        solver.WARMUP_FACTOR,
        solver.WARMUP_ITERS,
        solver.WARMUP_METHOD,
    ]
    if start_epoch is not None:
        scheduler_args.append(start_epoch)
    return WarmupMultiStepLR(*scheduler_args)


def train(rank, cfg, args):
    """Run training for one process of an HCCL process group.

    Initialises ``torch.distributed`` with the ``hccl`` backend, builds the
    data loaders and the model (moved to the NPU), then trains either without
    or with center loss depending on ``cfg.MODEL.IF_WITH_CENTER``.

    Args:
        rank: rank of this process; also used as the DDP device id.
        cfg: project configuration node (reads ``cfg.MODEL.*`` and
            ``cfg.SOLVER.*``).
        args: parsed command-line arguments; ``world_size`` and
            ``loss_scale`` are read here.

    Raises:
        ValueError: if ``cfg.MODEL.PRETRAIN_CHOICE`` is neither ``'self'``
            nor ``'imagenet'``, or if the epoch cannot be parsed from
            ``cfg.MODEL.PRETRAIN_PATH`` when resuming.
    """
    dist.init_process_group(
        backend='hccl',
        world_size=args.world_size,
        rank=rank)
    train_loader, val_loader, num_query, num_classes = make_data_loader_dist(cfg, args, rank)
    model = build_model(cfg, num_classes).npu()

    unsupported_choice_msg = 'Only support pretrain_choice for imagenet and self, but got {}'

    if cfg.MODEL.IF_WITH_CENTER == 'no':
        print('Train without center loss, the loss type is', cfg.MODEL.METRIC_LOSS_TYPE)
        loss_func = make_loss(cfg, num_classes)
        optimizer = make_optimizer(cfg, model)

        if cfg.MODEL.PRETRAIN_CHOICE == 'self':
            # Resume: the epoch and the sibling optimizer checkpoint are both
            # derived from the model checkpoint path.
            start_epoch = _epoch_from_checkpoint_path(cfg.MODEL.PRETRAIN_PATH)
            print('Start epoch:', start_epoch)
            path_to_optimizer = cfg.MODEL.PRETRAIN_PATH.replace('model', 'optimizer')
            print('Path to the checkpoint of optimizer:', path_to_optimizer)
            model.load_state_dict(torch.load(cfg.MODEL.PRETRAIN_PATH))
            optimizer.load_state_dict(torch.load(path_to_optimizer))
            scheduler = _build_scheduler(cfg, optimizer, start_epoch)
        elif cfg.MODEL.PRETRAIN_CHOICE == 'imagenet':
            start_epoch = 0
            scheduler = _build_scheduler(cfg, optimizer)
        else:
            # Fail here with a clear message instead of crashing later on the
            # unbound ``scheduler`` / ``start_epoch`` names.
            raise ValueError(unsupported_choice_msg.format(cfg.MODEL.PRETRAIN_CHOICE))

        if "npu" in cfg.MODEL.DEVICE:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=args.loss_scale)

        do_train(
            cfg,
            model,
            train_loader,
            val_loader,
            optimizer,
            scheduler,
            loss_func,
            num_query,
            start_epoch
        )
    elif cfg.MODEL.IF_WITH_CENTER == 'yes':
        print('Train with center loss, the loss type is', cfg.MODEL.METRIC_LOSS_TYPE)
        loss_func, center_criterion = make_loss_with_center(cfg, num_classes)
        optimizer, optimizer_center = make_optimizer_with_center(cfg, model, center_criterion)

        if cfg.MODEL.PRETRAIN_CHOICE == 'self':
            # Resume: every auxiliary checkpoint path is obtained by replacing
            # 'model' in the model checkpoint path.
            start_epoch = _epoch_from_checkpoint_path(cfg.MODEL.PRETRAIN_PATH)
            print('Start epoch:', start_epoch)
            path_to_optimizer = cfg.MODEL.PRETRAIN_PATH.replace('model', 'optimizer')
            print('Path to the checkpoint of optimizer:', path_to_optimizer)
            path_to_center_param = cfg.MODEL.PRETRAIN_PATH.replace('model', 'center_param')
            print('Path to the checkpoint of center_param:', path_to_center_param)
            path_to_optimizer_center = cfg.MODEL.PRETRAIN_PATH.replace('model', 'optimizer_center')
            print('Path to the checkpoint of optimizer_center:', path_to_optimizer_center)
            model.load_state_dict(torch.load(cfg.MODEL.PRETRAIN_PATH))
            optimizer.load_state_dict(torch.load(path_to_optimizer))
            center_criterion.load_state_dict(torch.load(path_to_center_param))
            optimizer_center.load_state_dict(torch.load(path_to_optimizer_center))
            scheduler = _build_scheduler(cfg, optimizer, start_epoch)
        elif cfg.MODEL.PRETRAIN_CHOICE == 'imagenet':
            start_epoch = 0
            scheduler = _build_scheduler(cfg, optimizer)
        else:
            # Fail here with a clear message instead of crashing later on the
            # unbound ``scheduler`` / ``start_epoch`` names.
            raise ValueError(unsupported_choice_msg.format(cfg.MODEL.PRETRAIN_CHOICE))

        if "npu" in cfg.MODEL.DEVICE:
            model, [optimizer, optimizer_center] = amp.initialize(
                model,
                [optimizer, optimizer_center],
                opt_level="O2",
                loss_scale=args.loss_scale,
                combine_grad=True)
        # NOTE(review): the DDP wrap is applied regardless of the device
        # check above; confirm this matches the intended nesting.
        model = nn.parallel.DistributedDataParallel(model, device_ids=[rank], broadcast_buffers=False)

        do_train_with_center(
            cfg,
            model,
            center_criterion,
            train_loader,
            val_loader,
            optimizer,
            optimizer_center,
            scheduler,
            loss_func,
            num_query,
            start_epoch,
            args.world_size,
            rank
        )
    else:
        print("Unsupported value for cfg.MODEL.IF_WITH_CENTER {}, only support yes or no!\n".format(cfg.MODEL.IF_WITH_CENTER))
def main():
    """Parse the command line, finalise the config and start training.

    Steps, in order: parse arguments, set ``world_size`` from ``--npus``,
    export the rendezvous address/port, merge the optional config file and
    the trailing ``opts`` overrides into ``cfg`` and freeze it, create the
    output directory, set up logging, select the NPU for ``--local_rank``
    and call ``train``.
    """
    parser = argparse.ArgumentParser(description="ReID Baseline Training")
    parser.add_argument(
        "--config_file", default="", help="path to config file", type=str
    )
    parser.add_argument('--loss_scale', default="dynamic", type=str)
    parser.add_argument('-g', '--npus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-r', '--local_rank', default=0, type=int,
                        help='ranking within the npus')
    parser.add_argument("opts", help="Modify config options using the command-line", default=None,
                        nargs=argparse.REMAINDER)
    args = parser.parse_args()
    args.world_size = args.npus

    # Rendezvous endpoint read by torch.distributed's default env:// init.
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '22222'

    if args.config_file != "":
        cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        # exist_ok avoids the check-then-create race when several rank
        # processes start at the same time and all try to create the folder.
        os.makedirs(output_dir, exist_ok=True)

    logger = setup_logger("reid_baseline", output_dir, 0)
    logger.info("Using {} NPUS".format(args.world_size))
    logger.info(args)

    if args.config_file != "":
        logger.info("Loaded configuration file {}".format(args.config_file))
        with open(args.config_file, 'r') as cf:
            config_str = "\n" + cf.read()
            logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    torch.npu.set_device('npu:{}'.format(args.local_rank))
    train(args.local_rank, cfg, args)
# Script entry point: start training only when this file is executed directly.
if __name__ == "__main__":
    main()