diff --git a/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py b/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py

index e9eb3579..42585f1d 100644

--- a/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py

+++ b/mmdet/core/bbox/coder/delta_xywh_bbox_coder.py

@@ -1,3 +1,18 @@

+# Copyright 2021 Huawei Technologies Co., Ltd

+#

+# Licensed under the Apache License, Version 2.0 (the "License");

+# you may not use this file except in compliance with the License.

+# You may obtain a copy of the License at

+#

+#     http://www.apache.org/licenses/LICENSE-2.0

+#

+# Unless required by applicable law or agreed to in writing, software

+# distributed under the License is distributed on an "AS IS" BASIS,

+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+# See the License for the specific language governing permissions and

+# limitations under the License.

+

+

 import numpy as np

 import torch

 

@@ -168,8 +183,13 @@ def delta2bbox(rois,

                 [0.0000, 0.3161, 4.1945, 0.6839],

                 [5.0000, 5.0000, 5.0000, 5.0000]])

     """

-    means = deltas.new_tensor(means).view(1, -1).repeat(1, deltas.size(1) // 4)

-    stds = deltas.new_tensor(stds).view(1, -1).repeat(1, deltas.size(1) // 4)

+    # fix shape for means and stds for onnx

+    if torch.onnx.is_in_onnx_export():

+        means = deltas.new_tensor(means).view(1, -1).repeat(1, deltas.size(1).numpy() // 4)

+        stds = deltas.new_tensor(stds).view(1, -1).repeat(1, deltas.size(1).numpy() // 4)

+    else:

+        means = deltas.new_tensor(means).view(1, -1).repeat(1, deltas.size(1) // 4)

+        stds = deltas.new_tensor(stds).view(1, -1).repeat(1, deltas.size(1) // 4)

     denorm_deltas = deltas * stds + means

     dx = denorm_deltas[:, 0::4]

     dy = denorm_deltas[:, 1::4]

@@ -178,12 +198,22 @@ def delta2bbox(rois,

     max_ratio = np.abs(np.log(wh_ratio_clip))

     dw = dw.clamp(min=-max_ratio, max=max_ratio)

     dh = dh.clamp(min=-max_ratio, max=max_ratio)

-    # Compute center of each roi

-    px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx)

-    py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy)

-    # Compute width/height of each roi

-    pw = (rois[:, 2] - rois[:, 0]).unsqueeze(1).expand_as(dw)

-    ph = (rois[:, 3] - rois[:, 1]).unsqueeze(1).expand_as(dh)

+    # improve gather performance on NPU

+    if torch.onnx.is_in_onnx_export():

+        rois_perf = rois.permute(1, 0)

+        # Compute center of each roi

+        px = ((rois_perf[0, :] + rois_perf[2, :]) * 0.5).unsqueeze(1).expand_as(dx)

+        py = ((rois_perf[1, :] + rois_perf[3, :]) * 0.5).unsqueeze(1).expand_as(dy)

+        # Compute width/height of each roi

+        pw = (rois_perf[2, :] - rois_perf[0, :]).unsqueeze(1).expand_as(dw)

+        ph = (rois_perf[3, :] - rois_perf[1, :]).unsqueeze(1).expand_as(dh)

+    else:

+        # Compute center of each roi

+        px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx)

+        py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy)

+        # Compute width/height of each roi

+        pw = (rois[:, 2] - rois[:, 0]).unsqueeze(1).expand_as(dw)

+        ph = (rois[:, 3] - rois[:, 1]).unsqueeze(1).expand_as(dh)

     # Use exp(network energy) to enlarge/shrink each roi

     gw = pw * dw.exp()

     gh = ph * dh.exp()

diff --git a/mmdet/core/post_processing/bbox_nms.py b/mmdet/core/post_processing/bbox_nms.py

index 463fe2e4..a0e2cc53 100644

--- a/mmdet/core/post_processing/bbox_nms.py

+++ b/mmdet/core/post_processing/bbox_nms.py

@@ -1,9 +1,74 @@

+# Copyright 2021 Huawei Technologies Co., Ltd

+#

+# Licensed under the Apache License, Version 2.0 (the "License");

+# you may not use this file except in compliance with the License.

+# You may obtain a copy of the License at

+#

+#     http://www.apache.org/licenses/LICENSE-2.0

+#

+# Unless required by applicable law or agreed to in writing, software

+# distributed under the License is distributed on an "AS IS" BASIS,

+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+# See the License for the specific language governing permissions and

+# limitations under the License.

+

 import torch

 from mmcv.ops.nms import batched_nms

 

 from mmdet.core.bbox.iou_calculators import bbox_overlaps

 

 

+class BatchNMSOp(torch.autograd.Function):

+    @staticmethod

+    def forward(ctx, bboxes, scores, score_threshold, iou_threshold, max_size_per_class, max_total_size):

+        """

+        boxes (torch.Tensor): boxes in shape (batch, N, C, 4).

+        scores (torch.Tensor): scores in shape (batch, N, C).

+        return:

+            nmsed_boxes: (1, N, 4)

+            nmsed_scores: (1, N)

+            nmsed_classes: (1, N)

+            nmsed_num: (1,)

+        """

+

+        # Phony implementation for onnx export

+        nmsed_boxes = bboxes[:, :max_total_size, 0, :]

+        nmsed_scores = scores[:, :max_total_size, 0]

+        nmsed_classes = torch.arange(max_total_size, dtype=torch.long)

+        nmsed_num = torch.Tensor([max_total_size])

+

+        return nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_num

+

+    @staticmethod

+    def symbolic(g, bboxes, scores, score_thr, iou_thr, max_size_p_class, max_t_size):

+        nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_num = g.op('BatchMultiClassNMS',

+            bboxes, scores, score_threshold_f=score_thr, iou_threshold_f=iou_thr,

+            max_size_per_class_i=max_size_p_class, max_total_size_i=max_t_size, outputs=4)

+        return nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_num

+

+def batch_nms_op(bboxes, scores, score_threshold, iou_threshold, max_size_per_class, max_total_size):

+    """

+    boxes (torch.Tensor): boxes in shape (N, 4).

+    scores (torch.Tensor): scores in shape (N, ).

+    """

+

+    if bboxes.dtype == torch.float32:

+        bboxes = bboxes.reshape(1, bboxes.shape[0].numpy(), -1, 4).half()

+        scores = scores.reshape(1, scores.shape[0].numpy(), -1).half()

+    else:

+        bboxes = bboxes.reshape(1, bboxes.shape[0].numpy(), -1, 4)

+        scores = scores.reshape(1, scores.shape[0].numpy(), -1)

+

+    nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_num = BatchNMSOp.apply(bboxes, scores,

+        score_threshold, iou_threshold, max_size_per_class, max_total_size)

+    nmsed_boxes = nmsed_boxes.float()

+    nmsed_scores = nmsed_scores.float()

+    nmsed_classes = nmsed_classes.long()

+    dets = torch.cat((nmsed_boxes.reshape((max_total_size, 4)), nmsed_scores.reshape((max_total_size, 1))), -1)

+    labels = nmsed_classes.reshape((max_total_size, ))

+    return dets, labels

+

+

 def multiclass_nms(multi_bboxes,

                    multi_scores,

                    score_thr,

@@ -36,13 +101,25 @@ def multiclass_nms(multi_bboxes,

     if multi_bboxes.shape[1] > 4:

         bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)

     else:

-        bboxes = multi_bboxes[:, None].expand(

-            multi_scores.size(0), num_classes, 4)

+        # export expand operator to onnx more nicely

+        if torch.onnx.is_in_onnx_export:

+            bbox_shape_tensor = torch.ones(multi_scores.size(0), num_classes, 4)

+            bboxes = multi_bboxes[:, None].expand_as(bbox_shape_tensor)

+        else:

+            bboxes = multi_bboxes[:, None].expand(

+                multi_scores.size(0), num_classes, 4)

+

 

     scores = multi_scores[:, :-1]

     if score_factors is not None:

         scores = scores * score_factors[:, None]

 

+    # npu

+    if torch.onnx.is_in_onnx_export():

+        dets, labels = batch_nms_op(bboxes, scores, score_thr, nms_cfg.get("iou_threshold"), max_num, max_num)

+        return dets, labels

+

+    # cpu and gpu

     labels = torch.arange(num_classes, dtype=torch.long)

     labels = labels.view(1, -1).expand_as(scores)

 

@@ -53,6 +130,8 @@ def multiclass_nms(multi_bboxes,

     # remove low scoring boxes

     valid_mask = scores > score_thr

     inds = valid_mask.nonzero(as_tuple=False).squeeze(1)

+    # vals, inds = torch.topk(scores, 1000)

+

     bboxes, scores, labels = bboxes[inds], scores[inds], labels[inds]

     if inds.numel() == 0:

         if torch.onnx.is_in_onnx_export():

@@ -76,6 +155,7 @@ def multiclass_nms(multi_bboxes,

         return dets, labels[keep]

 

 

+

 def fast_nms(multi_bboxes,

              multi_scores,

              multi_coeffs,

diff --git a/mmdet/models/dense_heads/rpn_head.py b/mmdet/models/dense_heads/rpn_head.py

index f565d1a4..7fdb0861 100644

--- a/mmdet/models/dense_heads/rpn_head.py

+++ b/mmdet/models/dense_heads/rpn_head.py

@@ -1,3 +1,18 @@

+# Copyright 2021 Huawei Technologies Co., Ltd

+#

+# Licensed under the Apache License, Version 2.0 (the "License");

+# you may not use this file except in compliance with the License.

+# You may obtain a copy of the License at

+#

+#     http://www.apache.org/licenses/LICENSE-2.0

+#

+# Unless required by applicable law or agreed to in writing, software

+# distributed under the License is distributed on an "AS IS" BASIS,

+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+# See the License for the specific language governing permissions and

+# limitations under the License.

+

+

 import torch

 import torch.nn as nn

 import torch.nn.functional as F

@@ -9,6 +24,56 @@ from .anchor_head import AnchorHead

 from .rpn_test_mixin import RPNTestMixin

 

 

+class BatchNMSOp(torch.autograd.Function):

+    @staticmethod

+    def forward(ctx, bboxes, scores, score_threshold, iou_threshold, max_size_per_class, max_total_size):

+        """

+        boxes (torch.Tensor): boxes in shape (batch, N, C, 4).

+        scores (torch.Tensor): scores in shape (batch, N, C).

+        return:

+            nmsed_boxes: (1, N, 4)

+            nmsed_scores: (1, N)

+            nmsed_classes: (1, N)

+            nmsed_num: (1,)

+        """

+

+        # Phony implementation for onnx export

+        nmsed_boxes = bboxes[:, :max_total_size, 0, :]

+        nmsed_scores = scores[:, :max_total_size, 0]

+        nmsed_classes = torch.arange(max_total_size, dtype=torch.long)

+        nmsed_num = torch.Tensor([max_total_size])

+

+        return nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_num

+

+    @staticmethod

+    def symbolic(g, bboxes, scores, score_thr, iou_thr, max_size_p_class, max_t_size):

+        nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_num = g.op('BatchMultiClassNMS',

+            bboxes, scores, score_threshold_f=score_thr, iou_threshold_f=iou_thr,

+            max_size_per_class_i=max_size_p_class, max_total_size_i=max_t_size, outputs=4)

+        return nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_num

+

+def batch_nms_op(bboxes, scores, score_threshold, iou_threshold, max_size_per_class, max_total_size):

+    """

+    boxes (torch.Tensor): boxes in shape (N, 4).

+    scores (torch.Tensor): scores in shape (N, ).

+    """

+

+    if bboxes.dtype == torch.float32:

+        bboxes = bboxes.reshape(1, bboxes.shape[0].numpy(), -1, 4).half()

+        scores = scores.reshape(1, scores.shape[0].numpy(), -1).half()

+    else:

+        bboxes = bboxes.reshape(1, bboxes.shape[0].numpy(), -1, 4)

+        scores = scores.reshape(1, scores.shape[0].numpy(), -1)

+

+    nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_num = BatchNMSOp.apply(bboxes, scores,

+        score_threshold, iou_threshold, max_size_per_class, max_total_size) # max_total_size num_bbox

+    nmsed_boxes = nmsed_boxes.float()

+    nmsed_scores = nmsed_scores.float()

+    nmsed_classes = nmsed_classes.long()

+    dets = torch.cat((nmsed_boxes.reshape((max_total_size, 4)), nmsed_scores.reshape((max_total_size, 1))), -1)

+    labels = nmsed_classes.reshape((max_total_size, ))

+    return dets, labels

+

 @HEADS.register_module()

 class RPNHead(RPNTestMixin, AnchorHead):

     """RPN head.

@@ -132,9 +197,14 @@ class RPNHead(RPNTestMixin, AnchorHead):

             if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre:

                 # sort is faster than topk

                 # _, topk_inds = scores.topk(cfg.nms_pre)

-                ranked_scores, rank_inds = scores.sort(descending=True)

-                topk_inds = rank_inds[:cfg.nms_pre]

-                scores = ranked_scores[:cfg.nms_pre]

+                # onnx uses topk to sort, this is simpler for onnx export

+                if torch.onnx.is_in_onnx_export():

+                    scores, topk_inds = torch.topk(scores, cfg.nms_pre)

+                else:

+                    ranked_scores, rank_inds = scores.sort(descending=True)

+                    topk_inds = rank_inds[:cfg.nms_pre]

+                    scores = ranked_scores[:cfg.nms_pre]

+

                 rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]

                 anchors = anchors[topk_inds, :]

             mlvl_scores.append(scores)

@@ -164,5 +234,11 @@ class RPNHead(RPNTestMixin, AnchorHead):

 

         # TODO: remove the hard coded nms type

         nms_cfg = dict(type='nms', iou_threshold=cfg.nms_thr)

-        dets, keep = batched_nms(proposals, scores, ids, nms_cfg)

-        return dets[:cfg.nms_post]

+        # npu return

+        if torch.onnx.is_in_onnx_export():

+            dets, labels = batch_nms_op(proposals, scores, 0.0, nms_cfg.get("iou_threshold"), cfg.nms_post, cfg.nms_post)

+            return dets

+        # cpu and gpu return

+        else:

+            dets, keep = batched_nms(proposals, scores, ids, nms_cfg)

+            return dets[:cfg.nms_post]

diff --git a/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py b/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py

index c0eebc4a..c20611bf 100644

--- a/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py

+++ b/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py

@@ -1,9 +1,48 @@

+# Copyright 2021 Huawei Technologies Co., Ltd

+#

+# Licensed under the Apache License, Version 2.0 (the "License");

+# you may not use this file except in compliance with the License.

+# You may obtain a copy of the License at

+#

+#     http://www.apache.org/licenses/LICENSE-2.0

+#

+# Unless required by applicable law or agreed to in writing, software

+# distributed under the License is distributed on an "AS IS" BASIS,

+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+# See the License for the specific language governing permissions and

+# limitations under the License.

+

+

 import torch

 from mmcv.runner import force_fp32

 

 from mmdet.models.builder import ROI_EXTRACTORS

 from .base_roi_extractor import BaseRoIExtractor

 

+import torch.onnx.symbolic_helper as sym_help

+

+class RoiExtractor(torch.autograd.Function):

+    @staticmethod

+    def forward(self, f0, f1, f2, f3, rois, aligned=1, finest_scale=56, pooled_height=7, pooled_width=7,

+                         pool_mode='avg', roi_scale_factor=0, sample_num=0, spatial_scale=[0.25, 0.125, 0.0625, 0.03125]):

+        """

+        feats (torch.Tensor): feats in shape (batch, 256, H, W).

+        rois (torch.Tensor): rois in shape (k, 5).

+        return:

+            roi_feats (torch.Tensor): (k, 256, pooled_width, pooled_width)

+        """

+

+        # phony implementation for shape inference

+        k = rois.size()[0]

+        roi_feats = torch.ones(k, 256, pooled_height, pooled_width)

+        return roi_feats

+

+    @staticmethod

+    def symbolic(g, f0, f1, f2, f3, rois):

+        # TODO: support tensor list type for feats

+        roi_feats = g.op('RoiExtractor', f0, f1, f2, f3, rois, aligned_i=1, finest_scale_i=56, pooled_height_i=7, pooled_width_i=7,

+                         pool_mode_s='avg', roi_scale_factor_i=0, sample_num_i=0, spatial_scale_f=[0.25, 0.125, 0.0625, 0.03125], outputs=1)

+        return roi_feats

 

 @ROI_EXTRACTORS.register_module()

 class SingleRoIExtractor(BaseRoIExtractor):

@@ -52,6 +91,12 @@ class SingleRoIExtractor(BaseRoIExtractor):

 

     @force_fp32(apply_to=('feats', ), out_fp16=True)

     def forward(self, feats, rois, roi_scale_factor=None):

+        # Work around to export onnx for npu

+        if torch.onnx.is_in_onnx_export():

+            roi_feats = RoiExtractor.apply(feats[0], feats[1], feats[2], feats[3], rois)

+            # roi_feats = RoiExtractor.apply(list(feats), rois)

+            return roi_feats

+

         """Forward function."""

         out_size = self.roi_layers[0].output_size

         num_levels = len(feats)

diff --git a/tools/pytorch2onnx.py b/tools/pytorch2onnx.py

index a8e7487b..81f9fde9 100644

--- a/tools/pytorch2onnx.py

+++ b/tools/pytorch2onnx.py

@@ -1,3 +1,17 @@

+# Copyright 2021 Huawei Technologies Co., Ltd

+#

+# Licensed under the Apache License, Version 2.0 (the "License");

+# you may not use this file except in compliance with the License.

+# You may obtain a copy of the License at

+#

+#     http://www.apache.org/licenses/LICENSE-2.0

+#

+# Unless required by applicable law or agreed to in writing, software

+# distributed under the License is distributed on an "AS IS" BASIS,

+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

+# See the License for the specific language governing permissions and

+# limitations under the License.

+

 import argparse

 import os.path as osp

 

@@ -20,7 +34,8 @@ def pytorch2onnx(config_path,

                  verify=False,

                  normalize_cfg=None,

                  dataset='coco',

-                 test_img=None):

+                 test_img=None,

+                 enable_onnx_checker=False):

 

     input_config = {

         'input_shape': input_shape,

@@ -49,7 +64,8 @@ def pytorch2onnx(config_path,

         keep_initializers_as_inputs=True,

         do_constant_folding=True,

         verbose=show,

-        opset_version=opset_version)

+        opset_version=opset_version,

+        enable_onnx_checker=False)

 

     model.forward = orig_model.forward

     print(f'Successfully exported ONNX model: {output_file}')