05360171创建于 2022年3月18日历史提交
diff --git a/detection/tools/dist_test.sh b/detection/tools/dist_test.sh
old mode 100755
new mode 100644
diff --git a/detection/tools/dist_train.sh b/detection/tools/dist_train.sh
old mode 100755
new mode 100644
diff --git a/detection/tools/slurm_test.sh b/detection/tools/slurm_test.sh
old mode 100755
new mode 100644
diff --git a/detection/tools/slurm_train.sh b/detection/tools/slurm_train.sh
old mode 100755
new mode 100644
diff --git a/semantic_segmentation/tools/dist_test.sh b/semantic_segmentation/tools/dist_test.sh
old mode 100755
new mode 100644
diff --git a/semantic_segmentation/tools/dist_train.sh b/semantic_segmentation/tools/dist_train.sh
old mode 100755
new mode 100644
diff --git a/semantic_segmentation/tools/slurm_test.sh b/semantic_segmentation/tools/slurm_test.sh
old mode 100755
new mode 100644
diff --git a/semantic_segmentation/tools/slurm_train.sh b/semantic_segmentation/tools/slurm_train.sh
old mode 100755
new mode 100644
diff --git a/xcit.py b/xcit.py
index 8a7a7da..f499f3b 100644
--- a/xcit.py
+++ b/xcit.py
@@ -17,6 +17,7 @@ from timm.models.registry import register_model
 from timm.models.layers import DropPath, trunc_normal_, to_2tuple
 
 
+FP16_enabled=False
 class PositionalEncodingFourier(nn.Module):
     """
     Positional encoding relying on a fourier kernel matching the one used in the
@@ -34,14 +35,24 @@ class PositionalEncodingFourier(nn.Module):
 
     def forward(self, B, H, W):
         mask = torch.zeros(B, H, W).bool().to(self.token_projection.weight.device)
-        not_mask = ~mask
-        y_embed = not_mask.cumsum(1, dtype=torch.float32)
-        x_embed = not_mask.cumsum(2, dtype=torch.float32)
+        # modified for fp16 onnx transfer
+        if FP16_enabled:
+            not_mask = (~mask).type(torch.int)
+            y_embed = not_mask.cumsum(1)
+            x_embed = not_mask.cumsum(2)
+            y_embed = y_embed.type(torch.float32)
+            x_embed = x_embed.type(torch.float32)
+        else:
+            # origin version
+            not_mask = ~mask
+            y_embed = not_mask.cumsum(1, dtype=torch.float32)
+            x_embed = not_mask.cumsum(2, dtype=torch.float32)
+
         eps = 1e-6
         y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
         x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
 
-        dim_t = torch.arange(self.hidden_dim, dtype=torch.float32, device=mask.device)
+        dim_t = torch.arange(self.hidden_dim,dtype=torch.float32,device=mask.device)
         dim_t = self.temperature ** (2 * (dim_t // 2) / self.hidden_dim)
 
         pos_x = x_embed[:, :, :, None] / dim_t
@@ -51,25 +62,26 @@ class PositionalEncodingFourier(nn.Module):
         pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(),
                              pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
         pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+        #pos = pos.type_as(self.token_projection.weight)
         pos = self.token_projection(pos)
         return pos
 
 
-def conv3x3(in_planes, out_planes, stride=1):
+def conv3x3(in_planes, out_planes, stride=1, on_cpu=False):
     """3x3 convolution with padding"""
-    return torch.nn.Sequential(
-        nn.Conv2d(
-            in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False
-        ),
-        nn.SyncBatchNorm(out_planes)
-    )
-
+    layers=torch.nn.Sequential()
+    layers.add_module("0",nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) )
+    if on_cpu:
+        layers.add_module("1",nn.BatchNorm2d(out_planes) )
+    else:
+        layers.add_module("1",nn.SyncBatchNorm(out_planes) )
+    return layers
 
 class ConvPatchEmbed(nn.Module):
     """ Image to Patch Embedding using multiple convolutional layers
     """
 
-    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, on_cpu=False):
         super().__init__()
         img_size = to_2tuple(img_size)
         patch_size = to_2tuple(patch_size)
@@ -80,27 +92,26 @@ class ConvPatchEmbed(nn.Module):
 
         if patch_size[0] == 16:
             self.proj = torch.nn.Sequential(
-                conv3x3(3, embed_dim // 8, 2),
+                conv3x3(3, embed_dim // 8, 2, on_cpu),
                 nn.GELU(),
-                conv3x3(embed_dim // 8, embed_dim // 4, 2),
+                conv3x3(embed_dim // 8, embed_dim // 4, 2, on_cpu),
                 nn.GELU(),
-                conv3x3(embed_dim // 4, embed_dim // 2, 2),
+                conv3x3(embed_dim // 4, embed_dim // 2, 2, on_cpu),
                 nn.GELU(),
-                conv3x3(embed_dim // 2, embed_dim, 2),
+                conv3x3(embed_dim // 2, embed_dim, 2, on_cpu),
             )
         elif patch_size[0] == 8:
             self.proj = torch.nn.Sequential(
-                conv3x3(3, embed_dim // 4, 2),
+                conv3x3(3, embed_dim // 4, 2, on_cpu),
                 nn.GELU(),
-                conv3x3(embed_dim // 4, embed_dim // 2, 2),
+                conv3x3(embed_dim // 4, embed_dim // 2, 2, on_cpu),
                 nn.GELU(),
-                conv3x3(embed_dim // 2, embed_dim, 2),
+                conv3x3(embed_dim // 2, embed_dim, 2, on_cpu),
             )
         else:
             raise("For convolutional projection, patch size has to be in [8, 16]")
 
     def forward(self, x, padding_size=None):
-        B, C, H, W = x.shape
         x = self.proj(x)
         Hp, Wp = x.shape[2], x.shape[3]
         x = x.flatten(2).transpose(1, 2)
@@ -116,7 +127,7 @@ class LPI(nn.Module):
     """
 
     def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU,
-                 drop=0., kernel_size=3):
+                 drop=0., kernel_size=3, on_cpu=False):
         super().__init__()
         out_features = out_features or in_features
 
@@ -125,7 +136,10 @@ class LPI(nn.Module):
         self.conv1 = torch.nn.Conv2d(in_features, out_features, kernel_size=kernel_size,
                                      padding=padding, groups=out_features)
         self.act = act_layer()
-        self.bn = nn.SyncBatchNorm(in_features)
+        if on_cpu:
+            self.bn = nn.BatchNorm2d(in_features)
+        else:
+            self.bn = nn.SyncBatchNorm(in_features)
         self.conv2 = torch.nn.Conv2d(in_features, out_features, kernel_size=kernel_size,
                                      padding=padding, groups=out_features)
 
@@ -264,7 +278,7 @@ class XCA(nn.Module):
 class XCABlock(nn.Module):
     def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0.,
                  attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,
-                 num_tokens=196, eta=None):
+                 num_tokens=196, eta=None, on_cpu=False):
         super().__init__()
         self.norm1 = norm_layer(dim)
         self.attn = XCA(
@@ -279,7 +293,7 @@ class XCABlock(nn.Module):
                        drop=drop)
 
         self.norm3 = norm_layer(dim)
-        self.local_mp = LPI(in_features=dim, act_layer=act_layer)
+        self.local_mp = LPI(in_features=dim, act_layer=act_layer, on_cpu=on_cpu)
 
         self.gamma1 = nn.Parameter(eta * torch.ones(dim), requires_grad=True)
         self.gamma2 = nn.Parameter(eta * torch.ones(dim), requires_grad=True)
@@ -302,7 +316,7 @@ class XCiT(nn.Module):
     def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768,
                  depth=12, num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None,
                  drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=None,
-                 cls_attn_layers=2, use_pos=True, patch_proj='linear', eta=None, tokens_norm=False):
+                 cls_attn_layers=2, use_pos=True, patch_proj='linear', eta=None, tokens_norm=False, on_cpu=False,fp16=False):
         """
         Args:
             img_size (int, tuple): input image size
@@ -325,12 +339,15 @@ class XCiT(nn.Module):
             tokens_norm: (bool) Whether to normalize all tokens or just the cls_token in the CA
         """
         super().__init__()
+        if fp16:
+            global FP16_enabled
+            FP16_enabled = True
         self.num_classes = num_classes
         self.num_features = self.embed_dim = embed_dim
         norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
 
         self.patch_embed = ConvPatchEmbed(img_size=img_size, embed_dim=embed_dim,
-                                          patch_size=patch_size)
+                                          patch_size=patch_size, on_cpu=on_cpu)
 
         num_patches = self.patch_embed.num_patches
 
@@ -342,7 +359,7 @@ class XCiT(nn.Module):
             XCABlock(
                 dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
                 qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i],
-                norm_layer=norm_layer, num_tokens=num_patches, eta=eta)
+                norm_layer=norm_layer, num_tokens=num_patches, eta=eta, on_cpu=on_cpu)
             for i in range(depth)])
 
         self.cls_attn_blocks = nn.ModuleList([