old mode 100755
new mode 100644
old mode 100755
new mode 100644
old mode 100755
new mode 100644
old mode 100755
new mode 100644
old mode 100755
new mode 100644
old mode 100755
new mode 100644
old mode 100755
new mode 100644
old mode 100755
new mode 100644
@@ -17,6 +17,7 @@ from timm.models.registry import register_model
from timm.models.layers import DropPath, trunc_normal_, to_2tuple
+FP16_enabled=False
class PositionalEncodingFourier(nn.Module):
"""
Positional encoding relying on a fourier kernel matching the one used in the
@@ -34,14 +35,24 @@ class PositionalEncodingFourier(nn.Module):
def forward(self, B, H, W):
mask = torch.zeros(B, H, W).bool().to(self.token_projection.weight.device)
- not_mask = ~mask
- y_embed = not_mask.cumsum(1, dtype=torch.float32)
- x_embed = not_mask.cumsum(2, dtype=torch.float32)
+ # modified for fp16 onnx transfer
+ if FP16_enabled:
+ not_mask = (~mask).type(torch.int)
+ y_embed = not_mask.cumsum(1)
+ x_embed = not_mask.cumsum(2)
+ y_embed = y_embed.type(torch.float32)
+ x_embed = x_embed.type(torch.float32)
+ else:
+ # origin version
+ not_mask = ~mask
+ y_embed = not_mask.cumsum(1, dtype=torch.float32)
+ x_embed = not_mask.cumsum(2, dtype=torch.float32)
+
eps = 1e-6
y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
- dim_t = torch.arange(self.hidden_dim, dtype=torch.float32, device=mask.device)
+ dim_t = torch.arange(self.hidden_dim,dtype=torch.float32,device=mask.device)
dim_t = self.temperature ** (2 * (dim_t // 2) / self.hidden_dim)
pos_x = x_embed[:, :, :, None] / dim_t
@@ -51,25 +62,26 @@ class PositionalEncodingFourier(nn.Module):
pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(),
pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+ #pos = pos.type_as(self.token_projection.weight)
pos = self.token_projection(pos)
return pos
-def conv3x3(in_planes, out_planes, stride=1):
+def conv3x3(in_planes, out_planes, stride=1, on_cpu=False):
"""3x3 convolution with padding"""
- return torch.nn.Sequential(
- nn.Conv2d(
- in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False
- ),
- nn.SyncBatchNorm(out_planes)
- )
-
+ layers=torch.nn.Sequential()
+ layers.add_module("0",nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) )
+ if on_cpu:
+ layers.add_module("1",nn.BatchNorm2d(out_planes) )
+ else:
+ layers.add_module("1",nn.SyncBatchNorm(out_planes) )
+ return layers
class ConvPatchEmbed(nn.Module):
""" Image to Patch Embedding using multiple convolutional layers
"""
- def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, on_cpu=False):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
@@ -80,27 +92,26 @@ class ConvPatchEmbed(nn.Module):
if patch_size[0] == 16:
self.proj = torch.nn.Sequential(
- conv3x3(3, embed_dim // 8, 2),
+ conv3x3(3, embed_dim // 8, 2, on_cpu),
nn.GELU(),
- conv3x3(embed_dim // 8, embed_dim // 4, 2),
+ conv3x3(embed_dim // 8, embed_dim // 4, 2, on_cpu),
nn.GELU(),
- conv3x3(embed_dim // 4, embed_dim // 2, 2),
+ conv3x3(embed_dim // 4, embed_dim // 2, 2, on_cpu),
nn.GELU(),
- conv3x3(embed_dim // 2, embed_dim, 2),
+ conv3x3(embed_dim // 2, embed_dim, 2, on_cpu),
)
elif patch_size[0] == 8:
self.proj = torch.nn.Sequential(
- conv3x3(3, embed_dim // 4, 2),
+ conv3x3(3, embed_dim // 4, 2, on_cpu),
nn.GELU(),
- conv3x3(embed_dim // 4, embed_dim // 2, 2),
+ conv3x3(embed_dim // 4, embed_dim // 2, 2, on_cpu),
nn.GELU(),
- conv3x3(embed_dim // 2, embed_dim, 2),
+ conv3x3(embed_dim // 2, embed_dim, 2, on_cpu),
)
else:
raise("For convolutional projection, patch size has to be in [8, 16]")
def forward(self, x, padding_size=None):
- B, C, H, W = x.shape
x = self.proj(x)
Hp, Wp = x.shape[2], x.shape[3]
x = x.flatten(2).transpose(1, 2)
@@ -116,7 +127,7 @@ class LPI(nn.Module):
"""
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU,
- drop=0., kernel_size=3):
+ drop=0., kernel_size=3, on_cpu=False):
super().__init__()
out_features = out_features or in_features
@@ -125,7 +136,10 @@ class LPI(nn.Module):
self.conv1 = torch.nn.Conv2d(in_features, out_features, kernel_size=kernel_size,
padding=padding, groups=out_features)
self.act = act_layer()
- self.bn = nn.SyncBatchNorm(in_features)
+ if on_cpu:
+ self.bn = nn.BatchNorm2d(in_features)
+ else:
+ self.bn = nn.SyncBatchNorm(in_features)
self.conv2 = torch.nn.Conv2d(in_features, out_features, kernel_size=kernel_size,
padding=padding, groups=out_features)
@@ -264,7 +278,7 @@ class XCA(nn.Module):
class XCABlock(nn.Module):
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0.,
attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,
- num_tokens=196, eta=None):
+ num_tokens=196, eta=None, on_cpu=False):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = XCA(
@@ -279,7 +293,7 @@ class XCABlock(nn.Module):
drop=drop)
self.norm3 = norm_layer(dim)
- self.local_mp = LPI(in_features=dim, act_layer=act_layer)
+ self.local_mp = LPI(in_features=dim, act_layer=act_layer, on_cpu=on_cpu)
self.gamma1 = nn.Parameter(eta * torch.ones(dim), requires_grad=True)
self.gamma2 = nn.Parameter(eta * torch.ones(dim), requires_grad=True)
@@ -302,7 +316,7 @@ class XCiT(nn.Module):
def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768,
depth=12, num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None,
drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=None,
- cls_attn_layers=2, use_pos=True, patch_proj='linear', eta=None, tokens_norm=False):
+ cls_attn_layers=2, use_pos=True, patch_proj='linear', eta=None, tokens_norm=False, on_cpu=False,fp16=False):
"""
Args:
img_size (int, tuple): input image size
@@ -325,12 +339,15 @@ class XCiT(nn.Module):
tokens_norm: (bool) Whether to normalize all tokens or just the cls_token in the CA
"""
super().__init__()
+ if fp16:
+ global FP16_enabled
+ FP16_enabled = True
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
self.patch_embed = ConvPatchEmbed(img_size=img_size, embed_dim=embed_dim,
- patch_size=patch_size)
+ patch_size=patch_size, on_cpu=on_cpu)
num_patches = self.patch_embed.num_patches
@@ -342,7 +359,7 @@ class XCiT(nn.Module):
XCABlock(
dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i],
- norm_layer=norm_layer, num_tokens=num_patches, eta=eta)
+ norm_layer=norm_layer, num_tokens=num_patches, eta=eta, on_cpu=on_cpu)
for i in range(depth)])
self.cls_attn_blocks = nn.ModuleList([