import torch
from torchvision import transforms
import cv2
import numpy as np
import types
from numpy import random
from math import sqrt
from data import cfg, MEANS, STD
def intersect(box_a, box_b):
    """Compute the intersection area between each box in box_a and box_b.

    Args:
        box_a: array indexed as [:, :2] / [:, 2:], i.e. one box per row in
            (x1, y1, x2, y2) corner form.
        box_b: a single box indexed as [:2] / [2:].
    Return:
        1-D array with one intersection area per row of box_a.
    """
    lower_right = np.minimum(box_a[:, 2:], box_b[2:])
    upper_left = np.maximum(box_a[:, :2], box_b[:2])
    # Negative extents mean "no overlap"; clamp them to zero.
    extent = np.clip(lower_right - upper_left, a_min=0, a_max=np.inf)
    overlap_w = extent[:, 0]
    overlap_h = extent[:, 1]
    return overlap_w * overlap_h
def jaccard_numpy(box_a, box_b):
    """Compute the jaccard overlap (intersection over union) of each box in
    box_a with the single box box_b.
    E.g.:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
    Args:
        box_a: Multiple bounding boxes, Shape: [num_boxes,4]
        box_b: Single bounding box, Shape: [4]
    Return:
        jaccard overlap: one value per row of box_a, Shape: [num_boxes]
    """
    overlap = intersect(box_a, box_b)
    widths_a = box_a[:, 2] - box_a[:, 0]
    heights_a = box_a[:, 3] - box_a[:, 1]
    area_a = widths_a * heights_a
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return overlap / (area_a + area_b - overlap)
class Compose(object):
    """Chains several augmentations into one callable.
    Args:
        transforms (List[Transform]): transforms to apply, in order. Each one
            is called with (img, masks, boxes, labels) and must return the
            same four items.
    Example:
        >>> augmentations.Compose([
        >>>     transforms.CenterCrop(10),
        >>>     transforms.ToTensor(),
        >>> ])
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, masks=None, boxes=None, labels=None):
        sample = (img, masks, boxes, labels)
        for step in self.transforms:
            # Unpack explicitly so a transform returning the wrong number of
            # items fails here rather than further down the pipeline.
            img, masks, boxes, labels = step(*sample)
            sample = (img, masks, boxes, labels)
        return sample
class Lambda(object):
    """Wraps a plain function so it can be used as a transform.

    The wrapped callable receives (img, masks, boxes, labels) and its return
    value is passed through untouched.
    """

    def __init__(self, lambd):
        # types.LambdaType is the ordinary function type, so `def` functions
        # are accepted as well as lambdas.
        assert isinstance(lambd, types.LambdaType)
        self.lambd = lambd

    def __call__(self, img, masks=None, boxes=None, labels=None):
        wrapped = self.lambd
        return wrapped(img, masks, boxes, labels)
class ConvertFromInts(object):
    """Casts the image to float32; masks, boxes and labels pass through."""

    def __call__(self, image, masks=None, boxes=None, labels=None):
        as_float = image.astype(np.float32)
        return as_float, masks, boxes, labels
class ToAbsoluteCoords(object):
    """Scales boxes by the image size: x columns (0, 2) by the width and
    y columns (1, 3) by the height. The boxes array is modified in place."""

    def __call__(self, image, masks=None, boxes=None, labels=None):
        height, width, _ = image.shape
        boxes[:, [0, 2]] *= width
        boxes[:, [1, 3]] *= height
        return image, masks, boxes, labels
class ToPercentCoords(object):
    """Divides boxes by the image size: x columns (0, 2) by the width and
    y columns (1, 3) by the height. The boxes array is modified in place."""

    def __call__(self, image, masks=None, boxes=None, labels=None):
        height, width, _ = image.shape
        boxes[:, [0, 2]] /= width
        boxes[:, [1, 3]] /= height
        return image, masks, boxes, labels
class Pad(object):
    """
    Pads the image to the input width and height, filling the
    background with mean and putting the image in the top-left.
    When pad_gt is set, the masks are padded with zeros the same way.

    Note: this expects im_w <= width and im_h <= height
    """

    def __init__(self, width, height, mean=MEANS, pad_gt=True):
        self.width = width
        self.height = height
        self.mean = mean
        self.pad_gt = pad_gt

    def __call__(self, image, masks, boxes=None, labels=None):
        im_h, im_w, depth = image.shape

        canvas = np.zeros((self.height, self.width, depth), dtype=image.dtype)
        canvas[:, :, :] = self.mean
        canvas[:im_h, :im_w] = image

        if not self.pad_gt:
            return canvas, masks, boxes, labels

        padded_masks = np.zeros((masks.shape[0], self.height, self.width),
                                dtype=masks.dtype)
        padded_masks[:, :im_h, :im_w] = masks
        return canvas, padded_masks, boxes, labels
class Resize(object):
    """Resizes the image (and optionally masks and boxes) to the configured
    size, then drops boxes that are too small.

    If preserve_aspect_ratio is true, this resizes to an approximate area of
    max_size * max_size; otherwise the output is max_size x max_size.
    """

    @staticmethod
    def calc_size_preserve_ar(img_w, img_h, max_size):
        """Return (w, h) with the input aspect ratio and w*h ~= max_size^2."""
        aspect = sqrt(img_w / img_h)
        return int(max_size * aspect), int(max_size / aspect)

    def __init__(self, resize_gt=True):
        self.resize_gt = resize_gt
        self.max_size = cfg.max_size
        self.preserve_aspect_ratio = cfg.preserve_aspect_ratio

    def __call__(self, image, masks, boxes, labels=None):
        img_h, img_w, _ = image.shape

        if self.preserve_aspect_ratio:
            width, height = Resize.calc_size_preserve_ar(img_w, img_h, self.max_size)
        else:
            width, height = self.max_size, self.max_size

        image = cv2.resize(image, (width, height))

        if self.resize_gt:
            # Move the mask index last so cv2 treats every mask as a channel.
            masks = cv2.resize(masks.transpose((1, 2, 0)), (width, height))

            if len(masks.shape) == 2:
                # A single mask comes back without its channel axis.
                masks = np.expand_dims(masks, 0)
            else:
                masks = masks.transpose((2, 0, 1))

            # Scale the boxes by the same factors as the image.
            boxes[:, [0, 2]] *= (width / img_w)
            boxes[:, [1, 3]] *= (height / img_h)

        # Discard boxes (with their masks and labels) below the configured size.
        box_w = boxes[:, 2] - boxes[:, 0]
        box_h = boxes[:, 3] - boxes[:, 1]
        keep = (box_w > cfg.discard_box_width) * (box_h > cfg.discard_box_height)

        masks = masks[keep]
        boxes = boxes[keep]
        kept_labels = labels['labels'][keep]
        labels['labels'] = kept_labels
        # Negative labels are counted as crowd annotations.
        labels['num_crowds'] = (kept_labels < 0).sum()

        return image, masks, boxes, labels
class RandomSaturation(object):
    """With probability 1/2, scales channel 1 of the image in place by a
    factor drawn uniformly from [lower, upper].

    Channel 1 is presumably saturation: PhotometricDistort runs this on an
    HSV image.

    Args:
        lower (float): smallest scale factor, must be non-negative.
        upper (float): largest scale factor, must be >= lower.
    """

    def __init__(self, lower=0.5, upper=1.5):
        self.lower = lower
        self.upper = upper
        # The messages previously said "contrast" (copied from RandomContrast).
        assert self.upper >= self.lower, "saturation upper must be >= lower."
        assert self.lower >= 0, "saturation lower must be non-negative."

    def __call__(self, image, masks=None, boxes=None, labels=None):
        if random.randint(2):
            image[:, :, 1] *= random.uniform(self.lower, self.upper)

        return image, masks, boxes, labels
class RandomHue(object):
    """With probability 1/2, shifts channel 0 of the image in place by a
    value drawn uniformly from [-delta, delta], wrapping the result back
    into the 0..360 range.

    Args:
        delta (float): maximum shift, between 0 and 360.
    """

    def __init__(self, delta=18.0):
        assert 0.0 <= delta <= 360.0
        self.delta = delta

    def __call__(self, image, masks=None, boxes=None, labels=None):
        if not random.randint(2):
            return image, masks, boxes, labels

        # `hue` is a view, so every update below lands in `image`.
        hue = image[:, :, 0]
        hue += random.uniform(-self.delta, self.delta)
        hue[hue > 360.0] -= 360.0
        hue[hue < 0.0] += 360.0
        return image, masks, boxes, labels
class RandomLightingNoise(object):
    """Placeholder transform: stores the six channel permutations in
    `perms` but currently returns its inputs unchanged."""

    def __init__(self):
        self.perms = (
            (0, 1, 2), (0, 2, 1),
            (1, 0, 2), (1, 2, 0),
            (2, 0, 1), (2, 1, 0),
        )

    def __call__(self, image, masks=None, boxes=None, labels=None):
        # Intentionally a no-op: `perms` is not consulted.
        unchanged = (image, masks, boxes, labels)
        return unchanged
class ConvertColor(object):
    """Converts the image between the BGR and HSV colour spaces with OpenCV.

    Args:
        current (str): colour space of the incoming image ('BGR' or 'HSV').
        transform (str): colour space to convert to ('HSV' or 'BGR').

    Only BGR -> HSV and HSV -> BGR are supported; any other pairing raises
    NotImplementedError when the transform is called.
    """

    def __init__(self, current='BGR', transform='HSV'):
        self.transform = transform
        self.current = current

    def __call__(self, image, masks=None, boxes=None, labels=None):
        conversion = (self.current, self.transform)
        if conversion == ('BGR', 'HSV'):
            image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        elif conversion == ('HSV', 'BGR'):
            image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR)
        else:
            raise NotImplementedError
        return image, masks, boxes, labels
class RandomContrast(object):
    """With probability 1/2, multiplies the whole image in place by a factor
    drawn uniformly from [lower, upper].

    Args:
        lower (float): smallest factor, must be non-negative.
        upper (float): largest factor, must be >= lower.
    """

    def __init__(self, lower=0.5, upper=1.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "contrast upper must be >= lower."
        assert self.lower >= 0, "contrast lower must be non-negative."

    def __call__(self, image, masks=None, boxes=None, labels=None):
        # expects float image
        if not random.randint(2):
            return image, masks, boxes, labels
        gain = random.uniform(self.lower, self.upper)
        image *= gain
        return image, masks, boxes, labels
class RandomBrightness(object):
    """With probability 1/2, adds a constant drawn uniformly from
    [-delta, delta] to the whole image in place.

    Args:
        delta (float): maximum absolute offset, between 0 and 255.
    """

    def __init__(self, delta=32):
        assert delta >= 0.0
        assert delta <= 255.0
        self.delta = delta

    def __call__(self, image, masks=None, boxes=None, labels=None):
        if not random.randint(2):
            return image, masks, boxes, labels
        offset = random.uniform(-self.delta, self.delta)
        image += offset
        return image, masks, boxes, labels
class ToCV2Image(object):
    """Converts a channel-first tensor into a float32 numpy array with the
    first axis moved last (axes reordered as (1, 2, 0))."""

    def __call__(self, tensor, masks=None, boxes=None, labels=None):
        channel_first = tensor.cpu().numpy().astype(np.float32)
        channel_last = channel_first.transpose((1, 2, 0))
        return channel_last, masks, boxes, labels
class ToTensor(object):
    """Converts a channel-last numpy image into a float32 torch tensor with
    the last axis moved first (permuted as (2, 0, 1))."""

    def __call__(self, cvimage, masks=None, boxes=None, labels=None):
        as_float = cvimage.astype(np.float32)
        channel_first = torch.from_numpy(as_float).permute(2, 0, 1)
        return channel_first, masks, boxes, labels
class RandomSampleCrop(object):
    """Randomly crops the image, keeping only the objects whose box centre
    falls inside the crop.

    Arguments (of __call__):
        image: array indexed as (height, width, channels)
        masks: array indexed as (object, height, width)
        boxes: array of (x1, y1, x2, y2) boxes, one row per object, in the
            same pixel units as the image
        labels: dict with a per-object 'labels' array and a 'num_crowds'
            count; the last num_crowds objects are treated as crowds
    Return:
        (image, masks, boxes, labels)
            image: the cropped image
            masks: masks of the kept objects, cropped like the image
            boxes: kept boxes, clipped to the crop and shifted to its origin
            labels: the same dict, updated in place for the kept objects

    The inputs are returned untouched when the "no crop" mode is drawn or
    when there are no boxes to crop around.
    """

    def __init__(self):
        # Each entry is either None (leave the sample alone) or a
        # (min_iou, max_iou) pair where None means "unbounded".
        self.sample_options = (
            None,
            (0.1, None),
            (0.3, None),
            (0.7, None),
            (0.9, None),
            (None, None),
        )

    def __call__(self, image, masks, boxes=None, labels=None):
        # Without boxes there is nothing to anchor a crop on, and the overlap
        # statistics below are undefined for an empty array.
        if boxes is None or len(boxes) == 0:
            return image, masks, boxes, labels

        height, width, _ = image.shape
        while True:
            # Draw by index: random.choice() would first convert the ragged
            # options tuple to an array, which NumPy >= 1.24 rejects.
            mode = self.sample_options[random.randint(len(self.sample_options))]
            if mode is None:
                return image, masks, boxes, labels

            min_iou, max_iou = mode
            if min_iou is None:
                min_iou = float('-inf')
            if max_iou is None:
                max_iou = float('inf')

            # Up to 50 attempts per mode before drawing a new mode.
            for _ in range(50):
                w = random.uniform(0.3 * width, width)
                h = random.uniform(0.3 * height, height)

                # Keep the crop's aspect ratio within [0.5, 2].
                if h / w < 0.5 or h / w > 2:
                    continue

                # Sample the top-left corner so the crop stays inside the
                # image. (A single-argument uniform() call would set `low`,
                # not the upper bound.)
                left = random.uniform(0, width - w)
                top = random.uniform(0, height - h)

                # Crop rectangle as integer x1, y1, x2, y2.
                rect = np.array([int(left), int(top), int(left + w), int(top + h)])

                overlap = jaccard_numpy(boxes, rect)

                # NOTE(review): every option leaves max_iou unbounded, so this
                # test can never be true and min_iou is effectively ignored.
                # Left as is because changing it alters the crop distribution.
                if overlap.min() < min_iou and max_iou < overlap.max():
                    continue

                current_image = image[rect[1]:rect[3], rect[0]:rect[2], :]

                # Keep the objects whose centre lies strictly inside the crop.
                centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0
                m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1])
                m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1])
                mask = m1 * m2

                # The last num_crowds entries are the crowd annotations.
                num_crowds = labels['num_crowds']
                crowd_mask = np.zeros(mask.shape, dtype=np.int32)
                if num_crowds > 0:
                    crowd_mask[-num_crowds:] = 1

                # Retry unless at least one non-crowd object survives.
                if not mask.any() or np.sum(1 - crowd_mask[mask]) == 0:
                    continue

                current_masks = masks[mask, :, :].copy()
                current_boxes = boxes[mask, :].copy()

                # The labels dict is updated in place and returned.
                labels['labels'] = labels['labels'][mask]
                current_labels = labels
                if num_crowds > 0:
                    labels['num_crowds'] = np.sum(crowd_mask[mask])

                # Clip the boxes to the crop, then shift them to its origin.
                current_boxes[:, :2] = np.maximum(current_boxes[:, :2], rect[:2])
                current_boxes[:, :2] -= rect[:2]
                current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], rect[2:])
                current_boxes[:, 2:] -= rect[:2]

                current_masks = current_masks[:, rect[1]:rect[3], rect[0]:rect[2]]

                return current_image, current_masks, current_boxes, current_labels
class Expand(object):
    """With probability 1/2, places the image at a random position on a
    larger canvas (1x to 4x per side) filled with `mean`; masks are placed
    on a zero canvas the same way and boxes are shifted by the offset.

    Args:
        mean: fill value for the canvas, broadcast over the image.
    """

    def __init__(self, mean):
        self.mean = mean

    def __call__(self, image, masks, boxes, labels):
        if random.randint(2):
            return image, masks, boxes, labels

        height, width, depth = image.shape
        ratio = random.uniform(1, 4)
        left = random.uniform(0, width*ratio - width)
        top = random.uniform(0, height*ratio - height)

        canvas_h, canvas_w = int(height*ratio), int(width*ratio)
        rows = slice(int(top), int(top + height))
        cols = slice(int(left), int(left + width))

        canvas = np.zeros((canvas_h, canvas_w, depth), dtype=image.dtype)
        canvas[:, :, :] = self.mean
        canvas[rows, cols] = image

        mask_canvas = np.zeros((masks.shape[0], canvas_h, canvas_w),
                               dtype=masks.dtype)
        mask_canvas[:, rows, cols] = masks

        # Shift a copy so the caller's boxes stay untouched.
        offset = (int(left), int(top))
        shifted = boxes.copy()
        shifted[:, :2] += offset
        shifted[:, 2:] += offset

        return canvas, mask_canvas, shifted, labels
class RandomMirror(object):
    """With probability 1/2, mirrors the image and masks left-to-right and
    reflects the boxes' x coordinates about the image width."""

    def __call__(self, image, masks, boxes, labels):
        _, width, _ = image.shape
        if not random.randint(2):
            return image, masks, boxes, labels

        mirrored_image = image[:, ::-1]
        mirrored_masks = masks[:, :, ::-1]
        mirrored_boxes = boxes.copy()
        # Columns (x2, x1) are read in reverse so that new x1 = width - x2
        # and new x2 = width - x1.
        mirrored_boxes[:, 0::2] = width - mirrored_boxes[:, 2::-2]
        return mirrored_image, mirrored_masks, mirrored_boxes, labels
class RandomFlip(object):
    """With probability 1/2, flips the image and masks top-to-bottom and
    reflects the boxes' y coordinates about the image height."""

    def __call__(self, image, masks, boxes, labels):
        height, _, _ = image.shape
        if not random.randint(2):
            return image, masks, boxes, labels

        flipped_image = image[::-1, :]
        flipped_masks = masks[:, ::-1, :]
        flipped_boxes = boxes.copy()
        # Columns (y2, y1) are read in reverse so that new y1 = height - y2
        # and new y2 = height - y1.
        flipped_boxes[:, 1::2] = height - flipped_boxes[:, 3::-2]
        return flipped_image, flipped_masks, flipped_boxes, labels
class RandomRot90(object):
    """Rotates the image, masks and boxes by k * 90 degrees, with k drawn
    uniformly from {0, 1, 2, 3} (np.rot90 direction).

    Boxes are (x1, y1, x2, y2) rows treated as inclusive pixel indices: a
    quarter turn maps x to `old_width - 1 - x`.

    Empty inputs keep their rank: zero boxes stay shaped (0, 4) and zero
    masks stay three-dimensional, and the boxes keep their dtype.
    """

    def __call__(self, image, masks, boxes, labels):
        old_height, old_width, _ = image.shape
        k = random.randint(4)

        image = np.rot90(image, k)
        # Rotate every mask in its own (row, column) plane in one call; this
        # also works when there are no masks at all.
        masks = np.ascontiguousarray(np.rot90(masks, k, axes=(1, 2)))

        boxes = boxes.copy()
        for _ in range(k):
            # One quarter turn: the old y range becomes the new x range and
            # the reflected old x range becomes the new y range.
            boxes = np.stack(
                [
                    boxes[:, 1],
                    old_width - 1 - boxes[:, 2],
                    boxes[:, 3],
                    old_width - 1 - boxes[:, 0],
                ],
                axis=1,
            )
            # Width and height swap after every quarter turn.
            old_width, old_height = old_height, old_width

        return image, masks, boxes, labels
class SwapChannels(object):
    """Reorders the last axis of an image according to the swap tuple.
    Args:
        swaps (int triple): final order of channels
            eg: (2, 1, 0)
    """

    def __init__(self, swaps):
        self.swaps = swaps

    def __call__(self, image):
        """
        Args:
            image: image whose third axis holds the channels
        Return:
            the image with its channels picked in the order given by swaps
        """
        reordered = image[:, :, self.swaps]
        return reordered
class PhotometricDistort(object):
    """Applies random brightness, then a random colour distortion chain, to
    a copy of the image.

    The chain is: contrast, BGR->HSV, saturation, hue, HSV->BGR, contrast.
    Each call randomly drops either the trailing or the leading contrast
    step, so contrast is applied on one side of the HSV round trip only.
    """

    def __init__(self):
        self.pd = [
            RandomContrast(),
            ConvertColor(transform='HSV'),
            RandomSaturation(),
            RandomHue(),
            ConvertColor(current='HSV', transform='BGR'),
            RandomContrast()
        ]
        self.rand_brightness = RandomBrightness()
        self.rand_light_noise = RandomLightingNoise()

    def __call__(self, image, masks, boxes, labels):
        # Work on a copy: the photometric steps modify the image in place.
        sample = self.rand_brightness(image.copy(), masks, boxes, labels)
        stages = self.pd[:-1] if random.randint(2) else self.pd[1:]
        sample = Compose(stages)(*sample)
        return self.rand_light_noise(*sample)
class PrepareMasks(object):
    """
    Prepares the gt masks for use_gt_bboxes by cropping with the gt box
    and downsampling the resulting mask to mask_size, mask_size. Each result
    is flattened to one row and binarized at 0.5. This does nothing when
    use_gt_bboxes is False.
    """

    def __init__(self, mask_size, use_gt_bboxes):
        self.mask_size = mask_size
        self.use_gt_bboxes = use_gt_bboxes

    def __call__(self, image, masks, boxes, labels=None):
        if not self.use_gt_bboxes:
            return image, masks, boxes, labels

        height, width, _ = image.shape
        side = self.mask_size
        new_masks = np.zeros((masks.shape[0], side ** 2))

        for idx, mask in enumerate(masks):
            x1, y1, x2, y2 = boxes[idx, :]
            # Boxes are scaled by the image size here, so they are expected
            # in relative coordinates.
            x1, x2 = int(x1 * width), int(x2 * width)
            y1, y2 = int(y1 * height), int(y2 * height)

            # The crop includes the box's far edge.
            cropped_mask = mask[y1:(y2 + 1), x1:(x2 + 1)]
            scaled_mask = cv2.resize(cropped_mask, (side, side))
            new_masks[idx, :] = scaled_mask.reshape(1, -1)

        # Binarize
        new_masks[new_masks > 0.5] = 1
        new_masks[new_masks <= 0.5] = 0

        return image, new_masks, boxes, labels
class BackboneTransform(object):
    """
    Transforms a BGR image made of floats in the range [0, 255] to whatever
    input the current backbone network needs.

    transform is a transform config object (see config.py); this class reads
    its normalize, subtract_means, to_float and channel_order attributes.
    in_channel_order is the channel order of the incoming image, e.g. 'BGR'.
    """

    def __init__(self, transform, mean, std, in_channel_order):
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        self.transform = transform

        # Map every input channel letter to its index, then list those
        # indices in the order the backbone wants them.
        self.channel_map = {}
        for position, channel in enumerate(in_channel_order):
            self.channel_map[channel] = position
        self.channel_permutation = [self.channel_map[channel]
                                    for channel in transform.channel_order]

    def __call__(self, img, masks=None, boxes=None, labels=None):
        settings = self.transform
        img = img.astype(np.float32)

        # Only the first enabled option is applied.
        if settings.normalize:
            img = (img - self.mean) / self.std
        elif settings.subtract_means:
            img = (img - self.mean)
        elif settings.to_float:
            img = img / 255

        reordered = img[:, :, self.channel_permutation]
        return reordered.astype(np.float32), masks, boxes, labels
class BaseTransform(object):
    """ Transform to be used when evaluating: float conversion, resize
    without touching the ground-truth masks, then backbone preprocessing. """

    def __init__(self, mean=MEANS, std=STD):
        eval_steps = [
            ConvertFromInts(),
            Resize(resize_gt=False),
            BackboneTransform(cfg.backbone.transform, mean, std, 'BGR')
        ]
        self.augment = Compose(eval_steps)

    def __call__(self, img, masks=None, boxes=None, labels=None):
        pipeline = self.augment
        return pipeline(img, masks, boxes, labels)
import torch.nn.functional as F
class FastBaseTransform(torch.nn.Module):
    """
    Transform that does all operations on the GPU for super speed.
    This doesn't support a lot of config settings and should only be used for production.
    Maintain this as necessary.

    forward() takes a batch indexed as (batch, height, width, channel) and
    returns it resized, normalised and indexed as (batch, channel, height,
    width) with the channel axis reversed. Only a backbone channel_order of
    'RGB' is supported; anything else raises NotImplementedError.
    """

    def __init__(self):
        super().__init__()

        # Prefer the GPU but fall back to the CPU so the module can still be
        # built on hosts without CUDA; forward() moves both tensors to the
        # input's device anyway.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Shaped (1, C, 1, 1) to broadcast over channel-first batches.
        self.mean = torch.Tensor(MEANS).float().to(device)[None, :, None, None]
        self.std = torch.Tensor(STD).float().to(device)[None, :, None, None]
        self.transform = cfg.backbone.transform

    def forward(self, img):
        # Reject unsupported configurations before doing any work.
        if self.transform.channel_order != 'RGB':
            raise NotImplementedError

        # Keep the statistics on the same device as the input.
        self.mean = self.mean.to(img.device)
        self.std = self.std.to(img.device)

        # img assumed to be a pytorch BGR image with channel order [n, h, w, c]
        if cfg.preserve_aspect_ratio:
            _, h, w, _ = img.size()
            img_size = Resize.calc_size_preserve_ar(w, h, cfg.max_size)
            img_size = (img_size[1], img_size[0])  # F.interpolate wants (h, w)
        else:
            img_size = (cfg.max_size, cfg.max_size)

        img = img.permute(0, 3, 1, 2).contiguous()
        img = F.interpolate(img, img_size, mode='bilinear', align_corners=False)

        # Only the first enabled option is applied.
        if self.transform.normalize:
            img = (img - self.mean) / self.std
        elif self.transform.subtract_means:
            img = (img - self.mean)
        elif self.transform.to_float:
            img = img / 255

        # Reverse the channel axis (BGR -> RGB for a BGR input).
        img = img[:, (2, 1, 0), :, :].contiguous()

        return img
def do_nothing(img=None, masks=None, boxes=None, labels=None):
    """Identity transform: returns its four arguments unchanged as a tuple."""
    passthrough = (img, masks, boxes, labels)
    return passthrough
def enable_if(condition, obj):
    """Return obj when condition is truthy, otherwise the do_nothing
    identity transform."""
    if condition:
        return obj
    return do_nothing
class SSDAugmentation(object):
    """ Transform to be used when training.

    Builds the full augmentation pipeline; the optional steps are switched
    on or off by the corresponding cfg flags and replaced by a no-op when
    disabled.
    """

    def __init__(self, mean=MEANS, std=STD):
        pipeline = [
            ConvertFromInts(),
            ToAbsoluteCoords(),
            enable_if(cfg.augment_photometric_distort, PhotometricDistort()),
            enable_if(cfg.augment_expand, Expand(mean)),
            enable_if(cfg.augment_random_sample_crop, RandomSampleCrop()),
            enable_if(cfg.augment_random_mirror, RandomMirror()),
            enable_if(cfg.augment_random_flip, RandomFlip()),
            # NOTE(review): RandomRot90 is gated by the same flag as
            # RandomFlip; confirm whether a dedicated rot90 flag was intended.
            enable_if(cfg.augment_random_flip, RandomRot90()),
            Resize(),
            # Padding is only applied when the resize does not preserve the
            # aspect ratio.
            enable_if(not cfg.preserve_aspect_ratio, Pad(cfg.max_size, cfg.max_size, mean)),
            ToPercentCoords(),
            PrepareMasks(cfg.mask_size, cfg.use_gt_bboxes),
            BackboneTransform(cfg.backbone.transform, mean, std, 'BGR')
        ]
        self.augment = Compose(pipeline)

    def __call__(self, img, masks, boxes, labels):
        pipeline = self.augment
        return pipeline(img, masks, boxes, labels)