import datetime
import os
import time

import ffmpeg
import torch
import cv2
import numpy as np
from multiprocessing import Process, Manager
from threading import Thread
from read_data import LoadImages, LoadStreams
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
import torchvision

from PIL import Image, ImageDraw, ImageFont


class YOLO_Segment():
    time_reference = datetime.datetime.now()
    counter_frame = 0
    processed_fps = 0

    def __init__(self, video_path=None):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = torch.load('weight/segment/yolov5s-seg.pt', map_location=self.device)['model'].float().fuse()
        self.classes = self.model.names

        self.frame = [None]

        if video_path is not None:
            self.video_name = video_path
        else:
            self.video_name = 'vid2.mp4'  # A default video file

        self.dataset = LoadImages(self.video_name)

        self.names = self.model.names

    def use_webcam(self, source):
        # Switch the input pipeline from a video file to a live stream
        # self.dataset.release() # Release any existing video capture
        # self.cap = cv2.VideoCapture(0) # Open default webcam
        # print('use_webcam')
        self.imgsz = 640
        cudnn.benchmark = True
        self.dataset = LoadStreams(source, img_size=self.imgsz)

    def class_to_label(self, x):
        return self.classes[int(x)]

    def get_frame(self):

        colors = Colors()

        for im0s in self.dataset:
            # print(self.dataset.mode)
            # print(self.dataset)
            if self.dataset.mode == 'stream':
                image = im0s[0].copy()
            else:
                image = im0s.copy()
            img = image[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, HWC to CHW

            img0 = img.copy()

            img = torch.tensor(img0)

            img = img.float()  # uint8 to fp16/32
            img /= 255.0  # 0 - 255 to 0.0 - 1.0
            if img.ndimension() == 3:
                img = img.unsqueeze(0)
            img = img.to(self.device)
            self.model.to(self.device)
            pred, proto = self.model(img)[:2]

            pred = non_max_suppression(pred, conf_thres=0.25, iou_thres=0.45, max_det=1000, nm=32)

            txt = ""  # detection summary; stays empty if nothing is detected
            for i, det in enumerate(pred):  # per image
                annotator = Annotator(image, line_width=3, example=str(self.names))

                if len(det):
                    masks = process_mask(proto[i], det[:, 6:], det[:, :4], img.shape[2:], upsample=True)  # HWC
                    det[:, :4] = scale_boxes(img.shape[2:], det[:, :4], image.shape).round()  # rescale boxes to original frame size
                    segments = reversed(masks2segments(masks))
                    segments = [scale_segments(img.shape[2:], x, image.shape, normalize=True) for x in segments]

                    # Summarize results per class
                    for c in det[:, 5].unique():
                        n = (det[:, 5] == c).sum()  # detections per class
                        txt += f"{n} {self.classes[int(c)]}{'s' * (n > 1)}, "  # add to string

                    annotator.masks(masks,
                                    colors=[colors(x, True) for x in det[:, 5]],
                                    im_gpu=img[i])

                im0 = annotator.result()

            # Encode the annotated frame as JPEG and return it with the detection summary
            ret, jpeg = cv2.imencode(".jpg", im0)

            return jpeg.tobytes(), txt


class Colors:
    # Ultralytics color palette https://ultralytics.com/
    def __init__(self):
        # hex = matplotlib.colors.TABLEAU_COLORS.values()
        hexs = ('FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231', '48F90A', '92CC17', '3DDB86', '1A9334', '00D4BB',
                '2C99A8', '00C2FF', '344593', '6473FF', '0018EC', '8438FF', '520085', 'CB38FF', 'FF95C8', 'FF37C7')
        self.palette = [self.hex2rgb(f'#{c}') for c in hexs]
        self.n = len(self.palette)

    def __call__(self, i, bgr=False):
        c = self.palette[int(i) % self.n]
        return (c[2], c[1], c[0]) if bgr else c

    @staticmethod
    def hex2rgb(h):  # rgb order (PIL)
        return tuple(int(h[1 + i:1 + i + 2], 16) for i in (0, 2, 4))

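# Usage note (illustrative, not part of the original pipeline): Colors maps an
# arbitrary class index onto the fixed 20-colour palette above, e.g.
#   palette = Colors()
#   palette(0)        # -> (255, 56, 56)  RGB for hex 'FF3838'
#   palette(0, True)  # -> (56, 56, 255)  same colour in BGR order for OpenCV drawing
#   palette(23)       # -> wraps around via modulo, same as palette(3)
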
class Annotator:
    # YOLOv5 Annotator for train/val mosaics and jpgs and detect/hub inference annotations
    def __init__(self, im, line_width=None, font_size=None, font='Arial.ttf', pil=False, example='abc'):
        assert im.data.contiguous, 'Image not contiguous. Apply np.ascontiguousarray(im) to Annotator() input images.'
        non_ascii = not is_ascii(example)  # non-latin labels, i.e. asian, arabic, cyrillic
        self.pil = pil or non_ascii
        if self.pil:  # use PIL
            self.im = im if isinstance(im, Image.Image) else Image.fromarray(im)
            self.draw = ImageDraw.Draw(self.im)
            try:
                # text() expects a PIL font object, not a filename string
                self.font = ImageFont.truetype('Arial.Unicode.ttf',
                                               font_size or max(round(sum(self.im.size) / 2 * 0.035), 12))
            except OSError:
                self.font = ImageFont.load_default()  # fall back when the TTF file is unavailable
        else:  # use cv2
            self.im = im
            self.lw = line_width or max(round(sum(im.shape) / 2 * 0.003), 2)  # line width

    def masks(self, masks, colors, im_gpu, alpha=0.5):
        """Plot masks at once.
        Args:
            masks (tensor): predicted masks on cuda, shape: [n, h, w]
            colors (List[List[Int]]): colors for predicted masks, [[r, g, b] * n]
            im_gpu (tensor): img is in cuda, shape: [3, h, w], range: [0, 1]
            alpha (float): mask transparency: 0.0 fully transparent, 1.0 opaque
        """
        if self.pil:
            # convert to numpy first
            self.im = np.asarray(self.im).copy()
        if im_gpu is None:
            # Add multiple masks of shape(h,w,n) with colors list([r,g,b], [r,g,b], ...)
            if len(masks) == 0:
                return
            if isinstance(masks, torch.Tensor):
                masks = torch.as_tensor(masks, dtype=torch.uint8)
                masks = masks.permute(1, 2, 0).contiguous()
                masks = masks.cpu().numpy()
            # masks = np.ascontiguousarray(masks.transpose(1, 2, 0))
            masks = scale_image(masks.shape[:2], masks, self.im.shape)
            masks = np.asarray(masks, dtype=np.float32)
            colors = np.asarray(colors, dtype=np.float32)  # shape(n,3)
            s = masks.sum(2, keepdims=True).clip(0, 1)  # add all masks together
            masks = (masks @ colors).clip(0, 255)  # (h,w,n) @ (n,3) = (h,w,3)
            self.im[:] = masks * alpha + self.im * (1 - s * alpha)
        else:
            if len(masks) == 0:
                self.im[:] = im_gpu.permute(1, 2, 0).contiguous().cpu().numpy() * 255
            colors = torch.tensor(colors, device=im_gpu.device, dtype=torch.float32) / 255.0
            colors = colors[:, None, None]  # shape(n,1,1,3)
            masks = masks.unsqueeze(3)  # shape(n,h,w,1)
            masks_color = masks * (colors * alpha)  # shape(n,h,w,3)

            inv_alph_masks = (1 - masks * alpha).cumprod(0)  # shape(n,h,w,1)
            mcs = (masks_color * inv_alph_masks).sum(0) * 2  # mask color summand shape(n,h,w,3)

            im_gpu = im_gpu.flip(dims=[0])  # flip channel
            im_gpu = im_gpu.permute(1, 2, 0).contiguous()  # shape(h,w,3)
            im_gpu = im_gpu * inv_alph_masks[-1] + mcs
            im_mask = (im_gpu * 255).byte().cpu().numpy()
            # print(type(im_gpu), type(im_mask), type(self.im.shape))
            self.im[:] = scale_image(im_gpu.shape, im_mask, self.im.shape)
        if self.pil:
            # convert im back to PIL and update draw
            self.fromarray(self.im)

    def rectangle(self, xy, fill=None, outline=None, width=1):
        # Add rectangle to image (PIL-only)
        self.draw.rectangle(xy, fill, outline, width)

    def text(self, xy, text, txt_color=(255, 255, 255), anchor='top'):
        # Add text to image (PIL-only)
        if anchor == 'bottom':  # start y from font bottom
            w, h = self.font.getsize(text)  # text width, height
            xy[1] += 1 - h
        self.draw.text(xy, text, fill=txt_color, font=self.font)

    def fromarray(self, im):
        # Update self.im from a numpy array
        self.im = im if isinstance(im, Image.Image) else Image.fromarray(im)
        self.draw = ImageDraw.Draw(self.im)

    def result(self):
        # Return annotated image as array
        return np.asarray(self.im)


def time_synchronized():
    # pytorch-accurate time
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.time()


def is_ascii(s=''):
    # Is string composed of all ASCII (no UTF) characters? (note str().isascii() introduced in python 3.7)
    s = str(s)  # convert list, tuple, None, etc. to str
    return len(s.encode().decode('ascii', 'ignore')) == len(s)


def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None):
    # Rescale boxes (xyxy) from img1_shape to img0_shape
    if ratio_pad is None:  # calculate from img0_shape
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = old / new
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    boxes[..., [0, 2]] -= pad[0]  # x padding
    boxes[..., [1, 3]] -= pad[1]  # y padding
    boxes[..., :4] /= gain
    clip_boxes(boxes, img0_shape)
    return boxes


def scale_segments(img1_shape, segments, img0_shape, ratio_pad=None, normalize=False):
    # Rescale coords (xyxy) from img1_shape to img0_shape
    if ratio_pad is None:  # calculate from img0_shape
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = old / new
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    segments[:, 0] -= pad[0]  # x padding
    segments[:, 1] -= pad[1]  # y padding
    segments /= gain
    clip_segments(segments, img0_shape)
    if normalize:
        segments[:, 0] /= img0_shape[1]  # width
        segments[:, 1] /= img0_shape[0]  # height
    return segments


def clip_boxes(boxes, shape):
    # Clip boxes (xyxy) to image shape (height, width)
    if isinstance(boxes, torch.Tensor):  # faster individually
        boxes[..., 0].clamp_(0, shape[1])  # x1
        boxes[..., 1].clamp_(0, shape[0])  # y1
        boxes[..., 2].clamp_(0, shape[1])  # x2
        boxes[..., 3].clamp_(0, shape[0])  # y2
    else:  # np.array (faster grouped)
        boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1])  # x1, x2
        boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0])  # y1, y2


def clip_segments(segments, shape):
    # Clip segments (xy1,xy2,...) to image shape (height, width)
    if isinstance(segments, torch.Tensor):  # faster individually
        segments[:, 0].clamp_(0, shape[1])  # x
        segments[:, 1].clamp_(0, shape[0])  # y
    else:  # np.array (faster grouped)
        segments[:, 0] = segments[:, 0].clip(0, shape[1])  # x
        segments[:, 1] = segments[:, 1].clip(0, shape[0])  # y


def masks2segments(masks, strategy='largest'):
    # Convert masks(n,160,160) into segments(n,xy)
    segments = []
    for x in masks.int().cpu().numpy().astype('uint8'):
        c = cv2.findContours(x, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
        if c:
            if strategy == 'concat':  # concatenate all segments
                c = np.concatenate([x.reshape(-1, 2) for x in c])
            elif strategy == 'largest':  # select largest segment
                c = np.array(c[np.array([len(x) for x in c]).argmax()]).reshape(-1, 2)
        else:
            c = np.zeros((0, 2))  # no segments found
        segments.append(c.astype('float32'))
    return segments


def process_mask(protos, masks_in, bboxes, shape, upsample=False):
    """
    Crop before upsample.
    proto_out: [mask_dim, mask_h, mask_w]
    out_masks: [n, mask_dim], n is number of masks after nms
    bboxes: [n, 4], n is number of masks after nms
    shape: input_image_size, (h, w)

    return: h, w, n
    """
    c, mh, mw = protos.shape  # CHW
    ih, iw = shape
    # print(masks_in.shape, protos.shape)
    masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw)  # CHW

    downsampled_bboxes = bboxes.clone()
    downsampled_bboxes[:, 0] *= mw / iw
    downsampled_bboxes[:, 2] *= mw / iw
    downsampled_bboxes[:, 3] *= mh / ih
    downsampled_bboxes[:, 1] *= mh / ih

    masks = crop_mask(masks, downsampled_bboxes)  # CHW
    if upsample:
        masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0]  # CHW
    return masks.gt_(0.5)


def crop_mask(masks, boxes):
    """
    "Crop" predicted masks by zeroing out everything not in the predicted bbox.
    Vectorized by Chong (thanks Chong).

    Args:
        - masks should be a size [h, w, n] tensor of masks
        - boxes should be a size [n, 4] tensor of bbox coords in relative point form
    """
    n, h, w = masks.shape
    x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1)  # x1 shape(1,1,n)
    r = torch.arange(w, device=masks.device, dtype=x1.dtype)[None, None, :]  # rows shape(1,w,1)
    c = torch.arange(h, device=masks.device, dtype=x1.dtype)[None, :, None]  # cols shape(h,1,1)

    return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))


def scale_image(im1_shape, masks, im0_shape, ratio_pad=None):
    """
    im1_shape: model input shape, [h, w]
    im0_shape: origin pic shape, [h, w, 3]
    masks: [h, w, num]
    """
    # Rescale coordinates (xyxy) from im1_shape to im0_shape
    if ratio_pad is None:  # calculate from im0_shape
        gain = min(im1_shape[0] / im0_shape[0], im1_shape[1] / im0_shape[1])  # gain = old / new
        pad = (im1_shape[1] - im0_shape[1] * gain) / 2, (im1_shape[0] - im0_shape[0] * gain) / 2  # wh padding
    else:
        pad = ratio_pad[1]
    top, left = int(pad[1]), int(pad[0])  # y, x
    bottom, right = int(im1_shape[0] - pad[1]), int(im1_shape[1] - pad[0])

    if len(masks.shape) < 2:
        raise ValueError(f'"len of masks shape" should be 2 or 3, but got {len(masks.shape)}')
    masks = masks[top:bottom, left:right]
    # masks = masks.permute(2, 0, 1).contiguous()
    # masks = F.interpolate(masks[None], im0_shape[:2], mode='bilinear', align_corners=False)[0]
    # masks = masks.permute(1, 2, 0).contiguous()

    masks = cv2.resize(masks, (im0_shape[1], im0_shape[0]))

    if len(masks.shape) == 2:
        masks = masks[:, :, None]
    return masks


def xywh2xyxy(x):
    # Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
    y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
    return y

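# Worked example (illustrative): a centre-format box [x, y, w, h] = [10, 10, 4, 6]
# becomes [x1, y1, x2, y2] = [8, 7, 12, 13], i.e. corners at the centre +/- half the size.

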
def non_max_suppression(
        prediction,
        conf_thres=0.25,
        iou_thres=0.45,
        classes=None,
        agnostic=False,
        multi_label=False,
        labels=(),
        max_det=300,
        nm=0,  # number of masks
):
    """Non-Maximum Suppression (NMS) on inference results to reject overlapping detections

    Returns:
        list of detections, one (n,6) tensor per image [xyxy, conf, cls]
    """
    if isinstance(prediction, (list, tuple)):  # YOLOv5 model in validation mode, output = (inference_out, loss_out)
        prediction = prediction[0]  # select only inference output

    device = prediction.device
    mps = 'mps' in device.type  # Apple MPS
    if mps:  # MPS not fully supported yet, convert tensors to CPU before NMS
        prediction = prediction.cpu()
    bs = prediction.shape[0]  # batch size
    nc = prediction.shape[2] - nm - 5  # number of classes
    xc = prediction[..., 4] > conf_thres  # candidates

    # Checks
    assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
    assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'

    # Settings
    # min_wh = 2  # (pixels) minimum box width and height
    max_wh = 7680  # (pixels) maximum box width and height
    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
    time_limit = 0.5 + 0.05 * bs  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
    merge = False  # use merge-NMS

    t = time.time()
    mi = 5 + nc  # mask start index
    output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
        x = x[xc[xi]]  # confidence

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            lb = labels[xi]
            v = torch.zeros((len(lb), nc + nm + 5), device=x.device)
            v[:, :4] = lb[:, 1:5]  # box
            v[:, 4] = 1.0  # conf
            v[range(len(lb)), lb[:, 0].long() + 5] = 1.0  # cls
            x = torch.cat((x, v), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box/Mask
        box = xywh2xyxy(x[:, :4])  # (center_x, center_y, width, height) to (x1, y1, x2, y2)
        mask = x[:, mi:]  # zero columns if no masks

        # Detections matrix nx6 (xyxy, conf, cls)
        if multi_label:
            i, j = (x[:, 5:mi] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[i], x[i, 5 + j, None], j[:, None].float(), mask[i]), 1)
        else:  # best class only
            conf, j = x[:, 5:mi].max(1, keepdim=True)
            x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # Apply finite constraint
        # if not torch.isfinite(x).all():
        #     x = x[torch.isfinite(x).all(1)]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        elif n > max_nms:  # excess boxes
            x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence
        else:
            x = x[x[:, 4].argsort(descending=True)]  # sort by confidence

        # Batched NMS
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        if i.shape[0] > max_det:  # limit detections
            i = i[:max_det]
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        if mps:
            output[xi] = output[xi].to(device)

    return output

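# Note on the output layout (as consumed by get_frame above): with nm=32 mask
# coefficients, each row of a returned detection tensor is
#   [x1, y1, x2, y2, conf, cls, m1 ... m32]
# so det[:, :4] are boxes, det[:, 4] confidences, det[:, 5] class ids and
# det[:, 6:] the per-instance mask coefficients passed to process_mask().

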
def box_iou(box1, box2, eps=1e-7):
    # https://github.com/pytorch/vision/blob/master/torchvision/ops/boxes.py
    """
    Return intersection-over-union (Jaccard index) of boxes.
    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
    Arguments:
        box1 (Tensor[N, 4])
        box2 (Tensor[M, 4])
    Returns:
        iou (Tensor[N, M]): the NxM matrix containing the pairwise
            IoU values for every element in boxes1 and boxes2
    """
    # inter(N,M) = (rb(N,M,2) - lt(N,M,2)).clamp(0).prod(2)
    (a1, a2), (b1, b2) = box1.unsqueeze(1).chunk(2, 2), box2.unsqueeze(0).chunk(2, 2)
    inter = (torch.min(a2, b2) - torch.max(a1, b1)).clamp(0).prod(2)

    # IoU = inter / (area1 + area2 - inter)
    return inter / ((a2 - a1).prod(2) + (b2 - b1).prod(2) - inter + eps)
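

# --- Minimal usage sketch (not part of the original module) ---
# Assumes the weight file 'weight/segment/yolov5s-seg.pt' and a readable video
# exist, and that read_data.LoadImages yields frames as used above.
if __name__ == '__main__':
    segmenter = YOLO_Segment(video_path='vid2.mp4')  # the module's default sample video
    jpeg_bytes, summary = segmenter.get_frame()      # first annotated frame + e.g. "2 persons, 1 car, "
    with open('annotated_frame.jpg', 'wb') as f:
        f.write(jpeg_bytes)
    print(summary)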