|
import copy |
|
from pathlib import Path |
|
|
|
import cv2 |
|
import numpy as np |
|
import torch |
|
from torch import torch_version |
|
|
|
from r_facelib.detection.yolov5face.models.common import Conv |
|
from r_facelib.detection.yolov5face.models.yolo import Model |
|
from r_facelib.detection.yolov5face.utils.datasets import letterbox |
|
from r_facelib.detection.yolov5face.utils.general import ( |
|
check_img_size, |
|
non_max_suppression_face, |
|
scale_coords, |
|
scale_coords_landmarks, |
|
) |
|
|
|
# Report the installed torch version once, when this module is imported.
print(f"Torch version: {torch.__version__}")

# Feature flag: True when the installed torch is at least 1.9.0. The detector
# uses it to choose torch.inference_mode() over torch.no_grad().
IS_HIGH_VERSION = torch_version.__version__ >= "1.9.0"
|
|
|
def isListempty(inList):
    """Return True when *inList* is a list holding nothing but (nested) empty lists.

    An empty list counts as empty, and so does a list whose elements are all
    themselves empty by this definition. Anything that is not a ``list``
    (numbers, strings, tuples, ...) is reported as not empty.
    """
    # Guard clause: only real lists can be "empty" in this sense.
    if not isinstance(inList, list):
        return False
    # all() over no elements is True, which covers the plain [] case.
    return all(isListempty(element) for element in inList)
|
|
|
class YoloDetector:
    """Wrapper around a YOLOv5-face ``Model``: preprocessing, inference and postprocessing."""

    def __init__(
        self,
        config_name,
        min_face=10,
        target_size=None,
        device='cuda',
    ):
        """
        config_name: name of .yaml config with network configuration from models/ folder.
        min_face : minimal face size in pixels.
        target_size : target size of smaller image axis (choose lower for faster work). e.g. 480, 720, 1080.
                    None for original resolution.
        device : device the preprocessed input tensors are moved to.
        """
        self._class_path = Path(__file__).parent.absolute()
        self.target_size = target_size
        self.min_face = min_face
        # NOTE: the model is only constructed here; this constructor neither loads
        # weights nor moves the model to ``device``.
        self.detector = Model(cfg=config_name)
        self.device = device

    def _preprocess(self, imgs):
        """
        Preprocessing image before passing through the network. Resize and conversion to torch tensor.

        Params:
            imgs: list of H x W x C image arrays.
        Returns:
            float tensor on ``self.device`` laid out as (batch, C, H, W), scaled by 1/255.
        """
        pp_imgs = []
        for img in imgs:
            h0, w0 = img.shape[:2]
            if self.target_size:
                # Only ever downscale: r < 1 means the smaller side exceeds target_size.
                r = self.target_size / min(h0, w0)
                if r < 1:
                    img = cv2.resize(img, (int(w0 * r), int(h0 * r)), interpolation=cv2.INTER_LINEAR)

            imgsz = check_img_size(max(img.shape[:2]), s=self.detector.stride.max())
            img = letterbox(img, new_shape=imgsz)[0]
            pp_imgs.append(img)
        # Stacking requires every letterboxed image to have the same shape.
        pp_imgs = np.array(pp_imgs)
        # (batch, H, W, C) -> (batch, C, H, W)
        pp_imgs = pp_imgs.transpose(0, 3, 1, 2)
        pp_imgs = torch.from_numpy(pp_imgs).to(self.device)
        pp_imgs = pp_imgs.float()
        return pp_imgs / 255.0

    def _postprocess(self, imgs, origimgs, pred, conf_thres, iou_thres):
        """
        Postprocessing of raw pytorch model output.

        Params:
            imgs: preprocessed batch tensor that was fed to the network.
            origimgs: list of the original images (only their shapes are read).
            pred: raw network prediction for the batch.
            conf_thres: confidence threshold passed to NMS.
            iou_thres: IoU threshold passed to NMS.
        Returns:
            bboxes: per-image list of boxes, each ``[x1, y1, x2, y2]`` as ints in original-image pixels.
            points: per-image list of landmarks, each a list of 5 ``[x, y]`` int pairs.
        """
        bboxes = [[] for _ in range(len(origimgs))]
        landmarks = [[] for _ in range(len(origimgs))]

        pred = non_max_suppression_face(pred, conf_thres, iou_thres)

        for image_id, origimg in enumerate(origimgs):
            img_shape = origimg.shape
            image_height, image_width = img_shape[:2]
            # Normalisation gains in (w, h, w, h, ...) order for boxes and landmarks.
            gn = torch.tensor(img_shape)[[1, 0, 1, 0]]
            gn_lks = torch.tensor(img_shape)[[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]]
            det = pred[image_id].cpu()
            # The return values are discarded, so this relies on the helpers rescaling
            # ``det`` in place (presumably they do -- confirm in utils.general).
            # NOTE: ``.round()`` is not in-place; coordinates end up truncated by int() below.
            scale_coords(imgs[image_id].shape[1:], det[:, :4], img_shape).round()
            scale_coords_landmarks(imgs[image_id].shape[1:], det[:, 5:15], img_shape).round()

            for j in range(det.size()[0]):
                box = (det[j, :4].view(1, 4) / gn).view(-1).tolist()
                box = list(
                    map(int, [box[0] * image_width, box[1] * image_height, box[2] * image_width, box[3] * image_height])
                )
                # Drop faces whose box height is below the configured minimum.
                if box[3] - box[1] < self.min_face:
                    continue
                lm = (det[j, 5:15].view(1, 10) / gn_lks).view(-1).tolist()
                # Even positions are x coordinates, odd positions are y coordinates.
                lm = list(
                    map(
                        int,
                        [
                            value * image_width if idx % 2 == 0 else value * image_height
                            for idx, value in enumerate(lm)
                        ],
                    )
                )
                lm = [lm[i : i + 2] for i in range(0, len(lm), 2)]
                bboxes[image_id].append(box)
                landmarks[image_id].append(lm)
        return bboxes, landmarks

    @staticmethod
    def _pack_detections(bboxes, points):
        """
        Flatten per-image boxes and landmarks into a single (N, 15) array.

        Detections are flattened across images first, so images with different
        numbers of faces (including none) can be packed together.

        Returns None when no image has any detection.
        """
        flat_points = [lm for per_image in points for lm in per_image]
        if not flat_points:
            return None
        flat_boxes = [box for per_image in bboxes for box in per_image]
        boxes_arr = np.array(flat_boxes).reshape(-1, 4)
        points_arr = np.array(flat_points).reshape(-1, 10)
        # Fifth column is a placeholder filled with a copy of x1, not a confidence score.
        padding = boxes_arr[:, 0].reshape(-1, 1)
        return np.concatenate((boxes_arr, padding, points_arr), axis=1)

    def detect_faces(self, imgs, conf_thres=0.7, iou_thres=0.5):
        """
        Get bbox coordinates and keypoints of faces on original image.

        Params:
            imgs: image or list of images to detect faces on with BGR order (convert to RGB order for inference)
            conf_thres: confidence threshold for each prediction
            iou_thres: threshold for NMS (filter of intersecting bboxes)
        Returns:
            None when no face is kept in any image; otherwise an array of shape (N, 15)
            with one row per face across all images (the image index is not preserved):
            columns 0-3 are x1, y1, x2, y2; column 4 is a padding column holding a copy
            of x1; columns 5-14 are the 5 keypoints as x, y pairs.
        """
        images = imgs if isinstance(imgs, list) else [imgs]
        images = [cv2.cvtColor(img, cv2.COLOR_BGR2RGB) for img in images]
        origimgs = copy.deepcopy(images)

        images = self._preprocess(images)

        # inference_mode is only looked up when the installed torch is new enough.
        grad_free = torch.inference_mode if IS_HIGH_VERSION else torch.no_grad
        with grad_free():
            pred = self.detector(images)[0]

        bboxes, points = self._postprocess(images, origimgs, pred, conf_thres, iou_thres)
        return self._pack_detections(bboxes, points)

    def __call__(self, *args, **kwargs):
        """Shorthand for ``detect_faces``; forwards all arguments unchanged."""
        return self.detect_faces(*args, **kwargs)
|
|