import sys sys.path.append('pytorch-yolo-v3') from preprocess import letterbox_image import torch import torch.nn as nn from torch.autograd import Variable import cv2 from util import * import os import os.path as osp from darknet import Darknet import random import itertools import logging logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) class AnimeHeadDetector: def __init__(self, cfgfile, weightsfile): self.CONFIDENCE_THRESHOLD = 0.85 self.NMS_THRESHOLD = 0.4 self.NUM_CLASSES = 2 # hard code here for head detector self.CLASSES = [ 'Head' ] self.CUDA = torch.cuda.is_available() if self.CUDA: logging.info('Using CUDA.') else: logging.info('Using CPU.') logging.info("Loading network.....") self.model = Darknet(cfgfile) self.model.load_weights(weightsfile) self.model.net_info["height"] = 512 # hard code here because we didn't use Spp self.inp_dim = int(self.model.net_info["height"]) if self.CUDA: self.model.cuda() self.model.eval() logging.info("Network successfully loaded.") # Detect the heads in the given image (opencv numpy array), and return the results def detect(self, image): # Preprocess the image w, h = image.shape[1], image.shape[0] img = (letterbox_image(image, (self.inp_dim, self.inp_dim))) img_ = img[:,:,::-1].transpose((2,0,1)).copy() img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0) im_dim_list = torch.FloatTensor([ [w, h] ]).repeat(1,2) # Send to the model for prediction if self.CUDA: img_ = img_.cuda() with torch.no_grad(): prediction = self.model(Variable(img_), self.CUDA) output = write_results(prediction, self.CONFIDENCE_THRESHOLD, self.NUM_CLASSES, nms=True, nms_conf=self.NMS_THRESHOLD) # This function does NMS and converts the format. Returns 0 if no results are found. # output has the format of [ class, cx, cy, w, h, confidence ] if type(output) == int: return None # Convert back to the coordinates in the original image before resizing # We need somewhat complicated processing here because letter boxing was used in preprocessing output = output.detach().cpu() scaling_factor = torch.min(self.inp_dim/im_dim_list,1)[0].view(-1,1) output[:,[1,3]] -= (self.inp_dim - scaling_factor*im_dim_list[:,0].view(-1,1))/2 output[:,[2,4]] -= (self.inp_dim - scaling_factor*im_dim_list[:,1].view(-1,1))/2 output[:,1:5] /= scaling_factor for i in range(output.shape[0]): output[i, [1,3]] = torch.clamp(output[i, [1,3]], 0.0, im_dim_list[0,0]) output[i, [2,4]] = torch.clamp(output[i, [2,4]], 0.0, im_dim_list[0,1]) return [ { 'l': int(output[i, 1].item()), 't': int(output[i, 2].item()), 'r': int(output[i, 3].item()), 'b': int(output[i, 4].item()), 'confidence': output[i, 5].item() } for i in range(output.shape[0]) ] # Detect heads and return the cropped faces as a list of opencv numpy arrays def detectAndCrop(self, image): results = self.detect(image) return [ image[result['t']: result['b'] + 1, result['l']: result['r'] + 1, :] for result in results ] # Detect heads and return the visualized face as a opencv numpy array def detectAndVisualize(self, image): results = self.detect(image) for result in results: cv2.rectangle(image, (result['l'], result['t']), (result['r'], result['b']), (0, 255, 0), 5) return image