"""Script to spot cat faces in videos and draw bounding boxes around them. Expects file 'model.best.tar' to exist (generated by train.py). Writes outputs to outputs/videos/ by default.""" from __future__ import print_function, division import argparse import numpy as np import os from collections import defaultdict from scipy import misc from model import Model2 from common import to_aspect_ratio_add, draw_heatmap, imresize_sidelen from bbs import RectangleOnImage import cv2 import torch from torch.autograd import Variable from skimage import morphology #from sklearn.cluster import DBSCAN import imgaug as ia import time torch.backends.cudnn.benchmark = True WRITE_TO_BASEDIR = "outputs/videos/" GPU = 0 def main(): """Find bounding boxes in a video.""" parser = argparse.ArgumentParser(description="Process a video") parser.add_argument("--video", help="Filepath to the video", required=True) parser.add_argument("--out_dir", help="Directory name in which to save results") parser.add_argument("--start_frame", default=1, type=int, help="Frame number to start at (1 to N).") parser.add_argument("--end_frame", help="Frame number to end at (1 to N).") parser.add_argument("--conf", default=0.5, type=float, help="Confidence threshold for BBs") parser.add_argument("--size", default=400, type=int, help="Input image size when feeding into the model") args = parser.parse_args() # load trained model checkpoint = torch.load("model.best.tar") model = Model2() model.load_state_dict(checkpoint["state_dict"]) if GPU >= 0: model.cuda(GPU) model.eval() del checkpoint # check if video file exists video_fp = args.video assert os.path.isfile(video_fp) # convert video filename to output directory path video_fn = os.path.basename(video_fp) if args.out_dir is not None: write_to_dir = os.path.join(WRITE_TO_BASEDIR, args.out_dir) else: write_to_dir = os.path.join(WRITE_TO_BASEDIR, os.path.splitext(video_fn)[0]) # create output directory if necessary if not os.path.exists(write_to_dir): os.makedirs(write_to_dir) # start reading video vidcap = cv2.VideoCapture(video_fp) success, img = vidcap.read() # img => float 0-255 BGR frame_idx = 0 # forward video, if --start_frame=<int> was used if int(args.start_frame) > 1: while frame_idx+1 < int(args.start_frame): success, img = vidcap.read() # img => float 0-255 BGR frame_idx += 1 # handle frames of video while success: # end if args.end_frame=<int> was used and that frame was reached if args.end_frame is not None and frame_idx >= args.end_frame: break # find BBs in frame time_start = time.time() img_rgb = img[:, :, ::-1] time_model = process_frame(frame_idx, img_rgb, model, write_to_dir, args.conf, input_size=args.size) #debug_frame(frame_idx, img_rgb, model, args.conf, input_size=args.size) time_req = time.time() - time_start # output message and forward to next frame print("Frame %05d in %03dms (model: %03dms)." % (frame_idx, time_req*1000, time_model*1000)) success, img = vidcap.read() frame_idx += 1 def process_frame(frame_idx, img, model, write_to_dir, conf_threshold, input_size=224): """Finds bounding boxes in a video frame, draws these bounding boxes and saves the result to HDD. """ # find BBs in frame bbs, time_model = find_bbs(img, model, conf_threshold, input_size=input_size) # draw BBs img_out = np.copy(img) for (bb, score) in bbs: if score > conf_threshold and bb.width > 2 and bb.height > 2: img_out = bb.draw_on_image(img_out, color=[0, 255, 0], thickness=3) # save to output directory save_to_fp = os.path.join(write_to_dir, "%05d.jpg" % (frame_idx,)) misc.imsave(save_to_fp, img_out) return time_model def find_bbs(img, model, conf_threshold, input_size): """Find bounding boxes in an image.""" # pad image so that its square img_pad, (pad_top, pad_right, pad_bottom, pad_left) = to_aspect_ratio_add(img, 1.0, return_paddings=True) # resize padded image to desired input size # "linear" interpolation seems to be enough here for 400x400 or larger images # change to "area" or "cubic" for marginally better quality img_rs = ia.imresize_single_image(img_pad, (input_size, input_size), interpolation="linear") # convert to torch-ready input variable inputs_np = (np.array([img_rs])/255.0).astype(np.float32).transpose(0, 3, 1, 2) inputs = torch.from_numpy(inputs_np) inputs = Variable(inputs, volatile=True) if GPU >= 0: inputs = inputs.cuda(GPU) # apply model and measure the model's time time_start = time.time() outputs_pred = model(inputs) time_req = time.time() - time_start # process the model's output (i.e. convert heatmaps to BBs) result = ModelResult( outputs_pred, inputs_np, img, (pad_top, pad_right, pad_bottom, pad_left) ) bbs = result.get_bbs() return bbs, time_req class ModelResult(object): """Class the handles the transformation from heatmaps (model output) to bounding boxes.""" def __init__(self, outputs, inputs_np, img, paddings): self.inputs = inputs_np self.outputs = outputs.cpu().data.numpy() assert self.inputs.ndim == 4 assert self.outputs.ndim == 4 assert self.inputs.shape[0] == 1 assert self.outputs.shape[0] == 1 self.img = img self.paddings = paddings self.shrink_depth = 1 self.shrink_threshold = 0.9 self.heatmap_activation_threshold = 0.25 def get_bbs(self): """Convert model outputs to bounding boxes.""" outputs_pred = self.outputs # generate shape of model input image # (=> original image + padding to square it + resize) # note: self.inputs has form NCHW img_pad_rs_shape = ( self.inputs.shape[2], self.inputs.shape[3], 3 ) # generate shape of original image after padding (no resize) img_pad_shape = ( self.img.shape[0] + self.paddings[0] + self.paddings[2], self.img.shape[1] + self.paddings[1] + self.paddings[3], 3 ) # convert heatmaps to rectangles # (old code that uses all heatmaps) """ hm_idx_to_rects = [] for i in range(outputs_pred.shape[1]): hms = self._heatmap_to_rects( outputs_pred[0, i, ...], img_pad_rs_shape ) hms_rev = self._rects_reverse_projection( hms, self.img.shape, img_pad_shape, self.paddings[0], self.paddings[1], self.paddings[2], self.paddings[3] ) hm_idx_to_rects.append(hms_rev) bbs = merge_rects_to_bbs(hm_idx_to_rects, conf_threshold) """ # convert only the first heatmap outputs to BBs # the other heatmaps (top left corner, top center, ...) # are currently ignored hm_idx_to_rects = [] for i in [0]: hms = self._heatmap_to_rects( outputs_pred[0, i, ...], img_pad_rs_shape ) hms_rev = self._rects_reverse_projection( hms, self.img.shape, img_pad_shape, self.paddings[0], self.paddings[1], self.paddings[2], self.paddings[3] ) hm_idx_to_rects.append(hms_rev) bbs = hm_idx_to_rects[0] return bbs def _heatmap_to_rects(self, grid_pred, bb_img): """Convert a heatmap to rectangles / bounding box candidates.""" grid_pred = np.squeeze(grid_pred) # (1, H, W) => (H, W) # remove low activations grid_thresh = grid_pred >= self.heatmap_activation_threshold # find connected components grid_labeled, num_labels = morphology.label( grid_thresh, background=0, connectivity=1, return_num=True ) # for each connected components, # - draw a bounding box around it, # - shrink the bounding box to optimal size # - estimate a score/confidence value bbs = [] for label in range(1, num_labels+1): (yy, xx) = np.nonzero(grid_labeled == label) min_y, max_y = np.min(yy), np.max(yy) min_x, max_x = np.min(xx), np.max(xx) rect = RectangleOnImage(x1=min_x, x2=max_x+1, y1=min_y, y2=max_y+1, shape=grid_labeled) activation = self._rect_to_score(rect, grid_pred) rect_shrunk, activation_shrunk = self._shrink(grid_pred, rect) rect_rs_shrunk = rect_shrunk.on(bb_img) bbs.append((rect_rs_shrunk, activation_shrunk)) return bbs def _shrink(self, heatmap, rect): """Shrink a rectangle to get rid of some low activations. The model often generates areas of high activations, with a few pixels of medium activations on the side. When drawing a bounding box around these activations, the medium ones can force the bounding box to become significantly larger than it should be. This function tries to shrink those bounding boxes, while retaining most of the activation. This function is implemented in a (slow) recursive way. Using dynamic programming would probably be faster. """ assert rect.width >= 1 and rect.height >= 1 #print("shrink...", rect) score_orig = self._rect_to_score(rect, heatmap) candidates = self._shrink_candidates(rect, depth=self.shrink_depth) candidates_scored = [] #print("score..") for candidate in candidates: score = self._rect_to_score(candidate, heatmap) score_rel = score / score_orig if score_rel >= self.shrink_threshold: candidates_scored.append((candidate, score, candidate.area)) #print("sort ", len(candidates_scored)) candidates_scored = sorted(candidates_scored, key=lambda t: t[2]) return (candidates_scored[0][0], candidates_scored[0][1]) def _shrink_candidates(self, rect, depth): """Recursive function called by _shrink() to generate bounding box candidates that are smaller than the input bounding box.""" result = [rect] if depth > 0: if rect.width > 1: rect_left = rect.copy(x1=rect.x1+1) rect_right = rect.copy(x2=rect.x2-1) result.extend(self._shrink_candidates(rect_left, depth=depth-1)) result.extend(self._shrink_candidates(rect_right, depth=depth-1)) if rect.height > 1: rect_top = rect.copy(y1=rect.y1+1) rect_bottom = rect.copy(y2=rect.y2-1) result.extend(self._shrink_candidates(rect_top, depth=depth-1)) result.extend(self._shrink_candidates(rect_bottom, depth=depth-1)) return result def _rects_reverse_projection(self, rects, img_shape, img_pad_shape, pad_top, pad_right, pad_bottom, pad_left): """Input images into the model are padded to make them squared. They are also resized to a smaller size. This function is supposed to remove both effects, i.e. to project the found bounding boxes from the padded and resized image to the unpadded und unresized (original) input image. """ result = [] for (rect, score) in rects: # project from resized padded (squared) image to unresized one rect_large = rect.on(img_pad_shape) # move rectangles to remove paddings rect_large_unpadded = rect_large.shift(top=-pad_top, left=-pad_left) # positions of corners are now correct, so switch underlying shape rect_large_unpadded = rect_large_unpadded.copy(shape=img_shape) result.append((rect_large_unpadded, score)) return result def _rect_to_score(self, rect, heatmap): """Compute a score for a given rectangle (i.e. the confidence value). Currently this is done via an average of the corresponding activations in the heatmap. """ subheatmap = rect.extract_from_image(heatmap) if subheatmap.ndim == 2 and subheatmap.shape[0] > 0 and subheatmap.shape[1] > 0: return np.average(subheatmap) else: print("[WARN] Broken heatmap extracted for rectangle", rect) return 0 # The following stuff is some old code to make use of all generated # heatmaps. Didn't work well in tests. """ def _merge_rects_to_bbs(self, hm_idx_to_rects, conf_threshold, img_shape): rects_full_size = self._make_rects_full_size(hm_idx_to_rects, self.img.shape) groups = self._group_rects(rects_full_size) final_bbs = [] #for label, rects in cluster.iteritems(): for label in groups: rects = groups[label] score_avg = sum([score for (rect, score) in rects]) / (1+9) if score_avg > conf_threshold: x1 = np.average([rect.x1 for (rect, score) in rects]) x2 = np.average([rect.x2 for (rect, score) in rects]) y1 = np.average([rect.y1 for (rect, score) in rects]) y2 = np.average([rect.y2 for (rect, score) in rects]) final_bbs.append((RectangleOnImage(x1=x1, y1=y1, x2=x2, y2=y2, shape=rect.shape), score_avg)) return final_bbs def _make_rects_full_size(self, hm_idx_to_rects, img_orig_shape, keep_grouping=False): rects_full_size = [] if keep_grouping: group = [] rects_full_size.append(group) else: group = rects_full_size for (rect, score) in hm_idx_to_rects[0]: group.append((rect, score)) nb_cells_y = 3 nb_cells_x = 3 grid_idx = 1 for row_idx in range(nb_cells_y): for col_idx in range(nb_cells_x): if keep_grouping: group = [] rects_full_size.append(group) else: group = rects_full_size left = col_idx right = nb_cells_x - col_idx - 1 above = row_idx below = nb_cells_y - row_idx - 1 for (rect, score) in hm_idx_to_rects[grid_idx]: x1 = rect.x1 - (left * rect.width) x2 = rect.x2 + (right * rect.width) y1 = rect.y1 - (above * rect.height) y2 = rect.y2 + (below * rect.height) rect_full_size = RectangleOnImage(x1=x1, x2=x2, y1=y1, y2=y2, shape=img_orig_shape) group.append((rect_full_size, score)) grid_idx += 1 return rects_full_size def _group_rects(self, rects_full_size): if len(rects_full_size) == 0: return dict() elif len(rects_full_size) == 1: return dict([(0, [rects_full_size])]) else: distances = np.zeros((len(rects_full_size), len(rects_full_size)), dtype=np.float32) for i in range(len(rects_full_size)): rect1 = rects_full_size[i][0] for j in range(i+1, len(rects_full_size)): rect2 = rects_full_size[j][0] sim = rect1.iou(rect2) distances[i, j] = (1 - sim) distances[j, i] = (1 - sim) clusterer = DBSCAN(metric="precomputed") labels = clusterer.fit_predict(distances) clusters = defaultdict(list) for label, (rect, score) in zip(labels, rects_full_size): clusters[label].append((rect, score)) return clusters """ def debug_frame(frame_idx, img, model, conf_threshold, input_size=224): """Corresponding function to process_frame() that effectively does the same, but shows some debug information. Probably doesn't work currently as some functions were moved into a class. """ img_orig_shape = img.shape img_pad, (pad_top, pad_right, pad_bottom, pad_left) = to_aspect_ratio_add(img, 1.0, return_paddings=True) img_rs = misc.imresize(img_pad, (input_size, input_size)) inputs = (np.array([img_rs])/255.0).astype(np.float32).transpose(0, 3, 1, 2) inputs = torch.from_numpy(inputs) inputs = Variable(inputs) if GPU >= 0: inputs = inputs.cuda(GPU) outputs_pred = model(inputs) outputs_pred = outputs_pred.data.cpu().numpy() print("outputs_pred", np.min(outputs_pred), np.average(outputs_pred), np.max(outputs_pred)) hm_idx_to_rects_pad = [] for i in range(outputs_pred.shape[1]): hms = heatmap_to_rects(outputs_pred[0, i, ...], img_rs) hm_idx_to_rects_pad.append(hms) hm_idx_to_rects = [] for i in range(outputs_pred.shape[1]): hms = heatmap_to_rects(outputs_pred[0, i, ...], img_rs) hms_rev = rects_reverse_projection(hms, img.shape, img_pad.shape, pad_top, pad_right, pad_bottom, pad_left) hm_idx_to_rects.append(hms_rev) rects_full_size_vis = make_rects_full_size(hm_idx_to_rects, img_orig_shape, keep_grouping=True) rects_full_size = make_rects_full_size(hm_idx_to_rects, img_orig_shape, keep_grouping=False) groups = group_rects(rects_full_size) final_bbs = [] #for label, rects in cluster.iteritems(): for label in groups: rects = groups[label] score_avg = sum([score for (rect, score) in rects]) / (1+9) if score_avg > conf_threshold: x1 = np.average([rect.x1 for (rect, score) in rects]) x2 = np.average([rect.x2 for (rect, score) in rects]) y1 = np.average([rect.y1 for (rect, score) in rects]) y2 = np.average([rect.y2 for (rect, score) in rects]) final_bbs.append((RectangleOnImage(x1=x1, y1=y1, x2=x2, y2=y2, shape=img_orig_shape), score_avg)) img_rs_nopad = imresize_sidelen(img, 200, pick_func=max) rows = [] # heatmaps row = [misc.imresize(img_rs, (img_rs_nopad.shape[0], img_rs_nopad.shape[1]))] for i in range(outputs_pred.shape[1]): hm = draw_heatmap(img_rs, outputs_pred[0, i]) row.append(misc.imresize(hm, (img_rs_nopad.shape[0], img_rs_nopad.shape[1]))) rows.append(np.hstack(row)) # heatmaps => rects (padded image) #print("pad", pad_top, pad_right, pad_bottom, pad_left) #print("hm_idx_to_rects_pad", hm_idx_to_rects_pad) row = [misc.imresize(img_rs, (img_rs_nopad.shape[0], img_rs_nopad.shape[1]))] for rects in hm_idx_to_rects_pad: img_cp = np.copy(row[0]) for (rect, score) in rects: img_cp = rect.draw_on_image(img_cp, color=[0, 255, 0]) row.append(img_cp) rows.append(np.hstack(row)) # heatmaps => rects (unpadded/original image) #print("hm_idx_to_rects", [[(r.on(img_rs_nopad), s) for (r, s) in rects] for rects in hm_idx_to_rects]) row = [img_rs_nopad] for rects in hm_idx_to_rects: img_cp = np.copy(img_rs_nopad) for (rect, score) in rects: img_cp = rect.draw_on_image(img_cp, color=[0, 255, 0]) row.append(img_cp) rows.append(np.hstack(row)) # heatmaps => rects full size row = [img_rs_nopad] for rects in rects_full_size_vis: img_cp = np.copy(img_rs_nopad) for (rect, score) in rects: img_cp = rect.draw_on_image(img_cp, color=[0, 255, 0]) row.append(img_cp) rows.append(np.hstack(row)) # clustered rects img_cp = np.copy(img_rs_nopad) for label in groups: col = np.random.randint(0, 255, size=(3,)) rects = groups[label] for (rect, score) in rects: img_cp = rect.draw_on_image(img_cp, color=col) row = np.hstack([img_rs_nopad, img_cp]) diff = img_rs_nopad.shape[1] * (1+1+9) - row.shape[1] row = np.pad(row, ((0, 0), (0, diff), (0, 0)), mode="constant", constant_values=0) rows.append(row) # final rects img_cp = np.copy(img_rs_nopad) for (rect, score) in final_bbs: col = np.random.randint(0, 255, size=(3,)) img_cp = rect.draw_on_image(img_cp, color=col) row = np.hstack([img_rs_nopad, img_cp]) diff = img_rs_nopad.shape[1] * (1+1+9) - row.shape[1] row = np.pad(row, ((0, 0), (0, diff), (0, 0)), mode="constant", constant_values=0) rows.append(row) #print([r.shape for r in rows]) misc.imshow(np.vstack(rows)) if __name__ == "__main__": main()