# Copyright(c) 2017 Intel Corporation. # License: MIT See LICENSE file in root directory. GREEN = '\033[1;32m' RED = '\033[1;31m' NOCOLOR = '\033[0m' YELLOW = '\033[1;33m' DEVICE = "MYRIAD" try: from openvino.inference_engine import IENetwork, IECore except: print(RED + '\nPlease make sure your OpenVINO environment variables are set by sourcing the' + YELLOW + ' setupvars.sh ' + RED + 'script found in <your OpenVINO install location>/bin/ folder.\n' + NOCOLOR) exit(1) import sys import numpy as np import cv2 import argparse IOU_THRESHOLD = 0.25 DETECTION_THRESHOLD = 0.10 def parse_args(): parser = argparse.ArgumentParser(description = 'Image classifier using \ IntelĀ® Neural Compute Stick 2.' ) parser.add_argument( '--ir', metavar = 'IR_File', type=str, default = 'tiny-yolo-v1_53000.xml', help = 'Absolute path to the neural network IR xml file.') parser.add_argument( '-l', '--labels', metavar = 'LABEL_FILE', type=str, default = 'labels.txt', help='Absolute path to labels file.') parser.add_argument( '-i', '--image', metavar = 'IMAGE_FILE', type=str, default = '../../data/images/nps_chair.png', help = 'Absolute path to image file.') parser.add_argument( '--threshold', metavar = 'FLOAT', type=float, default = DETECTION_THRESHOLD, help = 'Threshold for detection.') return parser # Interpret the output from a single inference of TinyYolo (GetResult) # and filter out objects/boxes with low probabilities. # output is the array of floats returned from the API GetResult but converted # to float32 format. # input_image_width is the width of the input image # input_image_height is the height of the input image # Returns a list of lists. each of the inner lists represent one found object and contain # the following 6 values: # string that is network classification ie 'cat', or 'chair' etc # float value for box center X pixel location within source image # float value for box center Y pixel location within source image # float value for box width in pixels within source image # float value for box height in pixels within source image # float value that is the probability for the network classification. def filter_objects(inference_result, input_image_width, input_image_height, labels, threshold): # tiny yolo v1 was trained using a 7x7 grid and 2 anchor boxes per grid box with # 20 detection classes # the 20 classes this network was trained on num_classes = len(labels) # should be 20 grid_size = 7 # the image is a 7x7 grid. Each box in the grid is 64x64 pixels anchor_boxes_per_grid_cell = 2 # the number of anchor boxes returned for each grid cell num_coordinates = 4 # number of coordinates # the raw number of floats returned from the inference num_inference_results = len(inference_result) # only keep boxes with probabilities greater than this probability_threshold = threshold # -------------------- Inference result preprocessing -------------------- # Split the Inference result into 3 arrays: class_probabilities, box_confidence_scores, box_coordinates # then Reshape them into the appropriate shapes. # -- Splitting up Inference result # Class probabilities: # 7x7 = 49 grid cells. # 49 grid cells x 20 classes per grid cell = 980 total class probabilities class_probabilities = inference_result[0:980] # Box confidence scores: 7x7 = 49 grid cells. "how likely the box contains an object" # 49 grid cells x 2 boxes per grid cell = 98 box scales box_confidence_scores = inference_result[980:1078] # Box coordinates for all boxes # 98 boxes * 4 box coordinates each = 392 box_coordinates = inference_result[1078:] # -- Reshaping # These values are the class probabilities for each grid # Reshape the probabilities to 7x7x20 (980 total values) class_probabilities = np.reshape(class_probabilities, (grid_size, grid_size, num_classes)) # These values are how likely each box contains an object # Reshape the box confidence scores to 7x7x2 (98 total values) box_confidence_scores = np.reshape(box_confidence_scores, (grid_size, grid_size, anchor_boxes_per_grid_cell)) # These values are the box coordinates for each box # Reshape the boxes coordinates to 7x7x2x4 (392 total values) box_coordinates = np.reshape(box_coordinates, (grid_size, grid_size, anchor_boxes_per_grid_cell, num_coordinates)) # -------------------- Scale the box coordinates to the input image size -------------------- boxes_to_pixel_units(box_coordinates, input_image_width, input_image_height, grid_size) # -------------------- Calculate class confidence scores -------------------- # Find the class confidence scores for each grid. # This is done by multiplying the class probabilities by the box confidence scores # Shape of class confidence scores: 7x7x2x20 (1960 values) class_confidence_scores = np.zeros((grid_size, grid_size, anchor_boxes_per_grid_cell, num_classes)) for box_index in range(anchor_boxes_per_grid_cell): # loop over boxes for class_index in range(num_classes): # loop over classifications class_confidence_scores[:,:,box_index,class_index] = np.multiply(class_probabilities[:,:,class_index], box_confidence_scores[:,:,box_index]) # -------------------- Filter object scores/coordinates/indexes >= threshold -------------------- # Find all scores that are larger than or equal to the threshold using a mask. # Array of 1960 bools: True if >= threshold. otherwise False. score_threshold_mask = np.array(class_confidence_scores>=probability_threshold, dtype='bool') # Using the array of bools, filter all scores >= threshold filtered_scores = class_confidence_scores[score_threshold_mask] # Get tuple of arrays of indexes from the bool array that have a >= score than the threshold # These tuple of array indexes will help to filter out our box coordinates and class indexes # tuple 0 and 1 are the coordinates of the 7x7 grid (values = 0-6) # tuple 2 is the anchor box index (values = 0-1) # tuple 3 is the class indexes (labels) (values = 0-19) box_threshold_mask = np.nonzero(score_threshold_mask) # Use those indexes to find the coordinates for box confidence scores >= than the threshold filtered_box_coordinates = box_coordinates[box_threshold_mask[0], box_threshold_mask[1], box_threshold_mask[2]] # Use those indexes to find the class indexes that have a score >= threshold filtered_class_indexes = np.argmax(class_confidence_scores, axis=3)[box_threshold_mask[0], box_threshold_mask[1], box_threshold_mask[2]] # -------------------- Sort the filtered scores/coordinates/indexes -------------------- # Sort the indexes from highest score to lowest # and then use those indexes to sort box coordinates, scores, class indexes sort_by_highest_score = np.array(np.argsort(filtered_scores))[::-1] # Sort the box coordinates, scores, and class indexes to match filtered_box_coordinates = filtered_box_coordinates[sort_by_highest_score] filtered_scores = filtered_scores[sort_by_highest_score] filtered_class_indexes = filtered_class_indexes[sort_by_highest_score] # -------------------- Filter out duplicates -------------------- # Get mask for boxes that seem to be the same object by calculating iou (intersection over union) # these will filter out duplicate objects duplicate_box_mask = get_duplicate_box_mask(filtered_box_coordinates) # Update the boxes, probabilities and classifications removing duplicates. filtered_box_coordinates = filtered_box_coordinates[duplicate_box_mask] filtered_scores = filtered_scores[duplicate_box_mask] filtered_class_indexes = filtered_class_indexes[duplicate_box_mask] # -------------------- Gather the results -------------------- # Set up list and return class labels, coordinates and scores filtered_results = [] for object_index in range( len( filtered_box_coordinates ) ): filtered_results.append([ labels [ filtered_class_indexes [ object_index ] ], # label of the object filtered_box_coordinates [ object_index ] [ 0 ], # xmin (before image scaling) filtered_box_coordinates [ object_index ] [ 1 ], # ymin (before image scaling) filtered_box_coordinates [ object_index ] [ 2 ], # width (before image scaling) filtered_box_coordinates [ object_index ] [ 3 ], # height (before image scaling) filtered_scores [ object_index ] # object score ]) return filtered_results # creates a mask to remove duplicate objects (boxes) and their related probabilities and classifications # that should be considered the same object. This is determined by how similar the boxes are # based on the intersection-over-union metric. # box_list is as list of boxes (4 floats for centerX, centerY and Length and Width) def get_duplicate_box_mask(box_list): # The intersection-over-union threshold to use when determining duplicates. # objects/boxes found that are over this threshold will be # considered the same object max_iou = IOU_THRESHOLD box_mask = np.ones(len(box_list)) for i in range(len(box_list)): if box_mask[i] == 0: continue for j in range(i + 1, len(box_list)): if get_intersection_over_union(box_list[i], box_list[j]) > max_iou: box_mask[j] = 0.0 filter_iou_mask = np.array(box_mask > 0.0, dtype='bool') return filter_iou_mask # Converts the boxes in box list to pixel units # assumes box_list is the output from the box output from # the tiny yolo network and is [grid_size x grid_size x 2 x 4]. def boxes_to_pixel_units(box_list, image_width, image_height, grid_size): # number of boxes per grid cell boxes_per_cell = 2 # setup some offset values to map boxes to pixels # box_offset will be [[ [0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6]] ...repeated for 7 ] box_offset = np.transpose(np.reshape(np.array([np.arange(grid_size)]*(grid_size*2)),(boxes_per_cell,grid_size, grid_size)),(1,2,0)) # adjust the box center box_list[:,:,:,0] += box_offset box_list[:,:,:,1] += np.transpose(box_offset,(1, 0, 2)) box_list[:,:,:,0:2] = box_list[:,:,:,0:2] / (grid_size * 1.0) # adjust the lengths and widths box_list[:,:,:,2] = np.multiply(box_list[:,:,:,2], box_list[:,:,:,2]) box_list[:,:,:,3] = np.multiply(box_list[:,:,:,3], box_list[:,:,:,3]) #scale the boxes to the image size in pixels box_list[:,:,:,0] *= image_width box_list[:,:,:,1] *= image_height box_list[:,:,:,2] *= image_width box_list[:,:,:,3] *= image_height # Evaluate the intersection-over-union for two boxes # The intersection-over-union metric determines how close # two boxes are to being the same box. The closer the boxes # are to being the same, the closer the metric will be to 1.0 # box_1 and box_2 are arrays of 4 numbers which are the (x, y) # points that define the center of the box and the length and width of # the box. # Returns the intersection-over-union (between 0.0 and 1.0) # for the two boxes specified. def get_intersection_over_union(box_1, box_2): # one diminsion of the intersecting box intersection_dim_1 = min(box_1[0]+0.5*box_1[2],box_2[0]+0.5*box_2[2])-\ max(box_1[0]-0.5*box_1[2],box_2[0]-0.5*box_2[2]) # the other dimension of the intersecting box intersection_dim_2 = min(box_1[1]+0.5*box_1[3],box_2[1]+0.5*box_2[3])-\ max(box_1[1]-0.5*box_1[3],box_2[1]-0.5*box_2[3]) if intersection_dim_1 < 0 or intersection_dim_2 < 0 : # no intersection area intersection_area = 0 else : # intersection area is product of intersection dimensions intersection_area = intersection_dim_1*intersection_dim_2 # calculate the union area which is the area of each box added # and then we need to subtract out the intersection area since # it is counted twice (by definition it is in each box) union_area = box_1[2]*box_1[3] + box_2[2]*box_2[3] - intersection_area; # now we can return the intersection over union iou = intersection_area / union_area return iou # Displays a gui window with an image that contains # boxes and lables for found objects. will not return until # user presses a key. # source_image is the original image for the inference before it was resized or otherwise changed. # filtered_objects is a list of lists (as returned from filter_objects() # each of the inner lists represent one found object and contain # the following 6 values: # string that is network classification ie 'cat', or 'chair' etc # float value for box center X pixel location within source image # float value for box center Y pixel location within source image # float value for box width in pixels within source image # float value for box height in pixels within source image # float value that is the probability for the network classification. def display_objects_in_gui(source_image, filtered_objects, network_input_w, network_input_h): # copy image so we can draw on it. Could just draw directly on source image if not concerned about that. display_image = source_image.copy() source_image_width = source_image.shape[1] source_image_height = source_image.shape[0] x_ratio = float(source_image_width) / network_input_w y_ratio = float(source_image_height) / network_input_h # loop through each box and draw it on the image along with a classification label print('\n Found this many objects in the image: ' + str(len(filtered_objects))) for obj_index in range(len(filtered_objects)): center_x = int(filtered_objects[obj_index][1] * x_ratio) center_y = int(filtered_objects[obj_index][2] * y_ratio) half_width = int(filtered_objects[obj_index][3] * x_ratio)//2 half_height = int(filtered_objects[obj_index][4] * y_ratio)//2 # calculate box (left, top) and (right, bottom) coordinates box_left = max(center_x - half_width, 0) box_top = max(center_y - half_height, 0) box_right = min(center_x + half_width, source_image_width) box_bottom = min(center_y + half_height, source_image_height) print(' - object: ' + YELLOW + str(filtered_objects[obj_index][0]) + NOCOLOR + ' is at left: ' + str(box_left) + ', top: ' + str(box_top) + ', right: ' + str(box_right) + ', bottom: ' + str(box_bottom)) #draw the rectangle on the image. This is hopefully around the object box_color = (0, 255, 0) # green box box_thickness = 2 cv2.rectangle(display_image, (box_left, box_top),(box_right, box_bottom), box_color, box_thickness) # draw the classification label string just above and to the left of the rectangle label_background_color = (70, 120, 70) # greyish green background for text label_text_color = (255, 255, 255) # white text cv2.rectangle(display_image,(box_left, box_top+20),(box_right,box_top), label_background_color, -1) cv2.putText(display_image,filtered_objects[obj_index][0] + ' : %.2f' % filtered_objects[obj_index][5], (box_left+5, box_top+15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, label_text_color, 1) window_name = 'TinyYolo (hit key to exit)' cv2.imshow(window_name, display_image) cv2.moveWindow(window_name, 10, 10) while (True): raw_key = cv2.waitKey(1) # check if the window is visible, this means the user hasn't closed # the window via the X button (may only work with opencv 3.x prop_val = cv2.getWindowProperty(window_name, cv2.WND_PROP_ASPECT_RATIO) if ((raw_key != -1) or (prop_val < 0.0)): # the user hit a key or closed the window (in that order) break def display_info(input_shape, output_shape, image, ir, labels): print() print(YELLOW + 'Tiny Yolo v1: Starting application...' + NOCOLOR) print(' - ' + YELLOW + 'Plugin: ' + NOCOLOR + 'Myriad') print(' - ' + YELLOW + 'IR File: ' + NOCOLOR, ir) print(' - ' + YELLOW + 'Input Shape: ' + NOCOLOR, input_shape) print(' - ' + YELLOW + 'Output Shape:' + NOCOLOR, output_shape) print(' - ' + YELLOW + 'Labels File: ' + NOCOLOR, labels) print(' - ' + YELLOW + 'Image File: ' + NOCOLOR, image) # This function is called from the entry point to do # all the work. def main(): ARGS = parse_args().parse_args() image = ARGS.image labels = ARGS.labels ir = ARGS.ir threshold = ARGS.threshold # Prepare Categories with open(labels) as labels_file: label_list = labels_file.read().splitlines() print(YELLOW + 'Running NCS Caffe TinyYolo example...') ####################### 1. Setup Plugin and Network ####################### # Select the myriad plugin and IRs to be used ie = IECore() net = IENetwork(model = ir, weights = ir[:-3] + 'bin') # Set up the input and output blobs input_blob = next(iter(net.inputs)) output_blob = next(iter(net.outputs)) input_shape = net.inputs[input_blob].shape output_shape = net.outputs[output_blob].shape # Display model information display_info(input_shape, output_shape, image, ir, labels) # Load the network and get the network shape information exec_net = ie.load_network(network = net, device_name = DEVICE) n, c, h, w = input_shape # Read image from file, resize it to network width and height # save a copy in display_image for display, then convert to float32, normalize (divide by 255), # and finally convert to convert to float16 to pass to LoadTensor as input for an inference input_image = cv2.imread(image) display_image = input_image input_image = cv2.resize(input_image, (w, h), cv2.INTER_LINEAR) input_image = input_image.astype(np.float32) input_image = np.transpose(input_image, (2,0,1)) output = exec_net.infer({input_blob: input_image}) output = output[output_blob][0].flatten() filtered_objs = filter_objects(output.astype(np.float32), input_image.shape[1], input_image.shape[2], label_list, threshold) print('\n Displaying image with objects detected in GUI...') print(' Click in the GUI window and hit any key to exit.') # display the filtered objects/boxes in a GUI window display_objects_in_gui(display_image, filtered_objs, input_image.shape[1], input_image.shape[2]) print('\n Finished.') # main entry point for program. we'll call main() to do what needs to be done. if __name__ == "__main__": sys.exit(main())