#!/usr/bin/env python
'''
\Description Use an object detector and trackers to automatically label data.
\Brief Algorithm
Step 1. Run object detection on the current frame.
        1.1 If no object is detected in the frame, skip it and GO BACK TO STEP 1 with the next frame.
        1.2 If at least one object is found, SAVE the labels to files and GO TO STEP 2.

Step 2. Use trackers to follow all objects found in Step 1.
        2.1 If no tracker misses its object, SAVE the labels to files and REPEAT STEP 2 on the next frame.
        2.2 If any tracker misses its object, GO BACK TO STEP 1.

Note:
    - All classIds are currently based on the classIds of the object detector. Therefore, a way to
      integrate them with `class_list.txt` is still needed.
'''


import argparse
import json
import os
import re
import cv2
import numpy as np
from tqdm import tqdm
from lxml import etree
import xml.etree.cElementTree as ET
import sys
sys.path.insert(0, "..")
from object_detection.tf_object_detection import ObjectDetector
import configparser

#--------------INIT OBJECT DETECTOR---------------------
# Read config
config = configparser.ConfigParser()
config.read('config.ini')

# Init device for using object detection
os.environ['CUDA_VISIBLE_DEVICES'] = config['OBJECT_DETECTOR_PARAMETERS']['CUDA_VISIBLE_DEVICES']

# Get object detection parameters
OBJECT_SCORE_THRESHOLD = float(config['OBJECT_DETECTOR_PARAMETERS']['OBJECT_SCORE_THRESHOLD'])
OBJECT_IDS = config['OBJECT_DETECTOR_PARAMETERS']['OBJECT_IDS']
OBJECT_IDS = [int(obj_id) for obj_id in OBJECT_IDS.split(",")]
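
# A minimal `config.ini` sketch this script assumes (section and key names taken from the reads
# above; the values below are hypothetical examples):
#
#   [OBJECT_DETECTOR_PARAMETERS]
#   CUDA_VISIBLE_DEVICES = 0
#   OBJECT_SCORE_THRESHOLD = 0.5
#   OBJECT_IDS = 1,2,3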

# Path to the object detection model graph (alternative models commented out below)
graph_model_path = "../object_detection/models/ssdlite_mobilenet_v2_coco_2018_05_09/frozen_inference_graph.pb"
# graph_model_path = "../object_detection/models/mask_rcnn_inception_v2_coco_2018_01_28/frozen_inference_graph.pb"
# graph_model_path = "../object_detection/models/faster_rcnn_nas_coco_2018_01_28/frozen_inference_graph.pb"

# Init object detector
detector = ObjectDetector(graph_path=graph_model_path, score_threshold=OBJECT_SCORE_THRESHOLD,
                          objIds=OBJECT_IDS)

#----------------END INIT OBJECT DETECTOR----------------------



DELAY = 100 # keyboard delay (in milliseconds)
WITH_QT = False
try:
    cv2.namedWindow('Test')
    cv2.displayOverlay('Test', 'Test QT', 500)
    WITH_QT = True
except cv2.error:
    print('-> Please ignore this error message\n')
cv2.destroyAllWindows()


parser = argparse.ArgumentParser(description='Open-source image labeling tool')
parser.add_argument('-i', '--input_dir', default='input', type=str, help='Path to input directory')
parser.add_argument('-o', '--output_dir', default='output', type=str, help='Path to output directory')
parser.add_argument('-t', '--thickness', default='1', type=int, help='Bounding box and cross line thickness')
args = parser.parse_args()

class_index = 0
img_index = 0
is_last_frame = False
img = None
img_objects = []

INPUT_DIR = args.input_dir
OUTPUT_DIR = args.output_dir

WINDOW_NAME = 'OpenLabeling'
TRACKBAR_IMG = 'Image'
TRACKBAR_CLASS = 'Class'

annotation_formats = {'PASCAL_VOC' : '.xml', 'YOLO_darknet' : '.txt'}
TRACKER_DIR = os.path.join(OUTPUT_DIR, '.tracker')

# selected bounding box
prev_was_double_click = False
LINE_THICKNESS = args.thickness

mouse_x = 0
mouse_y = 0
point_1 = (-1, -1)
point_2 = (-1, -1)



def display_text(text, time):
    if WITH_QT:
        cv2.displayOverlay(WINDOW_NAME, text, time)
    else:
        print(text)

def set_img_index(x):
    global img_index, img
    img_index = x
    img_path = IMAGE_PATH_LIST[img_index]
    img = cv2.imread(img_path)
    text = 'Showing image {}/{}, path: {}'.format(str(img_index), str(last_img_index), img_path)
    display_text(text, 1000)


def set_class_index(x):
    global class_index
    class_index = x
    text = 'Selected class {}/{} -> {}'.format(str(class_index), str(last_class_index), CLASS_LIST[class_index])
    display_text(text, 3000)


def draw_edges(tmp_img):
    blur = cv2.bilateralFilter(tmp_img, 3, 75, 75)
    edges = cv2.Canny(blur, 150, 250, 3)
    edges = cv2.cvtColor(edges, cv2.COLOR_GRAY2RGB)
    # Overlap image and edges together
    tmp_img = np.bitwise_or(tmp_img, edges)
    #tmp_img = cv2.addWeighted(tmp_img, 1 - edges_val, edges, edges_val, 0)
    return tmp_img


def decrease_index(current_index, last_index):
    current_index -= 1
    if current_index < 0:
        current_index = last_index
    return current_index


def increase_index(current_index, last_index):
    global is_last_frame
    current_index += 1
    if current_index > last_index:
        current_index = 0
        is_last_frame = True
    return current_index


def draw_line(img, x, y, height, width, color):
    cv2.line(img, (x, 0), (x, height), color, LINE_THICKNESS)
    cv2.line(img, (0, y), (width, y), color, LINE_THICKNESS)


def yolo_format(class_index, point_1, point_2, width, height):
    # YOLO wants everything normalized
    # Order: class x_center y_center x_width y_height
    x_center = (point_1[0] + point_2[0]) / float(2.0 * width)
    y_center = (point_1[1] + point_2[1]) / float(2.0 * height)
    x_width = float(abs(point_2[0] - point_1[0])) / width
    y_height = float(abs(point_2[1] - point_1[1])) / height
    items = map(str, [class_index, x_center, y_center, x_width, y_height])
    return ' '.join(items)
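
# Worked example (hypothetical numbers): a class-0 box from (10, 20) to (50, 60) in a 100x100 image:
#   yolo_format(0, (10, 20), (50, 60), 100, 100) -> '0 0.3 0.4 0.4 0.4'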


def voc_format(class_name, point_1, point_2):
    # Order: class_name xmin ymin xmax ymax
    xmin, ymin = min(point_1[0], point_2[0]), min(point_1[1], point_2[1])
    xmax, ymax = max(point_1[0], point_2[0]), max(point_1[1], point_2[1])
    items = map(str, [class_name, xmin, ymin, xmax, ymax])
    return items


def write_xml(xml_str, xml_path):
    # remove blank text before prettifying the xml
    parser = etree.XMLParser(remove_blank_text=True)
    root = etree.fromstring(xml_str, parser)
    # prettify
    xml_str = etree.tostring(root, pretty_print=True)
    # save to file
    with open(xml_path, 'wb') as temp_xml:
        temp_xml.write(xml_str)


def append_bb(ann_path, line, extension):
    if '.txt' in extension:
        with open(ann_path, 'a') as myfile:
            myfile.write(line + '\n') # append line
    elif '.xml' in extension:
        class_name, xmin, ymin, xmax, ymax = line

        tree = ET.parse(ann_path)
        annotation = tree.getroot()

        obj = ET.SubElement(annotation, 'object')
        ET.SubElement(obj, 'name').text = class_name
        ET.SubElement(obj, 'pose').text = 'Unspecified'
        ET.SubElement(obj, 'truncated').text = '0'
        ET.SubElement(obj, 'difficult').text = '0'

        bbox = ET.SubElement(obj, 'bndbox')
        ET.SubElement(bbox, 'xmin').text = xmin
        ET.SubElement(bbox, 'ymin').text = ymin
        ET.SubElement(bbox, 'xmax').text = xmax
        ET.SubElement(bbox, 'ymax').text = ymax

        xml_str = ET.tostring(annotation)
        write_xml(xml_str, ann_path)
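
# The appended <object> element looks roughly like this (hypothetical values):
#   <object>
#     <name>person</name>
#     <pose>Unspecified</pose>
#     <truncated>0</truncated>
#     <difficult>0</difficult>
#     <bndbox><xmin>10</xmin><ymin>20</ymin><xmax>50</xmax><ymax>60</ymax></bndbox>
#   </object>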


def yolo_to_voc(x_center, y_center, x_width, y_height, width, height):
    x_center *= float(width)
    y_center *= float(height)
    x_width *= float(width)
    y_height *= float(height)
    x_width /= 2.0
    y_height /= 2.0
    xmin = int(round(x_center - x_width))
    ymin = int(round(y_center - y_height))
    xmax = int(round(x_center + x_width))
    ymax = int(round(y_center + y_height))
    return xmin, ymin, xmax, ymax
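
# Worked example (inverse of the yolo_format example above):
#   yolo_to_voc(0.3, 0.4, 0.4, 0.4, 100, 100) -> (10, 20, 50, 60)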


def draw_text(tmp_img, text, center, color, size):
    font = cv2.FONT_HERSHEY_SIMPLEX
    cv2.putText(tmp_img, text, center, font, 0.6, color, size, cv2.LINE_AA)
    return tmp_img


def get_xml_object_data(obj):
    class_name = obj.find('name').text
    class_index = CLASS_LIST.index(class_name)
    bndbox = obj.find('bndbox')
    xmin = int(bndbox.find('xmin').text)
    xmax = int(bndbox.find('xmax').text)
    ymin = int(bndbox.find('ymin').text)
    ymax = int(bndbox.find('ymax').text)
    return [class_name, class_index, xmin, ymin, xmax, ymax]


def draw_bboxes_from_file(tmp_img, annotation_paths, width, height):
    global img_objects
    img_objects = []
    ann_path = next(path for path in annotation_paths if 'PASCAL_VOC' in path)
    if os.path.isfile(ann_path):
        tree = ET.parse(ann_path)
        annotation = tree.getroot()
        for obj in annotation.findall('object'):
            class_name, class_index, xmin, ymin, xmax, ymax = get_xml_object_data(obj)
            #print('{} {} {} {} {}'.format(class_index, xmin, ymin, xmax, ymax))
            img_objects.append([class_index, xmin, ymin, xmax, ymax])
            color = class_rgb[class_index].tolist()
            cv2.rectangle(tmp_img, (xmin, ymin), (xmax, ymax), color, LINE_THICKNESS)
            tmp_img = draw_text(tmp_img, class_name, (xmin, ymin - 5), color, LINE_THICKNESS)
    return tmp_img


def get_bbox_area(x1, y1, x2, y2):
    width = abs(x2 - x1)
    height = abs(y2 - y1)
    return width*height


def get_close_icon(x1, y1, x2, y2):
    percentage = 0.05
    height = -1
    while height < 15 and percentage < 1.0:
        height = int((y2 - y1) * percentage)
        percentage += 0.1
    return (x2 - height), y1, x2, (y1 + height)


def draw_close_icon(tmp_img, x1_c, y1_c, x2_c, y2_c):
    red = (0,0,255)
    cv2.rectangle(tmp_img, (x1_c + 1, y1_c - 1), (x2_c, y2_c), red, -1)
    white = (255, 255, 255)
    cv2.line(tmp_img, (x1_c, y1_c), (x2_c, y2_c), white, 2)
    cv2.line(tmp_img, (x1_c, y2_c), (x2_c, y1_c), white, 2)
    return tmp_img


def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    return [int(text) if text.isdigit() else text.lower()
            for text in _nsre.split(s)]
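
# Example: natural sorting keeps numbered frames in numeric order (hypothetical file names):
#   sorted(['frame_10.jpg', 'frame_2.jpg'], key=natural_sort_key) -> ['frame_2.jpg', 'frame_10.jpg']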

def convert_video_to_images(video_path, n_frames, desired_img_format):
    # create folder to store images (if video was not converted to images already)
    file_path, file_extension = os.path.splitext(video_path)
    # append extension to avoid collision of videos with same name
    # e.g.: `video.mp4`, `video.avi` -> `video_mp4/`, `video_avi/`
    file_extension = file_extension.replace('.', '_')
    file_path += file_extension
    video_name_ext = os.path.basename(file_path)
    if not os.path.exists(file_path):
        print(' Converting video to individual frames...')
        cap = cv2.VideoCapture(video_path)
        os.makedirs(file_path)
        # read the video
        for i in tqdm(range(n_frames)):
            if not cap.isOpened():
                break
            # capture frame-by-frame
            ret, frame = cap.read()
            if ret == True:
                # save each frame (we use this format to avoid repetitions)
                frame_name =  '{}_{}{}'.format(video_name_ext, i, desired_img_format)
                frame_path = os.path.join(file_path, frame_name)
                cv2.imwrite(frame_path, frame)
        # release the video capture object
        cap.release()
    return file_path, video_name_ext
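
# Example (hypothetical names): 'input/video.mp4' with 3 frames is converted to the folder
# 'input/video_mp4/' containing 'video_mp4_0.jpg', 'video_mp4_1.jpg', 'video_mp4_2.jpg',
# and the function returns ('input/video_mp4', 'video_mp4').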


def nonblank_lines(f):
    for l in f:
        line = l.rstrip()
        if line:
            yield line


def get_annotation_paths(img_path, annotation_formats):
    annotation_paths = []
    for ann_dir, ann_ext in annotation_formats.items():
        new_path = os.path.join(OUTPUT_DIR, ann_dir)
        new_path = img_path.replace(INPUT_DIR, new_path, 1)
        pre_path, img_ext = os.path.splitext(new_path)
        new_path = new_path.replace(img_ext, ann_ext, 1)
        annotation_paths.append(new_path)
    return annotation_paths
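
# Example (hypothetical paths), with INPUT_DIR='input' and OUTPUT_DIR='output'; the order follows
# the annotation_formats dict defined above:
#   get_annotation_paths('input/video_mp4/video_mp4_0.jpg', annotation_formats)
#   -> ['output/PASCAL_VOC/video_mp4/video_mp4_0.xml',
#       'output/YOLO_darknet/video_mp4/video_mp4_0.txt']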


def create_PASCAL_VOC_xml(xml_path, abs_path, folder_name, image_name, img_height, img_width, depth):
    # By: Jatin Kumar Mandav
    annotation = ET.Element('annotation')
    ET.SubElement(annotation, 'folder').text = folder_name
    ET.SubElement(annotation, 'filename').text = image_name
    ET.SubElement(annotation, 'path').text = abs_path
    source = ET.SubElement(annotation, 'source')
    ET.SubElement(source, 'database').text = 'Unknown'
    size = ET.SubElement(annotation, 'size')
    ET.SubElement(size, 'width').text = img_width
    ET.SubElement(size, 'height').text = img_height
    ET.SubElement(size, 'depth').text = depth
    ET.SubElement(annotation, 'segmented').text = '0'

    xml_str = ET.tostring(annotation)
    write_xml(xml_str, xml_path)


def save_bounding_box(annotation_paths, class_index, point_1, point_2, width, height):
    for ann_path in annotation_paths:
        if '.txt' in ann_path:
            line = yolo_format(class_index, point_1, point_2, width, height)
            append_bb(ann_path, line, '.txt')
        elif '.xml' in ann_path:
            line = voc_format(CLASS_LIST[class_index], point_1, point_2)
            append_bb(ann_path, line, '.xml')

def is_frame_from_video(img_path):
    for video_name in VIDEO_NAME_DICT:
        video_dir = os.path.join(INPUT_DIR, video_name)
        if os.path.dirname(img_path) == video_dir:
            # image belongs to a video
            return True, video_name
    return False, None


def get_json_file_data(json_file_path):
    if os.path.isfile(json_file_path):
        with open(json_file_path) as f:
            data = json.load(f)
            return True, data
    else:
        return False, {'n_anchor_ids':0, 'frame_data_dict':{}}


def get_prev_frame_path_list(video_name, img_path):
    first_index = VIDEO_NAME_DICT[video_name]['first_index']
    last_index = VIDEO_NAME_DICT[video_name]['last_index']
    img_index = IMAGE_PATH_LIST.index(img_path)
    return IMAGE_PATH_LIST[first_index:img_index]


def get_next_frame_path_list(video_name, img_path):
    first_index = VIDEO_NAME_DICT[video_name]['first_index']
    last_index = VIDEO_NAME_DICT[video_name]['last_index']
    img_index = IMAGE_PATH_LIST.index(img_path)
    return IMAGE_PATH_LIST[(img_index + 1):last_index]


def get_json_object_dict(obj, json_object_list):
    if len(json_object_list) > 0:
        class_index, xmin, ymin, xmax, ymax = map(int, obj)
        for d in json_object_list:
            if (d['class_index'] == class_index and
                    d['bbox']['xmin'] == xmin and
                    d['bbox']['ymin'] == ymin and
                    d['bbox']['xmax'] == xmax and
                    d['bbox']['ymax'] == ymax):
                return d
    return None


def remove_already_tracked_objects(object_list, img_path, json_file_data):
    frame_data_dict = json_file_data['frame_data_dict']
    json_object_list = get_json_file_object_list(img_path, frame_data_dict)
    # copy the list since we will be deleting elements without restarting the loop
    temp_object_list = object_list[:]
    for obj in temp_object_list:
        obj_dict = get_json_object_dict(obj, json_object_list)
        if obj_dict is not None:
            object_list.remove(obj)
            json_object_list.remove(obj_dict)
    return object_list


def get_json_file_object_by_id(json_object_list, anchor_id):
    for obj_dict in json_object_list:
        if obj_dict['anchor_id'] == anchor_id:
            return obj_dict
    return None


def get_json_file_object_list(img_path, frame_data_dict):
    object_list = []
    if img_path in frame_data_dict:
        object_list = frame_data_dict[img_path]
    return object_list


def json_file_add_object(frame_data_dict, img_path, anchor_id, pred_counter, obj):
    object_list = get_json_file_object_list(img_path, frame_data_dict)
    class_index, xmin, ymin, xmax, ymax = obj

    bbox = {
      'xmin': xmin,
      'ymin': ymin,
      'xmax': xmax,
      'ymax': ymax
    }

    temp_obj = {
      'anchor_id': anchor_id,
      'prediction_index': pred_counter,
      'class_index': class_index,
      'bbox': bbox
    }

    object_list.append(temp_obj)
    frame_data_dict[img_path] = object_list

    return frame_data_dict
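
# The resulting .tracker json file has roughly this shape (hypothetical values):
#   {
#     "n_anchor_ids": 2,
#     "frame_data_dict": {
#       "input/video_mp4/video_mp4_0.jpg": [
#         {"anchor_id": 1, "prediction_index": 0, "class_index": 0,
#          "bbox": {"xmin": 10, "ymin": 20, "xmax": 50, "ymax": 60}}
#       ]
#     }
#   }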


class Tracker:
    ''' Special thanks to Rafael Caballero Gonzalez '''
    # extract the OpenCV version info, e.g.:
    # OpenCV 3.3.4 -> [major_ver].[minor_ver].[subminor_ver]
    (major_ver, minor_ver, subminor_ver) = (cv2.__version__).split('.')

    # TODO: press ESC to stop the tracking process

    def __init__(self, tracker_type, anchorId, classId):
        self.instance = self.call_tracker_constructor(tracker_type)  # tracker instance
        self.classId = classId    # class id of the tracked object (e.g. person, bicycle, ...)
        self.anchorId = anchorId  # id of this tracker

    def call_tracker_constructor(self, tracker_type):
        # -- TODO: remove this if I assume OpenCV version > 3.4.0
        if int(self.major_ver) == 3 and int(self.minor_ver) < 3:
            tracker = cv2.Tracker_create(tracker_type)
        # --
        else:
            if tracker_type == 'CSRT':
                tracker = cv2.TrackerCSRT_create()
            elif tracker_type == 'KCF':
                tracker = cv2.TrackerKCF_create()
            elif tracker_type == 'MOSSE':
                tracker = cv2.TrackerMOSSE_create()
            elif tracker_type == 'MIL':
                tracker = cv2.TrackerMIL_create()
            elif tracker_type == 'BOOSTING':
                tracker = cv2.TrackerBoosting_create()
            elif tracker_type == 'MEDIANFLOW':
                tracker = cv2.TrackerMedianFlow_create()
            elif tracker_type == 'TLD':
                tracker = cv2.TrackerTLD_create()
            elif tracker_type == 'GOTURN':
                tracker = cv2.TrackerGOTURN_create()
        return tracker

'''
\brief This class manages all the trackers.
       It checks whether any tracker "misses" its object in the current (init) frame:
            + If any does, stop the TrackerManager.
            + If none does, use the trackers to predict the objects in the next frame.
              Continue until some tracker "misses" its object.
'''
class TrackerManager:
    ''' Special thanks to Rafael Caballero Gonzalez '''
    # extract the OpenCV version info, e.g.:
    # OpenCV 3.3.4 -> [major_ver].[minor_ver].[subminor_ver]
    (major_ver, minor_ver, subminor_ver) = (cv2.__version__).split('.')

    # TODO: press ESC to stop the tracking process

    def __init__(self, tracker_type, init_frame, next_frame_path_list):
        tracker_types = ['CSRT', 'KCF', 'MOSSE', 'MIL', 'BOOSTING', 'MEDIANFLOW', 'TLD', 'GOTURN']
        ''' Recommended tracker_type:
              KCF -> usually very good (minimum OpenCV 3.1.0)
              CSRT -> more accurate than KCF but slightly slower (minimum OpenCV 3.4.2)
              MOSSE -> less accurate than KCF but very fast (minimum OpenCV 3.4.1)
        '''
        self.tracker_type = tracker_type
        # -- TODO: remove this if I assume OpenCV version > 3.4.0
        if tracker_type == tracker_types[0] or tracker_type == tracker_types[2]:
            if int(self.major_ver) == 3 and int(self.minor_ver) < 4:
                self.tracker_type = tracker_types[1]  # Use KCF instead of CSRT or MOSSE
        # --
        self.init_frame = init_frame
        self.next_frame_path_list = next_frame_path_list
        self.trackers = []
        self.img_h, self.img_w = init_frame.shape[:2]


    '''
    \brief Initialize one tracker per detected bounding box and save the initial labels
    '''
    def init_trackers(self, bboxes, classIds, json_file_data, json_file_path, img_path):
        global img_index

        anchor_id = json_file_data['n_anchor_ids']
        frame_data_dict = json_file_data['frame_data_dict']
        image = cv2.imread(img_path)
        for box, classId in zip(bboxes, classIds):
            # Init a tracker for this detection with its classId and a new anchorId
            anchor_id = anchor_id + 1
            tracker = Tracker(self.tracker_type, anchorId=anchor_id, classId=classId)
            initial_bbox = (box[0], box[1], box[2], box[3])
            tracker.instance.init(self.init_frame, initial_bbox)
            self.trackers.append(tracker)

            # Record the initial detection in the json file
            pred_counter  = 0
            xmin, ymin, w, h = map(int, box)
            xmax = xmin + w
            ymax = ymin + h
            obj = [int(classId), xmin, ymin, xmax, ymax]
            frame_data_dict = json_file_add_object(frame_data_dict, img_path, anchor_id, pred_counter, obj)

            # Save prediction
            annotation_paths = get_annotation_paths(img_path, annotation_formats)
            save_bounding_box(annotation_paths, int(classId), (xmin, ymin), (xmax, ymax), self.img_w, self.img_h)



            # Draw the box on the displayed image
            color = class_rgb[int(tracker.classId)].tolist()
            cv2.rectangle(image, (xmin, ymin), (xmax, ymax), color, LINE_THICKNESS)

        img_index = increase_index(img_index, last_img_index)
        cv2.setTrackbarPos(TRACKBAR_IMG, WINDOW_NAME, img_index)
        cv2.imshow(WINDOW_NAME, image)
        pressed_key = cv2.waitKey(DELAY)

        json_file_data.update({'n_anchor_ids': (anchor_id + 1)})

        # save the updated data
        with open(json_file_path, 'w') as outfile:
            json.dump(json_file_data, outfile, sort_keys=True, indent=4)


    def predict_next_frames(self, json_file_data, json_file_path):
        global img_index

        anchor_id = json_file_data['n_anchor_ids']
        frame_data_dict = json_file_data['frame_data_dict']

        pred_counter = 0
        for frame_path in self.next_frame_path_list:
            is_there_missed_tracker = False
            bboxes = []
            # Check if there is any "miss" tracker
            for tracker in self.trackers:
                next_image = cv2.imread(frame_path)
                success, bbox = tracker.instance.update(next_image.copy())
                bboxes.append(bbox)
                if not success:
                    is_there_missed_tracker = True
                    break

            # if there is no "miss" tracker, then save labelled objects into files and keep predict at the next frame
            if not is_there_missed_tracker:
                pred_counter += 1
                for i, tracker in enumerate(self.trackers):
                    box = bboxes[i]

                    xmin, ymin, w, h = map(int, box)
                    xmax = xmin + w
                    ymax = ymin + h
                    obj = [int(tracker.classId), xmin, ymin, xmax, ymax]
                    frame_data_dict = json_file_add_object(frame_data_dict, frame_path, int(tracker.anchorId), pred_counter, obj)

                    color = class_rgb[int(tracker.classId)].tolist()
                    cv2.rectangle(next_image, (xmin, ymin), (xmax, ymax), color, LINE_THICKNESS)

                    # save prediction
                    annotation_paths = get_annotation_paths(frame_path, annotation_formats)
                    save_bounding_box(annotation_paths, int(tracker.classId), (xmin, ymin), (xmax, ymax), self.img_w, self.img_h)



                cv2.imshow(WINDOW_NAME, next_image)
                pressed_key = cv2.waitKey(DELAY)

                img_index = increase_index(img_index, last_img_index)

                cv2.setTrackbarPos(TRACKBAR_IMG, WINDOW_NAME, img_index)
            # If there is "miss" traker, then break Tracker Manager.
            # Note:Ready to use "Object Detection" to detect object
            else:
                break

        json_file_data.update({'n_anchor_ids': (anchor_id + 1)})
        # save the updated data
        with open(json_file_path, 'w') as outfile:
            json.dump(json_file_data, outfile, sort_keys=True, indent=4)




# change to the directory of this script
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# load all images and videos (with multiple extensions) from a directory using OpenCV
IMAGE_PATH_LIST = []
VIDEO_NAME_DICT = {}
for f in sorted(os.listdir(INPUT_DIR), key = natural_sort_key):
    f_path = os.path.join(INPUT_DIR, f)
    if os.path.isdir(f_path):
        # skip directories
        continue
    # check if it is an image
    test_img = cv2.imread(f_path)
    if test_img is not None:
        IMAGE_PATH_LIST.append(f_path)
    else:
        # test if it is a video
        test_video_cap = cv2.VideoCapture(f_path)
        n_frames = int(test_video_cap.get(cv2.CAP_PROP_FRAME_COUNT))
        test_video_cap.release()
        if n_frames > 0:
            # it is a video
            desired_img_format = '.jpg'
            video_frames_path, video_name_ext = convert_video_to_images(f_path, n_frames, desired_img_format)
            # add video frames to image list
            frame_list = sorted(os.listdir(video_frames_path), key = natural_sort_key)
            ## store information about those frames
            first_index = len(IMAGE_PATH_LIST)
            last_index = first_index + len(frame_list) # exclusive
            indexes_dict = {}
            indexes_dict['first_index'] = first_index
            indexes_dict['last_index'] = last_index
            VIDEO_NAME_DICT[video_name_ext] = indexes_dict
            IMAGE_PATH_LIST.extend((os.path.join(video_frames_path, frame) for frame in frame_list))
last_img_index = len(IMAGE_PATH_LIST) - 1

# create output directories
if len(VIDEO_NAME_DICT) > 0:
    if not os.path.exists(TRACKER_DIR):
        os.makedirs(TRACKER_DIR)
for ann_dir in annotation_formats:
    new_dir = os.path.join(OUTPUT_DIR, ann_dir)
    if not os.path.exists(new_dir):
        os.makedirs(new_dir)
    for video_name_ext in VIDEO_NAME_DICT:
        new_video_dir = os.path.join(new_dir, video_name_ext)
        if not os.path.exists(new_video_dir):
            os.makedirs(new_video_dir)

# create an empty annotation file for each image, if it doesn't exist already
for img_path in IMAGE_PATH_LIST:
    # image info for the .xml file
    test_img = cv2.imread(img_path)
    abs_path = os.path.abspath(img_path)
    folder_name = os.path.dirname(img_path)
    image_name = os.path.basename(img_path)
    img_height, img_width, depth = (str(number) for number in test_img.shape)

    for ann_path in get_annotation_paths(img_path, annotation_formats):
        if not os.path.isfile(ann_path):
            if '.txt' in ann_path:
                open(ann_path, 'a').close()
            elif '.xml' in ann_path:
                create_PASCAL_VOC_xml(ann_path, abs_path, folder_name, image_name, img_height, img_width, depth)

# load class list
with open('class_list.txt') as f:
    CLASS_LIST = list(nonblank_lines(f))
#print(CLASS_LIST)
last_class_index = len(CLASS_LIST) - 1
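
# `class_list.txt` is expected to contain one class name per line, e.g. (hypothetical contents):
#   person
#   bicycle
#   car
# Note that the detector's classIds may not match these indices (see the note at the top of this file).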

# Make the class colors the same each session
# The colors are in BGR order because we're using OpenCV
class_rgb = [
    (0, 0, 255), (255, 0, 0), (0, 255, 0), (255, 255, 0), (0, 255, 255),
    (255, 0, 255), (192, 192, 192), (128, 128, 128), (128, 0, 0),
    (128, 128, 0), (0, 128, 0), (128, 0, 128), (0, 128, 128), (0, 0, 128)]
class_rgb = np.array(class_rgb)
# If there are still more classes, add new colors randomly
num_colors_missing = len(CLASS_LIST) - len(class_rgb)
if num_colors_missing > 0:
    more_colors = np.random.randint(0, 255+1, size=(num_colors_missing, 3))
    class_rgb = np.vstack([class_rgb, more_colors])

# create window
cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_KEEPRATIO)
cv2.resizeWindow(WINDOW_NAME, 1000, 700)

# selected image
cv2.createTrackbar(TRACKBAR_IMG, WINDOW_NAME, 0, last_img_index, set_img_index)

# selected class
if last_class_index != 0:
    cv2.createTrackbar(TRACKBAR_CLASS, WINDOW_NAME, 0, last_class_index, set_class_index)

# initialize
set_img_index(0)
edges_on = False
display_text('Welcome!\n Press [h] for help.', 4000)

# loop
while True:
    if is_last_frame:
        print("Reach to the last frame!!!!")
        break

    color = class_rgb[class_index].tolist()
    # clone the img
    tmp_img = img.copy()
    height, width = tmp_img.shape[:2]
    if edges_on == True:
        # draw edges
        tmp_img = draw_edges(tmp_img)

    # draw vertical and horizontal guide lines
    draw_line(tmp_img, mouse_x, mouse_y, height, width, color)

    # write selected class
    tmp_img = draw_text(tmp_img, CLASS_LIST[class_index], (mouse_x + 5, mouse_y - 5), color, LINE_THICKNESS)
    img_path = IMAGE_PATH_LIST[img_index]
    annotation_paths = get_annotation_paths(img_path, annotation_formats)

    # draw already done bounding boxes
    tmp_img = draw_bboxes_from_file(tmp_img, annotation_paths, width, height)


    """ Algorithms for automatically labeling!!!!!!!
    #1. Using object detection to find objects. Then save them into annotation path

    #2. If len(objects) == 0, then increase index and come back to Step 1
    #   If len(objects) >  0, then using init TrackerManager to automatically labeling for later frames:
    #   2.1 In TrackerManager, if detect there is any miss detection of any tracker, then comeback to Step 1.
    """

    # Use object detection to find objects of the configured classes
    print("Running object detection...")
    im_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    boxes, confidences, classIds =  detector.detect(im_rgb)

    if not len(boxes):
        cv2.imshow(WINDOW_NAME, tmp_img)
        pressed_key = cv2.waitKey(DELAY)
        img_index = increase_index(img_index, last_img_index)
        cv2.setTrackbarPos(TRACKBAR_IMG, WINDOW_NAME, img_index)

    else:
        print("Using tracker!!!!")
        current_img_path = IMAGE_PATH_LIST[img_index]
        is_from_video, video_name = is_frame_from_video(current_img_path)

        # If the frame belongs to a video
        if is_from_video:
            next_frame_path_list = get_next_frame_path_list(video_name, current_img_path)
            json_file_path = '{}.json'.format(os.path.join(TRACKER_DIR, video_name))
            file_exists, json_file_data = get_json_file_data(json_file_path)
            init_frame = img.copy()

            tracker_manager = TrackerManager('KCF', init_frame, next_frame_path_list)
            tracker_manager.init_trackers(boxes, classIds, json_file_data, json_file_path, current_img_path)
            tracker_manager.predict_next_frames(json_file_data, json_file_path)

        else: # If it is a standalone image
            for box, classId in zip(boxes, classIds):
                save_bounding_box(annotation_paths, classId, (int(box[0]), int(box[1])),
                                  (int(box[0]) + int(box[2]), int(box[1]) + int(box[3])), width, height)

                xmin, ymin, w, h = map(int, box)
                xmax = xmin + w
                ymax = ymin + h
                color = class_rgb[int(classId)].tolist()
                cv2.rectangle(tmp_img, (xmin, ymin), (xmax, ymax), color, LINE_THICKNESS)

            img_index = increase_index(img_index, last_img_index)
            cv2.setTrackbarPos(TRACKBAR_IMG, WINDOW_NAME, img_index)

            cv2.imshow(WINDOW_NAME, tmp_img)
            pressed_key = cv2.waitKey(DELAY)





    if WITH_QT:
        # if window gets closed then quit
        if cv2.getWindowProperty(WINDOW_NAME, cv2.WND_PROP_VISIBLE) < 1:
            break

cv2.destroyAllWindows()