import numpy as np
import random
from utils import save_obj, load_obj
import torch
from torch.utils import data
import cv2
import os
import h5py
from ReDWebNet import resNet_data_preprocess


# Layout of a ``target`` array as used throughout this file: one column per
# relative-depth point pair, with rows
#   0: y_A, 1: x_A, 2: y_B, 3: x_B
# Row 4 is carried along untouched by the code below (presumably the ordinal
# depth relation between A and B -- TODO confirm against the .pkl producer).


def _read_image(path):
    """Load an image with OpenCV, failing loudly if it cannot be read.

    ``cv2.imread`` returns ``None`` instead of raising when the file is
    missing or unreadable; turn that into an explicit error naming the file.

    Raises:
        IOError: if the image cannot be read.
    """
    img = cv2.imread(path)
    if img is None:
        raise IOError("Could not read image: %s" % path)
    return img


def _clamp_coords_inplace(target, height, width):
    """Clamp the coordinate rows (0-3) of ``target`` in place.

    For y rows (0, 2) values >= ``height`` become ``height - 1``; for x rows
    (1, 3) values >= ``width`` become ``width - 1``.  Negative values become 0.
    """
    for row, limit in ((0, height), (1, width), (2, height), (3, width)):
        coords = target[row, :]          # a view: writes go through to target
        coords[coords >= limit] = limit - 1
        coords[coords < 0] = 0


def _in_bounds_mask(target, height, width):
    """Return a boolean mask of the columns whose two points both lie inside
    an image of size ``height`` x ``width`` (0 <= y < height, 0 <= x < width).
    """
    return np.logical_and.reduce([
        target[0, :] < height,
        target[1, :] < width,
        target[2, :] < height,
        target[3, :] < width,
        target[0, :] >= 0,
        target[1, :] >= 0,
        target[2, :] >= 0,
        target[3, :] >= 0,
    ])


def draw(img, target, fname):
    """Debug helper: render the point pairs of ``target`` onto a copy of
    ``img`` and write the result to ``fname``.

    For every column, point (x=row 1, y=row 0) is drawn in green, point
    (x=row 3, y=row 2) in blue, and an arrow is drawn from the second point
    to the first.
    """
    img_temp = img.copy()

    color_close = (255, 0, 0)    # close is blue
    color_far = (0, 255, 0)      # far is green
    for i in range(target.shape[1]):
        x1 = int(target[1, i])
        y1 = int(target[0, i])
        x2 = int(target[3, i])
        y2 = int(target[2, i])

        cv2.circle(img_temp, (x1, y1), 2, color_far, -1)
        cv2.circle(img_temp, (x2, y2), 2, color_close, -1)
        cv2.arrowedLine(img_temp, (x2, y2), (x1, y1), (0, 255, 255), 1)

    cv2.imwrite(fname, img_temp)
    print("Done writing to %s" % fname)


class data_augmenter():
    """Applies a configurable sequence of random augmentations (zoom, crop,
    rotation, horizontal flip) to an image and its point-pair target."""

    def __init__(self, width, height):
        """
        Args:
            width and height are only used to determine the
            output aspect ratio, not the actual output size
        """
        self.ops = []
        # Turn off OpenCV's internal threading (presumably to avoid clashing
        # with DataLoader worker processes -- TODO confirm).
        cv2.setNumThreads(0)
        self.width = float(width)
        self.height = float(height)

    def add_rotation(self, probability, max_left_rotation=-10, max_right_rotation=10):
        """Register a random rotation, in degrees, applied with ``probability``."""
        self.ops.append({'type': 'rotation', 'probability': probability,
                         'max_left_rotation': max_left_rotation,
                         'max_right_rotation': max_right_rotation})

    def add_zoom(self, probability, min_percentage, max_percentage):
        """Register a random uniform rescale by a factor in
        [min_percentage, max_percentage], applied with ``probability``."""
        self.ops.append({'type': 'zoom', 'probability': probability,
                         'min_percentage': min_percentage,
                         'max_percentage': max_percentage})

    def add_flip_left_right(self, probability):
        """Register a horizontal flip applied with ``probability``."""
        self.ops.append({'type': 'flip_lr', 'probability': probability})

    def add_crop(self, probability, min_percentage=0.5):
        """Register a random crop keeping at least ``min_percentage`` of the
        shorter image side, applied with ``probability``."""
        self.ops.append({'type': 'crop', 'probability': probability,
                         'min_percentage': min_percentage})

    def draw(self, img, target, fname):
        """Debug helper; same rendering as the module-level ``draw``."""
        draw(img, target, fname)

    def __str__(self):
        lines = ['Data Augmenter:\n']
        for op in self.ops:
            fields = ''.join(str(key) + ':' + str(op[key]) + '\t' for key in op.keys())
            lines.append('\t' + fields + '\n')
        return ''.join(lines)

    def aug(self, img, target):
        """Apply the registered operations, in registration order, to
        ``img`` and ``target``.

        Each operation fires when a uniform draw from [0, 1] is <= its
        probability.  After a crop or a rotation, point pairs that fall
        outside the image are dropped; if fewer than two pairs survive, the
        whole augmentation is abandoned and copies of the original inputs are
        returned.

        Args:
            img: image array indexed as [row, col, channel].
            target: array whose rows 0-3 hold y_A, x_A, y_B, x_B in pixels.
                It is not modified.

        Returns:
            (augmented image, augmented target).
        """
        orig_img = img.copy()
        orig_target = target.copy()
        # Work on a private copy so the caller's array is never modified.
        target = target.copy()

        for op in self.ops:
            if random.uniform(0.0, 1.0) > op['probability']:
                continue

            op_type = op['type']
            if op_type == 'crop':
                percentage = random.uniform(op['min_percentage'], 1.0)

                # Size the crop from the shorter side, then derive the other
                # side from the configured aspect ratio (capped at the image).
                if img.shape[0] <= img.shape[1]:
                    dst_h = int(img.shape[0] * percentage)
                    dst_w = min(int(dst_h / self.height * self.width), img.shape[1])
                else:
                    dst_w = int(img.shape[1] * percentage)
                    dst_h = min(int(dst_w / self.width * self.height), img.shape[0])

                offset_y = random.randint(0, img.shape[0] - dst_h)
                offset_x = random.randint(0, img.shape[1] - dst_w)
                img = img[offset_y:offset_y + dst_h, offset_x:offset_x + dst_w, :]

                target[[0, 2], :] = target[[0, 2], :] - offset_y
                target[[1, 3], :] = target[[1, 3], :] - offset_x

                mask = _in_bounds_mask(target, dst_h, dst_w)
                if np.sum(mask) <= 1:
                    return orig_img, orig_target
                target = target[:, mask]

            elif op_type == 'flip_lr':
                img = cv2.flip(img, 1)
                # Pixel column j moves to W - 1 - j (0-based indices), so the
                # x coordinates must be mirrored the same way to stay in bounds.
                last_col = img.shape[1] - 1
                target[[1, 3], :] = last_col - target[[1, 3], :]

            elif op_type == 'zoom':
                percentage = random.uniform(op['min_percentage'], op['max_percentage'])
                img = cv2.resize(img, None, fx=percentage, fy=percentage)
                target[0:4, :] = target[0:4, :] * percentage

            elif op_type == 'rotation':
                # Draw from [-|left|, +|right|] so the range is correct whether
                # the left limit was given as a negative or a positive number.
                angle = random.uniform(-abs(op['max_left_rotation']),
                                       abs(op['max_right_rotation']))
                center = (img.shape[1] // 2, img.shape[0] // 2)
                rot_xy = cv2.getRotationMatrix2D(center, angle, 1.0)
                img = cv2.warpAffine(img, rot_xy, (img.shape[1], img.shape[0]))

                # OpenCV's matrix acts on (x, y); the target stores (y, x).
                # Swap the two rows and the two linear columns accordingly.
                rot_yx = rot_xy[::-1, [1, 0, 2]]
                linear = rot_yx[:, 0:2]
                shift = rot_yx[:, 2:3]
                target[0:2, :] = linear.dot(target[0:2, :]) + shift
                target[2:4, :] = linear.dot(target[2:4, :]) + shift

                mask = _in_bounds_mask(target, img.shape[0], img.shape[1])
                if np.sum(mask) <= 1:
                    return orig_img, orig_target
                target = target[:, mask]

        return img, target


class YoutubeDataset(data.Dataset):
    """Dataset of images paired with relative-depth point-pair targets,
    listed in a CSV file of ``image_path, pkl_path`` lines."""

    def __init__(self, csv_filename,
                 height=240, width=320,
                 b_oppi=False,
                 b_data_aug=False,
                 b_resnet_prep=False):
        """
        Args:
            csv_filename: CSV listing image / target-pickle path pairs.
            height, width: output image size; overridden with 384 x 384 when
                ``b_resnet_prep`` is set.
            b_oppi: only take one relative depth pair per image.
            b_data_aug: enable random data augmentation.
            b_resnet_prep: run ``resNet_data_preprocess`` on the image.
        """
        super(YoutubeDataset, self).__init__()
        print("=====================================================")
        print("Using YoutubeDataset...")
        self.parse_youtube_csv(csv_filename)
        if b_resnet_prep:
            self.height = 384
            self.width = 384
        else:
            self.height = height
            self.width = width

        self.n_sample = len(self.img_names)
        self.b_oppi = b_oppi    # only take one relative depth pair per image
        self.b_resnet_prep = b_resnet_prep
        self.b_data_aug = b_data_aug

        print("\t-(width, height): (%d, %d)" % (self.width, self.height))
        print("\t-%s: %d samples" % (csv_filename, self.n_sample))
        print("\t-One relative depth pair per image: %s" % (self.b_oppi,))
        print("\t-Data augmentation: %s" % (self.b_data_aug,))
        print("\t-Resnet data preprocessing: %s" % (self.b_resnet_prep,))
        print("=====================================================")

        if self.b_data_aug:
            self.da = data_augmenter(width=self.width, height=self.height)
            self.da.add_zoom(0.8, min_percentage=0.5, max_percentage=3.0)
            # Probability > 1 means the crop is always applied.
            self.da.add_crop(1.1, min_percentage=0.5)
            self.da.add_rotation(0.8, max_left_rotation=-10.0, max_right_rotation=10.0)
            self.da.add_flip_left_right(0.5)
            print(self.da)

    def parse_csv_meta_data(self, csv_filename):
        """Read the image and target-pickle paths listed in ``csv_filename``.

        A line in the csv file should look like this:
        ./laundry_room_0001/shot_001/0001.jpg, ./laundry_room_0001/shot_001/colmap/0/0001_0.6_6000_col_reldepth.pkl

        Both paths are prefixed with '../data/'.  Blank lines are skipped.

        Returns:
            (img_names, pkl_names): two parallel lists of paths.

        Raises:
            ValueError: if a non-blank line has fewer than two
                comma-separated fields.
        """
        img_names = []
        pkl_names = []

        with open(csv_filename, 'r') as f:
            for line_no, line in enumerate(f, 1):
                if not line.strip():
                    continue
                infos = line.split(',')
                if len(infos) < 2:
                    raise ValueError("%s:%d: expected 'image, pkl' but got %r"
                                     % (csv_filename, line_no, line))
                img_names.append('../data/' + infos[0].strip())
                pkl_names.append('../data/' + infos[1].strip())

        return img_names, pkl_names

    def parse_youtube_csv(self, csv_filename):
        """Populate ``self.img_names`` / ``self.pkl_names`` from the CSV,
        using a cached '.meta' file next to it when one exists (and creating
        it otherwise)."""
        meta_filename = csv_filename.replace('.csv', '.meta')
        if not os.path.exists(meta_filename):
            print("%s does not exist. Creating..." % meta_filename)
            self.img_names, self.pkl_names = self.parse_csv_meta_data(csv_filename)
            save_obj({"img_names": self.img_names, "pkl_names": self.pkl_names},
                     meta_filename, verbal=True)
        else:
            print("Loading  %s" % meta_filename)
            temp = load_obj(meta_filename, verbal=True)
            self.img_names = temp["img_names"]
            self.pkl_names = temp["pkl_names"]

    def __getitem__(self, index):
        """Return ``(image, target, (height, width))`` for sample ``index``.

        The image is resized to (self.width, self.height), converted to
        channel-first float32 in [0, 1], and optionally passed through
        ``resNet_data_preprocess``.  The target is returned as int64 with its
        coordinate rows expressed in pixels of the resized image.
        """
        # This data reader assumes that the target coordinates are represented
        # by value in [0, 1.0], i.e., the ratio between the original coordinate
        # and the original image height / image width
        color = _read_image(self.img_names[index])
        target = load_obj(self.pkl_names[index])
        assert target.shape[0] == 5

        if self.b_oppi and target.shape[1] > 2:
            # NOTE(review): always picks the first pair here, whereas
            # YoutubeDatasetVal picks a random one -- confirm this is intended.
            target = target[:, 0:1]

        # Ratios -> pixel coordinates of the loaded image.
        img_h, img_w = color.shape[0], color.shape[1]
        for row in (0, 2):
            target[row, :] = target[row, :] * img_h     # y_A, y_B
        for row in (1, 3):
            target[row, :] = target[row, :] * img_w     # x_A, x_B
        _clamp_coords_inplace(target, img_h, img_w)

        if self.b_data_aug:
            color, target = self.da.aug(color, target)

        # Pixel coordinates of the (possibly augmented) image -> pixel
        # coordinates of the resized output.
        src_h = float(color.shape[0])
        src_w = float(color.shape[1])
        for row in (0, 2):
            target[row, :] = target[row, :] / src_h * self.height   # y_A, y_B
        for row in (1, 3):
            target[row, :] = target[row, :] / src_w * self.width    # x_A, x_B
        _clamp_coords_inplace(target, self.height, self.width)

        color = cv2.resize(color, (self.width, self.height))
        color = color.transpose(2, 0, 1).astype(np.float32) / 255.0
        if self.b_resnet_prep:
            color = resNet_data_preprocess(color)

        return color, target.astype(np.int64), (self.height, self.width)

    def __len__(self):
        return self.n_sample


class YoutubeDatasetVal(YoutubeDataset):
    """Validation variant: no augmentation, and targets are expressed in
    pixels of the original (un-resized) image."""

    def __init__(self, csv_filename,
                 height=240, width=320,
                 b_oppi=False,
                 b_resnet_prep=False):
        print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        print("\tValidation version of the YoutubeDataset")
        print("\t\t-It never perform data augmentation")
        YoutubeDataset.__init__(self, csv_filename,
                                height=height, width=width,
                                b_oppi=b_oppi,
                                b_data_aug=False,
                                b_resnet_prep=b_resnet_prep)

    def __getitem__(self, index):
        """Return ``(image, target, original_resolution)`` for ``index``.

        The image is resized and normalised exactly as in the parent class;
        the target's coordinate rows are scaled to the ORIGINAL image
        resolution, which is also returned as ``(height, width)``.
        """
        # This data reader assumes that the target coordinates are represented
        # by value in [0, 1.0], i.e., the ratio between the original coordinate
        # and the original image height / image width
        color = _read_image(self.img_names[index])
        orig_img_res = color.shape[:2]
        color = cv2.resize(color, (self.width, self.height))
        color = color.transpose(2, 0, 1).astype(np.float32) / 255.0
        if self.b_resnet_prep:
            color = resNet_data_preprocess(color)

        target = load_obj(self.pkl_names[index])
        assert target.shape[0] == 5

        if self.b_oppi and target.shape[1] > 2:
            # NOTE(review): the upper bound excludes the last column and makes
            # validation non-deterministic -- confirm both are intended.
            rand_idx = random.randint(0, target.shape[1] - 2)
            target = target[:, rand_idx:rand_idx + 1]

        orig_h, orig_w = orig_img_res[0], orig_img_res[1]
        for row in (0, 2):
            target[row, :] = target[row, :] * orig_h    # y_A, y_B
        for row in (1, 3):
            target[row, :] = target[row, :] * orig_w    # x_A, x_B
        _clamp_coords_inplace(target, orig_h, orig_w)

        return color, target.astype(np.int64), orig_img_res