from __future__ import division
import os
import time
import math
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
from .data_loader import DataLoader
from .nets import *
from .utils import *
from .flowlib import flow_to_image


class BaseLearner(object):
    def __init__(self):
        pass

    def build_train_graph(self):
        raise NotImplementedError

    def collect_summaries(self):
        raise NotImplementedError

    def train(self, opt):
        raise NotImplementedError

    # Credit: https://github.com/mrharicot/monodepth/blob/master/average_gradients.py
    def average_gradients(self, tower_grads):
        average_grads = []
        for grad_and_vars in zip(*tower_grads):
            grads = []
            for g, _ in grad_and_vars:
                if g is not None:
                    expanded_g = tf.expand_dims(g, 0)
                    grads.append(expanded_g)
            if grads:
                grad = tf.concat(axis=0, values=grads)
                grad = tf.reduce_mean(grad, 0)
                v = grad_and_vars[0][1]
                grad_and_var = (grad, v)
                average_grads.append(grad_and_var)
        return average_grads

    def get_dp_flow(self, opt, s, src_pixel_coords):
        # Turn projected pixel coordinates at scale s into a flow field
        # by subtracting the regular pixel grid
        x_base = tf.range(int(opt.img_width/(2**s)))
        y_base = tf.range(int(opt.img_height/(2**s)))
        x_base = tf.stack([x_base]*int(opt.img_height/(2**s)), axis=0)
        y_base = tf.transpose(tf.stack([y_base]*int(opt.img_width/(2**s)), axis=0))
        dp_flow_x = src_pixel_coords[:, :, :, 0] - tf.cast(x_base, tf.float32)
        dp_flow_y = src_pixel_coords[:, :, :, 1] - tf.cast(y_base, tf.float32)
        dp_flow = tf.stack([dp_flow_x, dp_flow_y], axis=-1)
        return dp_flow

    def get_in_range_mask(self, opt, s, flow):
        # 1 if the displacement is within the image
        x_min = 0.0
        x_max = int(opt.img_width/(2**s)) - 1
        y_min = 0.0
        y_max = int(opt.img_height/(2**s)) - 1
        x_base = tf.range(int(opt.img_width/(2**s)))
        y_base = tf.range(int(opt.img_height/(2**s)))
        x_base = tf.stack([x_base]*int(opt.img_height/(2**s)), axis=0)
        y_base = tf.transpose(tf.stack([y_base]*int(opt.img_width/(2**s)), axis=0))
        pos_x = flow[:, :, :, 0] + tf.cast(x_base, tf.float32)
        pos_y = flow[:, :, :, 1] + tf.cast(y_base, tf.float32)
        inside_x = tf.logical_and(pos_x <= tf.cast(x_max, tf.float32), pos_x >= x_min)
        inside_y = tf.logical_and(pos_y <= tf.cast(y_max, tf.float32), pos_y >= y_min)
        inside = tf.expand_dims(tf.logical_and(inside_x, inside_y), axis=-1)
        return tf.stop_gradient(tf.cast(inside, tf.float32))

    def get_fb_mask(self, flow, warped_flow, alpha1=0.01, alpha2=0.5):
        # Forward-backward consistency check: mask is 1 (occluded) where the
        # forward and backward flows do not cancel out
        temp1 = tf.reduce_sum(tf.square(flow + warped_flow), axis=3, keep_dims=True)
        temp2 = tf.reduce_sum(tf.square(flow), axis=3, keep_dims=True) + \
            tf.reduce_sum(tf.square(warped_flow), axis=3, keep_dims=True)
        occ_mask = tf.greater(temp1, alpha1*temp2 + alpha2)
        return tf.stop_gradient(tf.cast(occ_mask, tf.float32))

    # Credit: https://github.com/simonmeister/UnFlow/blob/master/src/e2eflow/core/losses.py
    def ternary_loss(self, im1, im2_warped, valid_mask, max_distance=1):
        patch_size = 2*max_distance + 1
        with tf.variable_scope('ternary_loss'):
            def _ternary_transform(image):
                intensities = tf.image.rgb_to_grayscale(image) * 255
                out_channels = patch_size * patch_size
                w = np.eye(out_channels).reshape((patch_size, patch_size, 1, out_channels))
                weights = tf.constant(w, dtype=tf.float32)
                patches = tf.nn.conv2d(intensities, weights, strides=[1, 1, 1, 1], padding='SAME')
                transf = patches - intensities
                transf_norm = transf / tf.sqrt(0.81 + tf.square(transf))
                return transf_norm

            def _hamming_distance(t1, t2):
                dist = tf.square(t1 - t2)
                dist_norm = dist / (0.1 + dist)
                dist_sum = tf.reduce_sum(dist_norm, 3, keep_dims=True)
                return dist_sum

            t1 = _ternary_transform(im1)
            t2 = _ternary_transform(im2_warped)
            dist = _hamming_distance(t1, t2)
            transform_mask = self.create_mask(valid_mask,
                                              [[max_distance, max_distance],
                                               [max_distance, max_distance]])
            return self.charbonnier_loss(dist, valid_mask * transform_mask), dist
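
    # A brief note on the loss above: the identity-kernel conv2d gathers each
    # pixel's (2*max_distance+1)^2 neighbourhood, and subtracting the centre
    # intensity yields a soft census/ternary signature that is robust to
    # additive brightness changes; the soft Hamming distance between the two
    # signatures is then penalised with the Charbonnier loss below. A
    # hypothetical call (shapes assumed; real call sites live in subclasses):
    #
    #   im1, im2_warped: [B, H, W, 3] float32 images
    #   valid_mask:      [B, H, W, 1], e.g. from get_in_range_mask
    #   loss, dist = self.ternary_loss(im1, im2_warped, valid_mask)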

    def charbonnier_loss(self, x, mask=None, truncate=None, alpha=0.45, beta=1.0, epsilon=0.001):
        with tf.variable_scope('charbonnier_loss'):
            batch, height, width, channels = tf.unstack(tf.shape(x))
            normalization = tf.cast(batch * height * width * channels, tf.float32)
            error = tf.pow(tf.square(x * beta) + tf.square(epsilon), alpha)
            if mask is not None:
                error = tf.multiply(mask, error)
            if truncate is not None:
                error = tf.minimum(error, truncate)
            return tf.reduce_sum(error) / normalization

    def create_mask(self, tensor, paddings):
        with tf.variable_scope('create_mask'):
            shape = tf.shape(tensor)
            inner_width = shape[1] - (paddings[0][0] + paddings[0][1])
            inner_height = shape[2] - (paddings[1][0] + paddings[1][1])
            inner = tf.ones([inner_width, inner_height])
            mask2d = tf.pad(inner, paddings)
            mask3d = tf.tile(tf.expand_dims(mask2d, 0), [shape[0], 1, 1])
            mask4d = tf.expand_dims(mask3d, 3)
            return tf.stop_gradient(mask4d)

    # Credit: https://github.com/mrharicot/monodepth/blob/master/monodepth_model.py
    def SSIM(self, x, y):
        C1 = 0.01 ** 2
        C2 = 0.03 ** 2
        mu_x = slim.avg_pool2d(x, 3, 1, 'VALID')
        mu_y = slim.avg_pool2d(y, 3, 1, 'VALID')
        sigma_x = slim.avg_pool2d(x ** 2, 3, 1, 'VALID') - mu_x ** 2
        sigma_y = slim.avg_pool2d(y ** 2, 3, 1, 'VALID') - mu_y ** 2
        sigma_xy = slim.avg_pool2d(x * y, 3, 1, 'VALID') - mu_x * mu_y
        SSIM_n = (2 * mu_x * mu_y + C1) * (2 * sigma_xy + C2)
        SSIM_d = (mu_x ** 2 + mu_y ** 2 + C1) * (sigma_x + sigma_y + C2)
        SSIM = SSIM_n / SSIM_d
        return tf.clip_by_value((1 - SSIM) / 2, 0, 1)

    def compute_edge_aware_smooth_loss(self, pred_disp, img):
        """Edge-aware L1-norm on first-order gradient"""
        def gradient(pred):
            D_dx = -pred[:, :, 1:, :] + pred[:, :, :-1, :]
            D_dy = -pred[:, 1:, :, :] + pred[:, :-1, :, :]
            return D_dx, D_dy
        img_dx, img_dy = gradient(img)
        disp_dx, disp_dy = gradient(pred_disp)
        weight_x = tf.exp(-tf.reduce_mean(tf.abs(img_dx), 3, keep_dims=True))
        weight_y = tf.exp(-tf.reduce_mean(tf.abs(img_dy), 3, keep_dims=True))
        loss = tf.reduce_mean(weight_x * tf.abs(disp_dx)) + \
            tf.reduce_mean(weight_y * tf.abs(disp_dy))
        return loss

    def compute_smooth_loss(self, pred_disp):
        """L1-norm on second-order gradient"""
        def gradient(pred):
            D_dy = pred[:, 1:, :, :] - pred[:, :-1, :, :]
            D_dx = pred[:, :, 1:, :] - pred[:, :, :-1, :]
            return D_dx, D_dy
        dx, dy = gradient(pred_disp)
        dx2, dxdy = gradient(dx)
        dydx, dy2 = gradient(dy)
        return tf.reduce_mean(tf.abs(dx2)) + \
               tf.reduce_mean(tf.abs(dxdy)) + \
               tf.reduce_mean(tf.abs(dydx)) + \
               tf.reduce_mean(tf.abs(dy2))

    def flow_to_image_tf(self, flow):
        # Colorize each flow map in the per-GPU batch for summaries
        im_stack = []
        for i in range(self.opt.batch_size // self.opt.num_gpus):
            temp = tf.py_func(flow_to_image, [flow[i, :, :, :]], tf.uint8)
            im_stack.append(temp)
        return tf.stack(im_stack, axis=0)

    # Credit: https://github.com/yzcjtr/GeoNet/blob/master/geonet_model.py
    def spatial_normalize(self, disp):
        _, curr_h, curr_w, curr_c = disp.get_shape().as_list()
        disp_mean = tf.reduce_mean(disp, axis=[1, 2, 3], keep_dims=True)
        disp_mean = tf.tile(disp_mean, [1, curr_h, curr_w, curr_c])
        return disp / disp_mean

    def build_depth_test_graph(self):
        input_uint8 = tf.placeholder(tf.uint8,
                                     [self.batch_size, self.img_height, self.img_width, 3],
                                     name='raw_input')
        input_mc = self.preprocess_image(input_uint8)
        with tf.name_scope("depth_prediction"):
            pred_disp, depth_net_endpoints = disp_net_res50(
                input_mc, is_training=False)
            pred_depth = [1./disp for disp in pred_disp]
        pred_depth = pred_depth[0]
        self.inputs = input_uint8
        self.pred_depth = pred_depth
        self.depth_epts = depth_net_endpoints
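
    # Depth-inference usage sketch (hypothetical sizes and checkpoint path;
    # setup_inference/inference below wrap this graph):
    #
    #   learner = BaseLearner()
    #   learner.setup_inference(img_height=128, img_width=416, mode='depth')
    #   saver = tf.train.Saver()
    #   with tf.Session() as sess:
    #       saver.restore(sess, '/path/to/checkpoint')  # hypothetical path
    #       pred = learner.inference(images, sess, mode='depth')['depth']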

    # Forward-backward
    def build_pose_fb_test_graph(self):
        input_uint8 = tf.placeholder(tf.uint8,
                                     [self.batch_size, self.img_height,
                                      self.img_width * self.seq_length, 3],
                                     name='raw_input')
        input_mc = self.preprocess_image(input_uint8)
        loader = DataLoader()
        tgt_image, src_image_stack = \
            loader.batch_unpack_image_sequence(
                input_mc, self.img_height, self.img_width, self.num_source)
        with tf.name_scope("pose_prediction"):
            pred_poses, _ = pose_net_fb(
                tgt_image, src_image_stack, is_training=False)
            self.inputs = input_uint8
            self.pred_poses = pred_poses[:, :, :6]  # Only the first half is used

    def preprocess_image(self, image, is_dp=True):
        # Assuming the input image is uint8
        image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        if is_dp:
            return image * 2. - 1.
        else:
            mean = [104.920005, 110.1753, 114.785955]
            out = []
            for i in range(0, int(image.shape[-1]), 3):
                r = image[:, :, :, i] - mean[0]/255.
                g = image[:, :, :, i+1] - mean[1]/255.
                b = image[:, :, :, i+2] - mean[2]/255.
                out += [r, g, b]
            return tf.stack(out, axis=-1)

    def minus_imagenet_rgb(self, image):
        mean = [122.7717, 115.9465, 102.9801]
        image = tf.cast(image, tf.float32)
        out = []
        for i in range(0, int(image.shape[-1]), 3):
            r = image[:, :, :, i] - mean[0]
            g = image[:, :, :, i+1] - mean[1]
            b = image[:, :, :, i+2] - mean[2]
            out += [r, g, b]
        return tf.stack(out, axis=-1)

    def deprocess_image(self, image, is_dp=True):
        # Assuming the input image is float32
        if is_dp:
            image = (image + 1.)/2.
        else:
            mean = [104.920005, 110.1753, 114.785955]
            r = image[:, :, :, 0] + mean[0]/255.
            g = image[:, :, :, 1] + mean[1]/255.
            b = image[:, :, :, 2] + mean[2]/255.
            image = tf.stack([r, g, b], axis=-1)
        return tf.image.convert_image_dtype(image, dtype=tf.uint8)

    def setup_inference(self, img_height, img_width, mode, seq_length=3, batch_size=1):
        self.img_height = img_height
        self.img_width = img_width
        self.mode = mode
        self.batch_size = batch_size
        if self.mode == 'depth':
            self.build_depth_test_graph()
        if self.mode == 'pose':
            self.seq_length = seq_length
            self.num_source = seq_length - 1
            self.build_pose_fb_test_graph()

    def inference(self, inputs, sess, mode='depth'):
        fetches = {}
        if mode == 'depth':
            fetches['depth'] = self.pred_depth
        if mode == 'pose':
            fetches['pose'] = self.pred_poses
        results = sess.run(fetches, feed_dict={self.inputs: inputs})
        return results

    def save(self, sess, checkpoint_dir, step):
        model_name = 'model'
        print(" [*] Saving checkpoint to %s..." % checkpoint_dir)
        if step == 'latest':
            self.saver.save(sess,
                            os.path.join(checkpoint_dir, model_name + '.latest'))
        else:
            self.saver.save(sess,
                            os.path.join(checkpoint_dir, model_name),
                            global_step=step)


if __name__ == '__main__':
    model = BaseLearner()
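    # Minimal smoke test of the loss helpers on random tensors. This is a
    # sketch: the relative imports at the top mean the module must be run in
    # its package context (python -m ...), and TF 1.x with contrib.slim is
    # assumed throughout.
    im_a = tf.random_uniform([1, 64, 64, 3])
    im_b = tf.random_uniform([1, 64, 64, 3])
    mask = tf.ones([1, 64, 64, 1])
    ternary, _ = model.ternary_loss(im_a, im_b, mask)
    smooth = model.compute_smooth_loss(im_a)
    ssim = tf.reduce_mean(model.SSIM(im_a, im_b))
    with tf.Session() as sess:
        print(sess.run([ternary, smooth, ssim]))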