from agent.hyperparameters import QNetworkHyperparameters

import numpy as np
import tensorflow as tf
import math
import logging
from collections import namedtuple

TFGraphForwardPassBundle = namedtuple('TFGraphForwardPassBundle',
                                      ['input_state', 'output_all_actions_q_values',
                                       'variable_scope_name_prefix'])
TFGraphTrainBundle = namedtuple('TFGraphTrainBundle',
                                ['input_states', 'output_all_actions_q_values', 'action_indexes',
                                 'target_action_q_values', 'learning_rate', 'loss', 'optimizer',
                                 'variable_scope_name_prefix'])
QNetworkTrainBundle = namedtuple('QNetworkTrainBundle',
                                 ['state', 'action_index', 'target_action_q_value'])


class QNetworkFactory(object):

    def create(self, screen_width, screen_height, num_channels, num_actions, metrics_directory,
               batched_forward_pass_size):
        return QNetwork(screen_width, screen_height, num_channels, num_actions, metrics_directory,
                        batched_forward_pass_size)


class QNetwork(object):

    MODEL_NAME_TRAIN = 'model-train'
    MODEL_NAME_FORWARD_PASS = 'model-forward-pass'

    def __init__(self, screen_width, screen_height, num_channels, num_actions, metrics_directory,
                 batched_forward_pass_size, hyperparameters=QNetworkHyperparameters()):
        self.logger = logging.getLogger(__name__)
        self.screen_width = screen_width
        self.screen_height = screen_height
        self.num_channels = num_channels
        self.num_actions = num_actions
        self.batched_forward_pass_size = batched_forward_pass_size
        self.hyperparameters = hyperparameters

        self.tf_graph = tf.Graph()
        self.tf_graph_forward_pass_bundle_single = self._build_graph_forward_pass_bundle(self.tf_graph, 1)
        self.tf_graph_forward_pass_bundle_batched = self._build_graph_forward_pass_bundle(
            self.tf_graph, batched_forward_pass_size)
        self.tf_graph_train_bundle = self._build_graph_train_bundle(self.tf_graph)

        self.tf_session = tf.Session(graph=self.tf_graph)
        with self.tf_graph.as_default():
            self.tf_all_summaries = tf.merge_all_summaries()
            self.tf_summary_writer = tf.train.SummaryWriter(logdir=metrics_directory, graph=self.tf_graph)
            self.tf_saver = tf.train.Saver()
            tf.initialize_all_variables().run(session=self.tf_session)

        self.assigns_train_to_forward_pass_variables = self._build_assigns_train_to_forward_pass_variables()

    def _build_graph_forward_pass_bundle(self, graph, batch_size):
        with graph.as_default():
            input_state = tf.placeholder(tf.float32,
                                         shape=(batch_size, self.screen_height, self.screen_width,
                                                self.num_channels),
                                         name='input_state')
            variable_scope_name_prefix = "{0}-{1}-scope".format(self.MODEL_NAME_FORWARD_PASS, batch_size)
            output_all_actions_q_values = self._network_model(
                variable_scope_name_prefix=variable_scope_name_prefix,
                input=input_state,
                output_size=self.num_actions,
                record_metrics=False)
            return TFGraphForwardPassBundle(input_state=input_state,
                                            output_all_actions_q_values=output_all_actions_q_values,
                                            variable_scope_name_prefix=variable_scope_name_prefix)

    def _build_graph_train_bundle(self, graph):
        with graph.as_default():
            input_states = tf.placeholder(tf.float32,
                                          shape=(self.hyperparameters.SGD_BATCH_SIZE, self.screen_height,
                                                 self.screen_width, self.num_channels),
                                          name='input_states')
            variable_scope_name_prefix = self.MODEL_NAME_TRAIN
            output_all_actions_q_values = self._network_model(
                variable_scope_name_prefix=variable_scope_name_prefix,
                input=input_states,
                output_size=self.num_actions,
                record_metrics=True)

            action_indexes = tf.placeholder(tf.float32,
                                            shape=(self.hyperparameters.SGD_BATCH_SIZE, self.num_actions),
                                            name='action_indexes')
            # Mask the Q-values with the one-hot action vectors so that only the
            # Q-value of the action actually taken contributes to the loss.
            output_filtered_action_q_values = tf.reduce_sum(tf.mul(output_all_actions_q_values, action_indexes),
                                                            reduction_indices=1)

            # Note the trailing comma: a bare int in parentheses is not a valid shape.
            target_action_q_values = tf.placeholder(tf.float32,
                                                    shape=(self.hyperparameters.SGD_BATCH_SIZE,),
                                                    name='target_action_q_values')
            delta = target_action_q_values - output_filtered_action_q_values
            loss = tf.reduce_mean(tf.square(delta))

            learning_rate = tf.Variable(self.hyperparameters.LEARNING_RATE_INITIAL, trainable=False)
            optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                  decay=self.hyperparameters.RMS_DECAY,
                                                  momentum=self.hyperparameters.RMS_MOMENTUM,
                                                  epsilon=self.hyperparameters.RMS_EPSILON).minimize(loss)

            tf.scalar_summary('loss', loss)
            tf.scalar_summary('learning_rate', learning_rate)

            return TFGraphTrainBundle(input_states=input_states,
                                      output_all_actions_q_values=output_all_actions_q_values,
                                      action_indexes=action_indexes,
                                      target_action_q_values=target_action_q_values,
                                      learning_rate=learning_rate,
                                      loss=loss,
                                      optimizer=optimizer,
                                      variable_scope_name_prefix=variable_scope_name_prefix)
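
    # The target_action_q_values placeholder above expects Bellman targets computed
    # by the caller. A minimal sketch of how a caller might derive them, assuming a
    # standard DQN update with discount gamma (gamma and the replay variables below
    # are illustrative, not part of this module):
    #
    #   next_q_values = q_network.forward_pass_batched(next_states)
    #   targets = [r if terminal else r + gamma * np.max(q)
    #              for r, terminal, q in zip(rewards, terminals, next_q_values)]
    #   bundles = [q_network.create_train_bundle(s, a, t)
    #              for s, a, t in zip(states, actions, targets)]
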
    def _network_model(self, variable_scope_name_prefix, input, output_size, record_metrics):
        conv1 = self._convolutional_layer(input=input, patch_size=8, stride=4,
                                          input_channels=self.num_channels, output_channels=32,
                                          bias_init_value=0.0,
                                          scope_name=variable_scope_name_prefix + '_conv1')
        conv2 = self._convolutional_layer(input=conv1, patch_size=4, stride=2,
                                          input_channels=32, output_channels=64,
                                          bias_init_value=0.1,
                                          scope_name=variable_scope_name_prefix + '_conv2')
        conv3 = self._convolutional_layer(input=conv2, patch_size=3, stride=1,
                                          input_channels=64, output_channels=64,
                                          bias_init_value=0.1,
                                          scope_name=variable_scope_name_prefix + '_conv3')

        flattened_conv3 = tf.reshape(conv3, [input.get_shape()[0].value, -1])
        flattened_conv3_size = flattened_conv3.get_shape()[1].value

        relu4 = self._relu_layer(input=flattened_conv3, input_size=flattened_conv3_size, output_size=512,
                                 scope_name=variable_scope_name_prefix + '_relu4')
        local5 = self._linear_layer(input=relu4, input_size=512, output_size=output_size,
                                    scope_name=variable_scope_name_prefix + '_local5')

        if record_metrics:
            self._activation_summary(conv1)
            self._activation_summary(conv2)
            self._activation_summary(conv3)
            self._activation_summary(relu4)
            self._activation_summary(local5)

        return local5

    def _convolutional_layer(self, input, patch_size, stride, input_channels, output_channels,
                             bias_init_value, scope_name):
        with tf.variable_scope(scope_name) as scope:
            weights = tf.get_variable(name='weights',
                                      shape=[patch_size, patch_size, input_channels, output_channels],
                                      initializer=tf.contrib.layers.xavier_initializer_conv2d())
            biases = tf.Variable(name='biases',
                                 initial_value=tf.constant(value=bias_init_value, shape=[output_channels]))
            conv = tf.nn.conv2d(input, weights, [1, stride, stride, 1], padding='SAME')
            linear_rectification_bias = tf.nn.bias_add(conv, biases)
            output = tf.nn.relu(linear_rectification_bias, name=scope.name)

            grid_x = output_channels // 4
            grid_y = 4 * input_channels
            kernels_image_grid = self._create_kernels_image_grid(weights, (grid_x, grid_y))
            tf.image_summary(scope_name + '/features', kernels_image_grid, max_images=1)

            if '_conv1' in scope_name:
                x_min = tf.reduce_min(weights)
                x_max = tf.reduce_max(weights)
                weights_0_to_1 = (weights - x_min) / (x_max - x_min)
                weights_0_to_255_uint8 = tf.image.convert_image_dtype(weights_0_to_1, dtype=tf.uint8)
                # To tf.image_summary format [batch_size, height, width, channels].
                weights_transposed = tf.transpose(weights_0_to_255_uint8, [3, 0, 1, 2])
                # Use a distinct tag here: reusing scope_name + '/features' would
                # collide with the kernel-grid summary emitted above.
                tf.image_summary(scope_name + '/filters', weights_transposed[:, :, :, 0:1], max_images=32)

            return output
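
    # Layer-size arithmetic for the model above, assuming the classic DQN input of
    # 84x84 screens (width and height are constructor parameters, so other sizes
    # work too). With 'SAME' padding each conv layer outputs ceil(input / stride):
    #   conv1: ceil(84 / 4) -> 21 x 21 x 32
    #   conv2: ceil(21 / 2) -> 11 x 11 x 64
    #   conv3: ceil(11 / 1) -> 11 x 11 x 64
    # so flattened_conv3_size = 11 * 11 * 64 = 7744 units feeding the 512-unit relu4.
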
    def _relu_layer(self, input, input_size, output_size, scope_name):
        with tf.variable_scope(scope_name) as scope:
            weights = tf.get_variable(name='weights',
                                      shape=[input_size, output_size],
                                      initializer=tf.contrib.layers.xavier_initializer())
            biases = tf.Variable(name='biases', initial_value=tf.constant(value=0.1, shape=[output_size]))
            output = tf.nn.relu(tf.matmul(input, weights) + biases, name=scope.name)
            return output

    def _linear_layer(self, input, input_size, output_size, scope_name):
        with tf.variable_scope(scope_name):
            weights = tf.Variable(name='weights',
                                  initial_value=tf.truncated_normal(shape=[input_size, output_size],
                                                                    stddev=0.1))
            biases = tf.Variable(name='biases', initial_value=tf.constant(value=0.1, shape=[output_size]))
            output = tf.matmul(input, weights) + biases
            return output

    def _activation_summary(self, tensor):
        tensor_name = tensor.op.name
        tf.histogram_summary(tensor_name + '/activations', tensor)
        tf.scalar_summary(tensor_name + '/sparsity', tf.nn.zero_fraction(tensor))

    def _create_kernels_image_grid(self, kernel, (grid_X, grid_Y), pad=1):
        '''Visualizes conv. features as one image (mostly for the 1st layer).

        Places the kernels on a grid, with some padding between adjacent filters.

        Args:
            kernel: tensor of shape [Y, X, NumChannels, NumKernels]
            (grid_X, grid_Y): shape of the grid; requires NumKernels == grid_X * grid_Y.
                The caller is responsible for splitting NumKernels into the two factors.
            pad: number of black pixels around each filter (between them)

        Returns:
            Tensor of shape [1, (Y + pad) * grid_Y, (X + pad) * grid_X, 1], scaled to
            [0, 1], in tf.image_summary order [batch_size, height, width, channels].
        '''
        flattened_kernel = tf.reshape(kernel, tf.pack([kernel.get_shape()[0],
                                                       kernel.get_shape()[1],
                                                       1,
                                                       kernel.get_shape()[3] * kernel.get_shape()[2]]))

        # X and Y dimensions, w.r.t. padding
        Y = flattened_kernel.get_shape()[0] + pad
        X = flattened_kernel.get_shape()[1] + pad

        # pad X and Y
        x1 = tf.pad(flattened_kernel, tf.constant([[pad, 0], [pad, 0], [0, 0], [0, 0]]))

        # put NumKernels to the 1st dimension
        x2 = tf.transpose(x1, (3, 0, 1, 2))
        # organize grid on Y axis
        x3 = tf.reshape(x2, tf.pack([grid_X, Y * grid_Y, X, 1]))

        # switch X and Y axes
        x4 = tf.transpose(x3, (0, 2, 1, 3))
        # organize grid on X axis
        x5 = tf.reshape(x4, tf.pack([1, X * grid_X, Y * grid_Y, 1]))

        # back to normal order (not combining with the next step for clarity)
        x6 = tf.transpose(x5, (2, 1, 3, 0))

        # to tf.image_summary order [batch_size, height, width, channels],
        # where in this case batch_size == 1
        x7 = tf.transpose(x6, (3, 0, 1, 2))

        # scale to [0, 1]
        x_min = tf.reduce_min(x7)
        x_max = tf.reduce_max(x7)
        x8 = (x7 - x_min) / (x_max - x_min)

        return x8

    def create_train_bundle(self, state, action_index, target_action_q_value):
        return QNetworkTrainBundle(state=state, action_index=action_index,
                                   target_action_q_value=target_action_q_value)
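
    # Sketch of a typical call sequence for train() below; the factory arguments and
    # the replay_samples variable are illustrative, not part of this module:
    #
    #   network = QNetworkFactory().create(84, 84, 4, num_actions, '/tmp/metrics', 32)
    #   for step, replay_sample in enumerate(replay_samples):
    #       bundles = [network.create_train_bundle(s, a, t) for s, a, t in replay_sample]
    #       network.train(bundles, step)
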
    def train(self, train_bundles, training_step):
        train_bundles_len = len(train_bundles)
        if train_bundles_len < self.hyperparameters.SGD_BATCH_SIZE:
            self.logger.info('Training bundle is smaller than batch size, skipping train')
            return

        offset = self.hyperparameters.SGD_BATCH_SIZE % train_bundles_len
        batch_train_bundles = self._circular_selection_of_batch(offset, train_bundles, train_bundles_len)

        batch_input_states = [train_bundle.state for train_bundle in batch_train_bundles]
        # One-hot encode each action index so the train graph can mask the Q-values.
        batch_action_indexes = [np.eye(self.num_actions)[train_bundle.action_index]
                                for train_bundle in batch_train_bundles]
        batch_target_action_q_values = [train_bundle.target_action_q_value
                                        for train_bundle in batch_train_bundles]

        feed_dict = {
            self.tf_graph_train_bundle.input_states: np.asarray(batch_input_states),
            self.tf_graph_train_bundle.action_indexes: np.asarray(batch_action_indexes),
            self.tf_graph_train_bundle.target_action_q_values: np.asarray(batch_target_action_q_values)
        }

        with self.tf_session.as_default():
            run_result = self.tf_session.run(
                [self.tf_graph_train_bundle.loss, self.tf_graph_train_bundle.optimizer],
                feed_dict=feed_dict)
            evaluated_loss = run_result[0]
            self.logger.info('Loss: %f' % evaluated_loss)

            if training_step % self.hyperparameters.NUM_STEPS_ASSIGN_TRAIN_TO_FORWARD_PROP_GRAPH == 0:
                self.tf_session.run(self.assigns_train_to_forward_pass_variables)
                self.logger.info('Assigning trained variables to forward pass graph')

            if (training_step + 1) % self.hyperparameters.LEARNING_RATE_DECAY_STEP == 0:
                current_learning_rate = self.tf_session.run([self.tf_graph_train_bundle.learning_rate])[0]
                # Exponential decay factor chosen so the rate reaches LEARNING_RATE_FINAL
                # after LEARNING_RATE_FINAL_AT_STEP steps when decayed every
                # LEARNING_RATE_DECAY_STEP steps.
                learning_rate_decay = math.pow(
                    float(self.hyperparameters.LEARNING_RATE_FINAL) /
                    float(self.hyperparameters.LEARNING_RATE_INITIAL),
                    1.0 / (float(self.hyperparameters.LEARNING_RATE_FINAL_AT_STEP) /
                           float(self.hyperparameters.LEARNING_RATE_DECAY_STEP)))
                next_learning_rate = current_learning_rate * learning_rate_decay
                # tf.assign adds a new op to the graph on every call; tolerable here
                # because the decay branch runs rarely.
                self.tf_session.run(tf.assign(self.tf_graph_train_bundle.learning_rate, next_learning_rate))

            if training_step % self.hyperparameters.METRICS_SAVE_STEP == 0:
                evaluated_all_summaries = self.tf_session.run([self.tf_all_summaries], feed_dict=feed_dict)[0]
                self.tf_summary_writer.add_summary(evaluated_all_summaries, training_step)

            return evaluated_loss

    def _circular_selection_of_batch(self, offset, train_bundles, train_bundles_len):
        # Wraps around the end of the list: e.g. with list length 8, batch size 4 and
        # offset 6, this selects indexes [6, 7] followed by [0, 1].
        selection_end_of_list = train_bundles[offset:min(train_bundles_len,
                                                         offset + self.hyperparameters.SGD_BATCH_SIZE)]
        selection_beginning_of_list = train_bundles[0:max(0, (offset + self.hyperparameters.SGD_BATCH_SIZE) -
                                                          train_bundles_len)]
        return selection_end_of_list + selection_beginning_of_list

    def forward_pass_single(self, input_state):
        return self._forward_pass([input_state], self.tf_graph_forward_pass_bundle_single)

    def forward_pass_batched(self, input_states):
        return self._forward_pass(input_states, self.tf_graph_forward_pass_bundle_batched)

    def _forward_pass(self, input_states, forward_pass_graph_bundle):
        feed_dict = {forward_pass_graph_bundle.input_state:
                     np.asarray(self._replace_non_existing_states_with_zeroed_states(input_states))}
        with self.tf_session.as_default():
            return self.tf_session.run(
                [forward_pass_graph_bundle.output_all_actions_q_values],
                feed_dict=feed_dict)[0]

    def _replace_non_existing_states_with_zeroed_states(self, states):
        result = [None] * len(states)
        for idx, state in enumerate(states):
            if state is None:
                result[idx] = np.zeros((self.screen_height, self.screen_width, self.num_channels))
            else:
                result[idx] = state
        return result

    def _build_assigns_train_to_forward_pass_variables(self):
        assigns = []
        with self.tf_graph.as_default():
            for variable in tf.all_variables():
                self._assign_forward_pass_variable_to_train_variable(
                    forward_pass_prefix=self.tf_graph_forward_pass_bundle_single.variable_scope_name_prefix,
                    variable=variable,
                    assigns=assigns)
                self._assign_forward_pass_variable_to_train_variable(
                    forward_pass_prefix=self.tf_graph_forward_pass_bundle_batched.variable_scope_name_prefix,
                    variable=variable,
                    assigns=assigns)
        return assigns
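
    # The forward-pass graphs act as a periodically synced copy of the train graph,
    # i.e. the standard DQN separation between the learning network and the network
    # used to act: gradients only ever touch the 'model-train' variables, and every
    # NUM_STEPS_ASSIGN_TRAIN_TO_FORWARD_PROP_GRAPH steps train() runs the assign ops
    # built above to copy them into the forward-pass scopes.
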
    def _assign_forward_pass_variable_to_train_variable(self, forward_pass_prefix, variable, assigns):
        if variable.name.startswith(forward_pass_prefix):
            forward_pass_variable = variable
            train_variable_name = forward_pass_variable.name.replace(
                forward_pass_prefix, self.tf_graph_train_bundle.variable_scope_name_prefix)
            train_variable = [v for v in tf.all_variables() if train_variable_name in v.name][0]
            assigns.append(forward_pass_variable.assign(train_variable))
            self.logger.debug('{target} will be assigned the value of {source} when the sync ops run'.format(
                target=forward_pass_variable.name, source=train_variable.name))

    def save(self, path):
        with self.tf_session.as_default():
            save_path = self.tf_saver.save(self.tf_session, path)
            self.logger.info('Q Network saved in file: %s' % save_path)

    def restore(self, path):
        with self.tf_session.as_default():
            self.tf_saver.restore(self.tf_session, path)
            self.logger.info('Q Network restored from file: %s' % path)
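
# A minimal smoke-test sketch, assuming the agent.hyperparameters module is importable
# and the TF 0.x API used above is available; the screen size, channel count and
# action count below are illustrative.
if __name__ == '__main__':
    import tempfile

    logging.basicConfig(level=logging.INFO)
    network = QNetworkFactory().create(screen_width=84, screen_height=84, num_channels=4,
                                       num_actions=6, metrics_directory=tempfile.mkdtemp(),
                                       batched_forward_pass_size=32)
    # A single forward pass on a zeroed screen yields one row of Q-values, one per action.
    q_values = network.forward_pass_single(np.zeros((84, 84, 4)))
    print('Q-values for a zeroed state: %s' % q_values)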