import numpy as np
import pandas as pd
import tensorflow as tf
import scipy
import threading
import traceback
import urllib
from multiprocessing.dummy import Pool as ThreadPool
from multiprocessing import Process
import multiprocessing as mp
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

# Optional post-training quantization of a saved model:
# converter = tf.contrib.lite.TocoConverter.from_saved_model(saved_model_dir)
# converter.post_training_quantize = True
# tflite_quantized_model = converter.convert()
# open("quantized_model.tflite", "wb").write(tflite_quantized_model)


# Deep Q Network, off-policy
class DeepQNetwork:
    def __init__(
            self,
            n_actions,
            n_features,
            # learning_rate=0.001,
            reward_decay=0.99,
            e_greedy=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=20,  # 32
            e_greedy_increment=None,
            output_graph=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        # self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
        self.save_file = './weights/model.ckpt'
        self.lo = threading.Lock()
        # self.lo = mp.Lock()

        # total learning steps
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

        # the graph consists of [target_net, evaluate_net]
        self._build_net()
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        # GPU settings
        config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
        config.gpu_options.per_process_gpu_memory_fraction = 0.4
        self.sess = tf.Session(config=config)

        if output_graph:
            # $ tensorboard --logdir=logs
            # tf.train.SummaryWriter will soon be deprecated; use the following instead
            tf.summary.FileWriter("logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        self.cost_his = []
        self.step_set = []
    def full_batch_norm(self, x, n_out, phase_train=tf.constant(False, dtype=tf.bool), scope='bn'):
        """
        Batch normalization on fully connected activations.
        Args:
            x:           Tensor, 2D [batch, n_out] input maps
            n_out:       integer, depth of input maps
            phase_train: boolean tf.Variable, true indicates training phase
            scope:       string, variable scope
        Return:
            normed:      batch-normalized maps
        """
        with tf.variable_scope(scope):
            beta = tf.Variable(tf.constant(0.0, shape=[n_out]), name='beta', trainable=True)
            gamma = tf.Variable(tf.constant(1.0, shape=[n_out]), name='gamma', trainable=True)
            batch_mean, batch_var = tf.nn.moments(x, [0], name='moments')
            ema = tf.train.ExponentialMovingAverage(decay=0.5)

            def mean_var_with_update():
                ema_apply_op = ema.apply([batch_mean, batch_var])
                with tf.control_dependencies([ema_apply_op]):
                    return tf.identity(batch_mean), tf.identity(batch_var)

            mean, var = tf.cond(phase_train,
                                mean_var_with_update,
                                lambda: (ema.average(batch_mean), ema.average(batch_var)))
            normed = tf.nn.batch_normalization(x, mean, var, beta, gamma, 1e-3)
        return normed

    def test_set(self):
        # switch to a fully greedy policy for evaluation
        self.epsilon = 1
        self.epsilon_increment = 0

    def _build_net(self):
        # ------------------ build evaluate_net ------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
        self.lr = tf.placeholder(tf.float32, name='learning_rate')
        with tf.variable_scope('eval_net'):
            # c_names (collections_names) are the collections used to store variables
            c_names, n_l1, n_l2, n_l3, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 200, 100, 40, \
                tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers

            # first layer. collections is used later when assigning to the target net
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(self.full_batch_norm(tf.matmul(self.s, w1) + b1, n_l1))

            # second layer
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, n_l2], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, n_l2], initializer=b_initializer, collections=c_names)
                l2 = tf.nn.relu(self.full_batch_norm(tf.matmul(l1, w2) + b2, n_l2))

            # third layer
            with tf.variable_scope('l3'):
                w3 = tf.get_variable('w3', [n_l2, n_l3], initializer=w_initializer, collections=c_names)
                b3 = tf.get_variable('b3', [1, n_l3], initializer=b_initializer, collections=c_names)
                l3 = tf.nn.relu(self.full_batch_norm(tf.matmul(l2, w3) + b3, n_l3))

            # output layer
            with tf.variable_scope('l4'):
                w4 = tf.get_variable('w4', [n_l3, self.n_actions], initializer=w_initializer, collections=c_names)
                b4 = tf.get_variable('b4', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_eval = tf.matmul(l3, w4) + b4

        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
        with tf.variable_scope('train'):
            self._train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
        # ------------------ build target_net ------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input
        with tf.variable_scope('target_net'):
            # c_names (collections_names) are the collections used to store variables
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]

            # first layer. collections is used later when assigning to the target net
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(self.full_batch_norm(tf.matmul(self.s_, w1) + b1, n_l1))

            # second layer
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, n_l2], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, n_l2], initializer=b_initializer, collections=c_names)
                l2 = tf.nn.relu(self.full_batch_norm(tf.matmul(l1, w2) + b2, n_l2))

            # third layer
            with tf.variable_scope('l3'):
                w3 = tf.get_variable('w3', [n_l2, n_l3], initializer=w_initializer, collections=c_names)
                b3 = tf.get_variable('b3', [1, n_l3], initializer=b_initializer, collections=c_names)
                l3 = tf.nn.relu(self.full_batch_norm(tf.matmul(l2, w3) + b3, n_l3))

            # output layer
            with tf.variable_scope('l4'):
                w4 = tf.get_variable('w4', [n_l3, self.n_actions], initializer=w_initializer, collections=c_names)
                b4 = tf.get_variable('b4', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_next = tf.matmul(l3, w4) + b4

    def store_transition(self, s, a, r, s_):
        self.lo.acquire()
        s = s.reshape(-1)
        s_ = s_.reshape(-1)
        transition = np.hstack((s, [a, r], s_))
        # transition = np.column_stack((s, [a, r], s_))
        # transition = np.concatenate((s, [a, r], s_), axis=1)
        # transition = scipy.sparse.hstack([s, [a, r], s_]).toarray()
        # replace the oldest memory with the new transition
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1
        self.lo.release()
        # print(index)

    def choose_action(self, observation):
        # add a batch dimension before feeding into the tf placeholder
        observation = observation[np.newaxis, :]
        if np.random.uniform() < self.epsilon:
            # forward feed the observation and get the q value for every action
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def learn(self, learning_rate):
        self.lo.acquire()
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0

        # check whether to replace the target parameters
        # print(self.learn_step_counter % self.replace_target_iter)
        if self.learn_step_counter % self.replace_target_iter == 0:
            try:
                self.sess.run(self.replace_target_op)
                print('\ntarget_params_replaced\n')
            except Exception:
                print('Error on replacing the target')
                traceback.print_exc()
        # print('here')

        # nothing stored yet: avoid sampling from an empty memory
        if self.memory_counter == 0:
            self.lo.release()
            return

        # sample a batch from memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                self.s_: batch_memory[:, -self.n_features:],  # fixed params
                self.s: batch_memory[:, :self.n_features],    # newest params
            })

        # change q_target w.r.t. q_eval's action
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)
        reward = batch_memory[:, self.n_features + 1]
        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
        """
        For example, with 2 samples and 3 actions in this batch:
            q_eval = [[1, 2, 3],
                      [4, 5, 6]]
            q_target = q_eval.copy() = [[1, 2, 3],
                                        [4, 5, 6]]
        Then overwrite q_target with the real target value w.r.t. the action
        taken in each sample. Suppose:
            sample 0 took action 0 and its computed target value is -1;
            sample 1 took action 2 and its computed target value is -2:
            q_target = [[-1, 2, 3],
                        [ 4, 5, -2]]
        So (q_target - q_eval) becomes:
            [[(-1)-(1), 0, 0],
             [0, 0, (-2)-(6)]]
        We then backpropagate this error w.r.t. the corresponding action to the
        network, leaving every other action with error = 0 because we did not
        choose it.
        """

        # train the eval network
        _, self.cost = self.sess.run([self._train_op, self.loss],
                                     feed_dict={self.s: batch_memory[:, :self.n_features],
                                                self.q_target: q_target,
                                                self.lr: learning_rate})
        self.cost_his.append(self.cost)

        # anneal epsilon towards epsilon_max
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1
        self.lo.release()

    def plot_cost(self):
        print('plot')
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.savefig('cost.png')
        plt.show()

    def store(self):
        saver = tf.train.Saver()
        saver.save(self.sess, self.save_file)

    def restore(self):
        saver = tf.train.Saver()
        saver.restore(self.sess, self.save_file)
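
# ---------------------------------------------------------------------------
# Standalone NumPy illustration of the q_target construction described in the
# learn() docstring above: only the entry for the action actually taken is
# replaced by the Bellman target r + gamma * max(q_next), so the TD error is
# zero for every other action. All numbers here are made up for demonstration.
# ---------------------------------------------------------------------------
def _demo_q_target():
    gamma = 0.99
    q_eval = np.array([[1., 2., 3.],
                       [4., 5., 6.]])
    q_next = np.array([[0., -1., 0.],   # hypothetical target-net outputs
                       [0., 0., -2.]])
    actions = np.array([0, 2])          # action taken in each sample
    rewards = np.array([-1., 0.])       # reward received in each sample

    q_target = q_eval.copy()
    batch_index = np.arange(len(actions))
    q_target[batch_index, actions] = rewards + gamma * np.max(q_next, axis=1)

    # (q_target - q_eval) is non-zero only at the chosen actions
    print(q_target - q_eval)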
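
# ---------------------------------------------------------------------------
# Minimal usage sketch (an assumption, not part of the original class): it
# shows the intended call order, choose_action -> store_transition -> learn,
# against a hypothetical environment exposing reset() and
# step(action) -> (s_, reward, done). n_actions, n_features, and the learning
# rate below are illustrative placeholders.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    RL = DeepQNetwork(n_actions=4, n_features=8, e_greedy_increment=0.001)
    # env = SomeEnv()  # hypothetical environment
    # for episode in range(100):
    #     s = env.reset()
    #     done = False
    #     while not done:
    #         a = RL.choose_action(s)
    #         s_, r, done = env.step(a)
    #         RL.store_transition(s, a, r, s_)
    #         if RL.memory_counter > RL.batch_size:
    #             RL.learn(learning_rate=0.001)  # learning rate is fed per call
    #         s = s_
    # RL.store()      # checkpoint to ./weights/model.ckpt
    # RL.plot_cost()  # save the training-cost curve to cost.png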