### Reference https://github.com/rlcode/reinforcement-learning
import sys
import os

import gym
import numpy as np
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

EPISODES = 1000

os.environ['CUDA_VISIBLE_DEVICES'] = sys.argv[1]


# A2C (Advantage Actor-Critic) agent for CartPole
class A2CAgent:
    def __init__(self, state_size, action_size):
        # if you want to watch CartPole while it learns, change this to True
        self.render = False
        self.load_model = False
        # sizes of the state, action, and value outputs
        self.state_size = state_size
        self.action_size = action_size
        self.value_size = 1

        # hyperparameters for the policy gradient
        self.discount_factor = 0.99
        self.actor_lr = 0.001
        self.critic_lr = 0.005

        # create the actor (policy) and critic (value) networks
        self.actor = self.build_actor()
        self.critic = self.build_critic()

        if self.load_model:
            self.actor.load_weights("./save_model/cartpole_actor.h5")
            self.critic.load_weights("./save_model/cartpole_critic.h5")

    # approximate policy and value with neural networks
    # actor: state is the input, a probability for each action is the output
    def build_actor(self):
        actor = Sequential()
        actor.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        actor.add(Dense(self.action_size, activation='softmax',
                        kernel_initializer='he_uniform'))
        actor.summary()
        # See note regarding crossentropy in cartpole_reinforce.py
        actor.compile(loss='categorical_crossentropy',
                      optimizer=Adam(lr=self.actor_lr))
        return actor

    # critic: state is the input, the estimated state value is the output
    def build_critic(self):
        critic = Sequential()
        critic.add(Dense(24, input_dim=self.state_size, activation='relu',
                         kernel_initializer='he_uniform'))
        critic.add(Dense(self.value_size, activation='linear',
                         kernel_initializer='he_uniform'))
        critic.summary()
        critic.compile(loss="mse", optimizer=Adam(lr=self.critic_lr))
        return critic

    # pick an action stochastically from the actor's output distribution
    def get_action(self, state):
        policy = self.actor.predict(state, batch_size=1).flatten()
        return np.random.choice(self.action_size, 1, p=policy)[0]

    # update the actor and critic at every step using the one-step TD error
    def train_model(self, state, action, reward, next_state, done):
        target = np.zeros((1, self.value_size))
        advantages = np.zeros((1, self.action_size))

        value = self.critic.predict(state)[0]
        next_value = self.critic.predict(next_state)[0]

        # advantage = TD error for the taken action; target = TD target for the critic
        if done:
            advantages[0][action] = reward - value
            target[0][0] = reward
        else:
            advantages[0][action] = reward + self.discount_factor * next_value - value
            target[0][0] = reward + self.discount_factor * next_value

        self.actor.fit(state, advantages, epochs=1, verbose=0)
        self.critic.fit(state, target, epochs=1, verbose=0)


if __name__ == "__main__":
    # In CartPole-v1, the maximum episode length is 500 steps
    env = gym.make('CartPole-v1')
    # get the sizes of the state and action spaces from the environment
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    # make the A2C agent
    agent = A2CAgent(state_size, action_size)

    scores, episodes = [], []
    learning_history = []

    print(state_size)
    print(action_size)
    print("start training")

    for e in range(EPISODES):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if agent.render:
                env.render()

            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            # if an action ends the episode early, give a penalty of -100
            reward = reward if not done or score == 499 else -100

            agent.train_model(state, action, reward, next_state, done)
            score += reward
            state = next_state

            if done:
                # undo the -100 penalty so the logged score reflects the actual play time
                score = score if score == 500.0 else score + 100
                scores.append(score)
                episodes.append(e)
                print("episode:", e, " score:", score)
                learning_history.append((e, score))

                # stop training once the mean score of the last 10 episodes exceeds 490
                if np.mean(scores[-min(10, len(scores)):]) > 490:
                    sys.exit()

        # save the model (and the learning history so far) every 50 episodes
        if e % 50 == 0:
            agent.actor.save_weights("./save_model/cartpole_actor.h5")
            agent.critic.save_weights("./save_model/cartpole_critic.h5")
            np.save("a2c_learning_history.npy", learning_history)
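
# --- Usage sketch (not part of the original script) --------------------------
# A minimal, commented-out example of how the artifacts written above could be
# used afterwards. It assumes the script has already run in this directory, so
# that "a2c_learning_history.npy" and "./save_model/cartpole_actor.h5" exist;
# np.save stores the list of (episode, score) tuples as a float array of
# shape (N, 2). The evaluation rollout below is a hypothetical greedy run,
# not something the original training loop performs.
#
#     history = np.load("a2c_learning_history.npy")
#     logged_episodes = history[:, 0].astype(int)
#     logged_scores = history[:, 1]
#     print("episodes logged:", len(logged_episodes))
#     print("best score:", logged_scores.max())
#     print("mean of last 10 scores:", logged_scores[-10:].mean())
#
#     # greedy evaluation rollout with the most recently saved actor weights
#     eval_env = gym.make('CartPole-v1')
#     eval_agent = A2CAgent(state_size, action_size)
#     eval_agent.actor.load_weights("./save_model/cartpole_actor.h5")
#     obs = np.reshape(eval_env.reset(), [1, state_size])
#     total, finished = 0, False
#     while not finished:
#         probs = eval_agent.actor.predict(obs, batch_size=1).flatten()
#         obs, r, finished, _ = eval_env.step(int(np.argmax(probs)))
#         obs = np.reshape(obs, [1, state_size])
#         total += r
#     print("evaluation score:", total)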