# Gridworld environment based on mdp.py
# Gridworld provides a basic environment for RL agents to interact with
#
# ---
# @author Yiren Lu
# @email luyiren [at] seas [dot] upenn [dot] edu
#
# MIT License

import mdp
import env
import numpy as np


class GridWorld(mdp.MDP, env.Env):
  """
    Grid world environment
  """

  def __init__(self, grid, terminals, trans_prob=1):
    """
    input:
      grid        2-d list of the grid including the reward
      terminals   a set of all the terminal states
      trans_prob  transition probability when given a certain action
    """
    self.height = len(grid)
    self.width = len(grid[0])
    self.terminals = terminals
    self.grid = grid
    self.neighbors = [(0, 1), (0, -1), (1, 0), (-1, 0), (0, 0)]
    self.actions = [0, 1, 2, 3, 4]
    # right, left, down, up, stay
    self.dirs = {0: 'r', 1: 'l', 2: 'd', 3: 'u', 4: 's'}
    # If the MDP is deterministic, the transition probability of taking a
    # certain action is 1; otherwise it is < 1 and the remaining probability
    # is spread equally over the other neighboring states.
    self.trans_prob = trans_prob

  def show_grid(self):
    for row in self.grid:
      print(row)

  def get_grid(self):
    return self.grid

  def get_states(self):
    """
    returns
      a list of all states
    """
    return [(i, j) for i in range(self.height) for j in range(self.width)
            if self.grid[i][j] != 'x']

  def get_actions(self, state):
    """
    get all the actions that can be taken on the current state
    returns
      a list of actions
    """
    if self.grid[state[0]][state[1]] == 'x':
      return [4]
    actions = []
    for i in range(len(self.actions) - 1):
      inc = self.neighbors[i]
      a = self.actions[i]
      nei_s = (state[0] + inc[0], state[1] + inc[1])
      if 0 <= nei_s[0] < self.height and 0 <= nei_s[1] < self.width \
          and self.grid[nei_s[0]][nei_s[1]] != 'x':
        actions.append(a)
    return actions

  def __get_action_states(self, state):
    """
    get all the actions that can be taken on the current state
    returns
      a list of (action, state) pairs
    """
    a_s = []
    for i in range(len(self.actions)):
      inc = self.neighbors[i]
      a = self.actions[i]
      nei_s = (state[0] + inc[0], state[1] + inc[1])
      if 0 <= nei_s[0] < self.height and 0 <= nei_s[1] < self.width \
          and self.grid[nei_s[0]][nei_s[1]] != 'x':
        a_s.append((a, nei_s))
    return a_s

  def get_reward_sas(self, state, action, state1):
    """
    args
      state     current state
      action    action
      state1    next state
    returns
      the reward on the current state
    """
    if self.grid[state[0]][state[1]] != 'x':
      return float(self.grid[state[0]][state[1]])
    else:
      return 0

  def get_reward(self, state):
    """
    returns
      the reward on the current state
    """
    if self.grid[state[0]][state[1]] != 'x':
      return float(self.grid[state[0]][state[1]])
    else:
      return 0

  def get_transition_states_and_probs(self, state, action):
    """
    get all the possible transition states and their probabilities with [action] on [state]
    args
      state     (y, x)
      action    int
    returns
      a list of (state, probability) pairs
    """
    if self.trans_prob == 1:
      inc = self.neighbors[action]
      nei_s = (state[0] + inc[0], state[1] + inc[1])
      if 0 <= nei_s[0] < self.height and 0 <= nei_s[1] < self.width \
          and self.grid[nei_s[0]][nei_s[1]] != 'x':
        return [(nei_s, 1)]
      else:
        # if the next state is invalid, stay in the current state
        return [(state, 1)]
    else:
      action_states = self.__get_action_states(state)
      inc = self.neighbors[action]
      nei_s = (state[0] + inc[0], state[1] + inc[1])
      res = []
      if 0 <= nei_s[0] < self.height and 0 <= nei_s[1] < self.width \
          and self.grid[nei_s[0]][nei_s[1]] != 'x':
        # the intended move gets trans_prob; the remainder is spread
        # equally over the other valid moves
        for a, s in action_states:
          if a == action:
            res.append((s, self.trans_prob))
          else:
            res.append((s, (1 - self.trans_prob) / (len(action_states) - 1)))
      else:
        # if the action is not valid, return a uniform distribution over
        # the valid moves
        for a, s in action_states:
          res.append((s, 1.0 / len(action_states)))
      return res
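
  # A quick sanity check of the helpers above, on a hypothetical 2x2 grid
  # (illustrative only, not from the original test suite; 'x' is an obstacle):
  #   gw = GridWorld([['0', '1'], ['0', 'x']], {(0, 1)}, trans_prob=0.8)
  #   gw.get_states()         # -> [(0, 0), (0, 1), (1, 0)]
  #   gw.get_actions((0, 0))  # -> [0, 2], i.e. right and down
  #   gw.get_transition_states_and_probs((0, 0), 0)
  #   # -> [((0, 1), 0.8), ((1, 0), 0.1), ((0, 0), 0.1)]
  #   # the intended move keeps trans_prob; the leftover 0.2 is split
  #   # equally between the other valid moves (down and stay)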

  def is_terminal(self, state):
    """
    returns
      True if the [state] is terminal
    """
    return state in self.terminals

  ##############################################
  # Stateful Functions For Model-Free Learning #
  ##############################################

  def reset(self, start_pos):
    """
    Reset the gridworld for model-free learning.
    It assumes there is only one agent in the gridworld.
    args
      start_pos   (i, j) pair of the start location
    """
    self._cur_state = start_pos

  def get_current_state(self):
    return self._cur_state

  def step(self, action):
    """
    Step function for the agent to interact with the gridworld
    args
      action        action taken by the agent
    returns
      current_state current state
      action        input action
      next_state    next state
      reward        reward on the current state (the state being left)
      is_done       True/False - whether the episode terminates on the next state
    """
    if self.is_terminal(self._cur_state):
      self._is_done = True
      return (self._cur_state, action, self._cur_state,
              self.get_reward(self._cur_state), True)

    st_prob = self.get_transition_states_and_probs(self._cur_state, action)
    sampled_idx = np.random.choice(np.arange(len(st_prob)),
                                   p=[prob for st, prob in st_prob])
    last_state = self._cur_state
    next_state = st_prob[sampled_idx][0]
    reward = self.get_reward(last_state)
    self._cur_state = next_state
    return last_state, action, next_state, reward, False

  ###########################################
  # Policy Evaluation for Model-free Agents #
  ###########################################

  def get_optimal_policy(self, agent):
    states = self.get_states()
    policy = {}
    for s in states:
      policy[s] = [(agent.get_optimal_action(s), 1)]
    return policy

  def get_values(self, agent):
    states = self.get_states()
    values = {}
    for s in states:
      values[s] = agent.get_value(s)
    return values

  def get_qvalues(self, agent):
    states = self.get_states()
    q_values = {}
    for s in states:
      for a in self.get_actions(s):
        q_values[(s, a)] = agent.get_qvalue(s, a)
    return q_values
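
  # Sketch of a random rollout with the stateful model-free API above
  # (illustrative only; assumes `gw` was built as in the earlier example):
  #   gw.reset((0, 0))
  #   while not gw.is_terminal(gw.get_current_state()):
  #     s = gw.get_current_state()
  #     s, a, s1, r, done = gw.step(np.random.choice(gw.get_actions(s)))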

  ###############
  # For Display #
  ###############

  def display_qvalue_grid(self, qvalues):
    print("==Display q-value grid==")
    # cells not covered by get_states() (obstacles) display as '-'
    qvalues_grid = np.full((len(self.grid), len(self.grid[0])), '-',
                           dtype=object)
    for s in self.get_states():
      tmp_str = ""
      for a in self.get_actions(s):
        tmp_str = tmp_str + self.dirs[a]
        tmp_str = tmp_str + ' {:.2f} '.format(qvalues[(s, a)])
      qvalues_grid[s[0]][s[1]] = tmp_str
    row_format = '{:>40}' * len(self.grid[0])
    for row in qvalues_grid:
      print(row_format.format(*row))

  def display_value_grid(self, values):
    """
    Prints a nice table of the values in the grid
    """
    print("==Display value grid==")
    value_grid = np.zeros((len(self.grid), len(self.grid[0])))
    for k in values:
      value_grid[k[0]][k[1]] = float(values[k])
    row_format = '{:>20.4}' * len(self.grid[0])
    for row in value_grid:
      print(row_format.format(*row))

  def display_policy_grid(self, policy):
    """
    Prints a nice table of the policy in the grid
    input:
      policy    a dictionary of the optimal policy {state: action_dist}
    """
    print("==Display policy grid==")
    # obstacles and terminals display as '-'
    policy_grid = np.full((len(self.grid), len(self.grid[0])), '-',
                          dtype=object)
    for k in self.get_states():
      if not self.is_terminal((k[0], k[1])):
        policy_grid[k[0]][k[1]] = self.dirs[policy[(k[0], k[1])][0][0]]
    row_format = '{:>20}' * len(self.grid[0])
    for row in policy_grid:
      print(row_format.format(*row))
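

if __name__ == '__main__':
  # Minimal usage sketch (this demo grid and start position are illustrative,
  # not part of the original module): numeric strings are per-cell rewards,
  # 'x' is an obstacle, and (2, 2) is the single terminal state.
  demo_grid = [['0', '0', '0'],
               ['0', 'x', '0'],
               ['0', '0', '1']]
  gw = GridWorld(demo_grid, {(2, 2)}, trans_prob=0.8)
  gw.show_grid()
  # Random rollout with the stateful model-free API.
  gw.reset((0, 0))
  while not gw.is_terminal(gw.get_current_state()):
    s = gw.get_current_state()
    s, a, s1, r, done = gw.step(np.random.choice(gw.get_actions(s)))
    print((s, a, s1, r, done))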