# -*- coding: utf-8 -*- import cPickle as pickle import numpy as np from scipy.sparse import dok_matrix from mdptoolbox import mdp ACTIONS = 9 STATES = 3**ACTIONS PLAYER = 1 OPPONENT = 2 WINS = ([1, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1, 1], [1, 0, 0, 1, 0, 0, 1, 0, 0], [0, 1, 0, 0, 1, 0, 0, 1, 0], [0, 0, 1, 0, 0, 1, 0, 0, 1], [1, 0, 0, 0, 1, 0, 0, 0, 1], [0, 0, 1, 0, 1, 0, 1, 0, 0]) # The valid number of cells belonging to either the player or the opponent: # (player, opponent) OWNED_CELLS = ((0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (0, 1), (1, 2), (2, 3), (3, 4)) def convertIndexToTuple(state): """""" return(tuple(int(x) for x in np.base_repr(state, 3, 9)[-9::])) def convertTupleToIndex(state): """""" return(int("".join(str(x) for x in state), 3)) def getLegalActions(state): """""" return(tuple(x for x in range(ACTIONS) if state[x] == 0)) def getTransitionAndRewardArrays(): """""" P = [dok_matrix((STATES, STATES)) for a in range(ACTIONS)] #R = spdok((STATES, ACTIONS)) R = np.zeros((STATES, ACTIONS)) # Naive approach, iterate through all possible combinations for a in range(ACTIONS): for s in range(STATES): state = convertIndexToTuple(s) if not isValid(state): # There are no defined moves from an invalid state, so # transition probabilities cannot be calculated. However, # P must be a square stochastic matrix, so assign a # probability of one to the invalid state transitioning # back to itself. P[a][s, s] = 1 # Reward is 0 else: s1, p, r = getTransitionProbabilities(state, a) P[a][s, s1] = p R[s, a] = r P[a] = P[a].tocsr() #R = R.tolil() return(P, R) def getTransitionProbabilities(state, action): """ Parameters ---------- state : tuple The state action : int The action Returns ------- s1, p, r : tuple of two lists and an int s1 are the next states, p are the probabilities, and r is the reward """ #assert isValid(state) assert 0 <= action < ACTIONS if not isLegal(state, action): # If the action is illegal, then transition back to the same state but # incur a high negative reward s1 = [convertTupleToIndex(state)] return(s1, [1], -10) # Update the state with the action state = list(state) state[action] = PLAYER if isWon(state, PLAYER): # If the player's action is a winning move then transition to the # winning state and receive a reward of 1. s1 = [convertTupleToIndex(state)] return(s1, [1], 1) elif isDraw(state): s1 = [convertTupleToIndex(state)] return(s1, [1], 0) # Now we search through the opponents moves, and calculate transition # probabilities based on maximising the opponents chance of winning.. s1 = [] p = [] legal_a = getLegalActions(state) for a in legal_a: state[a] = OPPONENT # If the opponent is going to win, we assume that the winning move will # be chosen: if isWon(state, OPPONENT): s1 = [convertTupleToIndex(state)] return(s1, [1], -1) elif isDraw(state): s1 = [convertTupleToIndex(state)] return(s1, [1], 0) # Otherwise we assume the opponent will select a move with uniform # probability across potential moves: s1.append(convertTupleToIndex(state)) p.append(1.0 / len(legal_a)) state[a] = 0 # During non-terminal play states the reward is 0. return(s1, p, 0) def getReward(state, action): """""" if not isLegal(state, action): return -100 state = list(state) state[action] = PLAYER if isWon(state, PLAYER): return 1 elif isWon(state, OPPONENT): return -1 else: return 0 def isDraw(state): """""" try: state.index(0) return False except ValueError: return True def isLegal(state, action): """""" if state[action] == 0: return True else: return False def isWon(state, who): """Test if a tic-tac-toe game has been won. Assumes that the board is in a legal state. Will test if the value 1 is in any winning combination. """ for w in WINS: S = sum(1 if (w[k] == 1 and state[k] == who) else 0 for k in range(ACTIONS)) if S == 3: # We have a win return True # There were no wins so return False return False def isValid(state): """""" # S1 is the sum of the player's cells S1 = sum(1 if x == PLAYER else 0 for x in state) # S2 is the sum of the opponent's cells S2 = sum(1 if x == OPPONENT else 0 for x in state) if (S1, S2) in OWNED_CELLS: return True else: return False if __name__ == "__main__": P, R = getTransitionAndRewardArrays() ttt = mdp.ValueIteration(P, R, 1) ttt.setVerbose() ttt.run() f = "tictactoe.pkl" pickle.dump(ttt.policy, open(f, "wb")) print("Optimal policy pickled as '%s' in current directory." % f)