from collections import deque, defaultdict
from copy import deepcopy
from random import random
import glob

import numpy as np
import keras.backend as K
from keras import optimizers
from keras.models import Model, load_model
from keras.layers import Input, Dense, Activation, Flatten, Conv2D, BatchNormalization, Add
from keras.losses import mean_squared_error

from player.player import Player
from othello import Othello
from lib.replaybuffer import ReplayBuffer


class AIPlayer(Player):

    def __init__(self, buffer_size, sim_count, train=True, model="", tau=1, compile=False):
        self.buffer = ReplayBuffer(buffer_size)
        self.temp_state = deque()
        self.train = train
        self.loss = 0
        self.acc = 0
        self.batch_count = 0
        self.sim_count = sim_count
        if model != "":
            self.load(model, compile)
        else:
            self.create_network()
        self.tau = tau

    @staticmethod
    def create_if_nonexistant(config):
        models = glob.glob(config.data.model_location + "*.h5")
        if len(models) == 0:
            ai = AIPlayer(config.buffer_size, config.game.simulation_num_per_move)
            ai.save(config.data.model_location + "model_0.h5")
            del ai

    def set_training(self, train):
        self.train = train

    @staticmethod
    def clear():
        K.clear_session()

    def load(self, file, compile=False):
        try:
            del self.network
        except AttributeError:
            pass
        self.network = load_model(
            file,
            custom_objects={
                "objective_function_for_policy": AIPlayer.objective_function_for_policy,
                "objective_function_for_value": AIPlayer.objective_function_for_value,
            },
            compile=compile)

    def save(self, file):
        self.network.save(file)

    def create_network(self):
        # Input planes: player 1 stones, player -1 stones, side-to-move.
        x_in = Input((3, 8, 8))
        x = Conv2D(filters=128, kernel_size=(3, 3), padding="same", data_format="channels_first")(x_in)
        x = BatchNormalization(axis=1)(x)
        x = Activation("relu")(x)
        for _ in range(10):
            x = self._build_residual_block(x)
        res_out = x

        # Policy head: one output per board square plus one for the pass move.
        x = Conv2D(filters=2, kernel_size=1, data_format="channels_first")(res_out)
        x = BatchNormalization(axis=1)(x)
        x = Activation("relu")(x)
        x = Flatten()(x)
        policy_out = Dense(8 * 8 + 1, activation="softmax", name="policy_out")(x)

        # Value head: scalar win prediction in [-1, 1].
        x = Conv2D(filters=1, kernel_size=1, data_format="channels_first")(res_out)
        x = BatchNormalization(axis=1)(x)
        x = Activation("relu")(x)
        x = Flatten()(x)
        x = Dense(64, activation="relu")(x)
        value_out = Dense(1, activation="tanh", name="value_out")(x)

        self.network = Model(x_in, [policy_out, value_out], name="reversi_model")
        self.compile()

    def _build_residual_block(self, x):
        in_x = x
        x = Conv2D(filters=128, kernel_size=(3, 3), padding="same", data_format="channels_first")(x)
        x = BatchNormalization(axis=1)(x)
        x = Activation("relu")(x)
        x = Conv2D(filters=128, kernel_size=(3, 3), padding="same", data_format="channels_first")(x)
        x = BatchNormalization(axis=1)(x)
        x = Add()([in_x, x])
        x = Activation("relu")(x)
        return x

    def compile(self):
        losses = [AIPlayer.objective_function_for_policy, AIPlayer.objective_function_for_value]
        self.network.compile(optimizer=optimizers.SGD(lr=1e-3, momentum=0.9), loss=losses)

    def update_lr(self, lr):
        K.set_value(self.network.optimizer.lr, lr)

    @staticmethod
    def objective_function_for_policy(y_true, y_pred):
        # Equivalent to categorical_crossentropy here, since the targets are
        # probability vectors over the 65 moves.
        return K.sum(-y_true * K.log(y_pred + K.epsilon()), axis=-1)
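    # A quick sanity check of that equivalence (hypothetical snippet, not part
    # of the training pipeline; assumes a live Keras session):
    #
    #   from keras.losses import categorical_crossentropy
    #   y_true = K.constant([[0.0, 1.0, 0.0]])
    #   y_pred = K.constant([[0.2, 0.7, 0.1]])
    #   K.eval(AIPlayer.objective_function_for_policy(y_true, y_pred))
    #   K.eval(categorical_crossentropy(y_true, y_pred))
    #   # both evaluate to ~0.357 (-log 0.7), differing only by the epsilon term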
    @staticmethod
    def objective_function_for_value(y_true, y_pred):
        return mean_squared_error(y_true, y_pred)

    def update_buffer(self, winner):
        # Label every stored (state, policy) pair of the finished game with
        # the final winner and move it into the replay buffer.
        if self.train:
            while len(self.temp_state) > 0:
                t = self.temp_state.pop()
                self.buffer.add((t[0], t[1], winner))

    def train_batches(self, batch_size, batches=-1, verbose=2):
        if batches == -1:
            # Train on the entire buffer.
            s_buffer = np.array([_[0] for _ in self.buffer.buffer])
            p_buffer = np.array([_[1] for _ in self.buffer.buffer])
            v_buffer = np.array([_[2] for _ in self.buffer.buffer])
        else:
            # Draw batch_size * batches samples, cycling through the buffer
            # if it holds fewer entries than requested.
            sample_size = batch_size * batches
            sample = []
            while sample_size > 0:
                sample += self.buffer.sample(sample_size)
                sample_size -= self.buffer.size()
            s_buffer = np.array([_[0] for _ in sample])
            p_buffer = np.array([_[1] for _ in sample])
            v_buffer = np.array([_[2] for _ in sample])
        history = self.network.fit(s_buffer, [p_buffer, v_buffer], batch_size=batch_size, epochs=1, verbose=verbose)
        return history

    def preprocess_input(self, board, side):
        # Encode the board as three 8x8 planes: player 1 stones, player -1
        # stones, and a side-to-move plane that is all ones when side == 1.
        state = np.zeros((3, 8, 8), dtype=int)
        for i in range(8):
            for j in range(8):
                if board[i, j] == 1:
                    state[0, i, j] = 1
                elif board[i, j] == -1:
                    state[1, i, j] = 1
                if side == 1:
                    state[2, i, j] = 1
        return state

    def evaluate(self, game, side):
        current_input = self.preprocess_input(game.board, side)
        pred = self.network.predict(current_input[np.newaxis, :])
        return pred[1][0]

    def pick_move(self, game, side):
        possible_moves = game.possible_moves(side)
        if len(possible_moves) == 0:
            possible_moves.append((-1, -1))  # forced pass
        monte_prob = self.monte_carlo(game, side)

        if self.train:
            # Store the raw visit distribution as the policy training target.
            self.temp_state.append((self.preprocess_input(game.board, side),
                                    np.divide(monte_prob, np.sum(monte_prob))))

        # Apply the temperature and sample a move from the result.
        monte_prob = np.float_power(monte_prob, 1 / self.tau)
        monte_prob = np.divide(monte_prob, np.sum(monte_prob))
        r = random()
        for move in possible_moves:
            r -= monte_prob[Othello.move_id(move)]
            if r <= 0:
                return move
        return possible_moves[-1]
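    # Notes on the search below: move selection uses a simplified PUCT score,
    # Q + P / (1 + N), i.e. AlphaZero's selection rule without the
    # sqrt(sum_b N_b) exploration factor, and Q is always kept from the root
    # player's perspective. The temperature step in pick_move samples from
    # pi(a) proportional to N(a)^(1/tau); e.g. visit counts (8, 2) at
    # tau = 0.5 sharpen from (0.8, 0.2) to (64, 4)/68, roughly (0.94, 0.06).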
    def monte_carlo(self, game, side):
        # Per-edge statistics, keyed by (state_id, move_id):
        # visit count N, total value W, mean value Q and prior P.
        N = defaultdict(int)
        W = defaultdict(float)
        Q = defaultdict(float)
        P = defaultdict(float)

        possible_moves = game.possible_moves(side)
        if len(possible_moves) == 0:
            # Forced pass: all probability mass on the pass move (index 64).
            policy = np.zeros(65)
            policy[64] = 1
            return policy
        elif len(possible_moves) == 1:
            # Only one legal move: no search needed.
            policy = np.zeros(65)
            policy[Othello.move_id(possible_moves[0])] = 1
            return policy

        # Initialize the root priors from the policy head, renormalized over
        # the legal moves.
        current_input = self.preprocess_input(game.board, side)
        sid = Othello.state_id(game.board)
        pred = self.network.predict(current_input[np.newaxis, :])
        policy = pred[0][0]
        total = 1e-10
        for move in possible_moves:
            total += policy[Othello.move_id(move)]
        for move in possible_moves:
            P[(sid, Othello.move_id(move))] = policy[Othello.move_id(move)] / total

        for _ in range(self.sim_count):
            clone = deepcopy(game)
            current_side = side
            visited = deque()
            while True:
                possible_moves = clone.possible_moves(current_side)
                if len(possible_moves) == 0:
                    possible_moves.append((-1, -1))

                # Select the move with the highest simplified PUCT score.
                best_move = None
                best_move_value = -2
                sid = Othello.state_id(clone.board)
                for move in possible_moves:
                    mid = Othello.move_id(move)
                    qu_val = Q[(sid, mid)] + P[(sid, mid)] / (N[(sid, mid)] + 1)
                    if qu_val > best_move_value:
                        best_move_value = qu_val
                        best_move = move

                is_leaf = N[(sid, Othello.move_id(best_move))] == 0
                visited.append((sid, Othello.move_id(best_move)))
                clone.play_move(best_move[0], best_move[1], current_side)
                current_side *= -1

                if clone.game_over():
                    # Terminal state: back up the actual game result.
                    for node in visited:
                        N[node] += 1
                        W[node] += clone.get_winner() * side
                        Q[node] = W[node] / N[node]
                    break

                if is_leaf:
                    # First visit to this edge: expand with the network and
                    # back up its value estimate instead of playing further.
                    current_input = self.preprocess_input(clone.board, current_side)
                    sid = Othello.state_id(clone.board)
                    pred = self.network.predict(current_input[np.newaxis, :])
                    policy = pred[0][0]
                    value = pred[1][0]
                    possible_moves = clone.possible_moves(current_side)
                    if len(possible_moves) == 0:
                        possible_moves.append((-1, -1))
                    total = 1e-10
                    for move in possible_moves:
                        total += policy[Othello.move_id(move)]
                    for move in possible_moves:
                        P[(sid, Othello.move_id(move))] = policy[Othello.move_id(move)] / total
                    for node in visited:
                        N[node] += 1
                        W[node] += value * side
                        Q[node] = W[node] / N[node]
                    break

        # The returned search policy is the vector of root visit counts
        # (normalization happens in pick_move).
        policy = np.zeros(65)
        possible_moves = game.possible_moves(side)
        sid = Othello.state_id(game.board)
        for move in possible_moves:
            mid = Othello.move_id(move)
            policy[mid] = N[(sid, mid)]
        return policy
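
# A minimal self-play sketch (an assumption for illustration, not part of the
# original module): it presumes Othello() starts from the initial position and
# that play_move accepts (-1, -1) as a pass, as the search code above does.
if __name__ == "__main__":
    ai = AIPlayer(buffer_size=4096, sim_count=25, train=True)
    game = Othello()
    side = 1
    while not game.game_over():
        move = ai.pick_move(game, side)
        game.play_move(move[0], move[1], side)
        side *= -1
    # Label the stored (state, policy) pairs with the final result and fit
    # one epoch on a few sampled batches.
    ai.update_buffer(game.get_winner())
    ai.train_batches(batch_size=32, batches=4)
    ai.save("model_selfplay.h5")  # hypothetical output path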