python source code of classifier

# -*- coding: UTF-8 -*-
"""
This module implements the core ML algorithm of gender classification.
"""
from copy import copy
import os
import pickle
from time import time

import numpy as np
import tensorflow as tf
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.contrib.tensorboard.plugins import projector
from tensorflow.python.layers.core import dropout
from tensorflow.contrib.rnn import LSTMCell
from tensorflow.python.client import timeline

from ._batch import BatchGenerator
from .constant import NEGATIVE_CLASS, NEUTRAL_CLASS, POSITIVE_CLASS, CLASS2DEFAULT_CUTOFF
from ._encoder import CharEncoder
from .util import get_logger, set_log_path as _set_log_path

_TRAIN_PROFILE_FILE = 'profile_train.json'
_VALID_PROFILE_FILE = 'profile_valid.json'
_EMBEDDING_METADATA_FILE = 'metadata.tsv'

_LOGGER = get_logger(__name__)

__author__ = 'kensk8er'


def set_log_path(log_path):
    """Set the log path of the logger in classifier module."""
    _set_log_path(_LOGGER, log_path)


class CharLSTM(object):
    """Character-based language modeling using LSTM."""
    _padding_id = 0  # TODO: 0 is used for actual character as well, which is a bit confusing...
    _checkpoint_file_name = 'model.ckpt'
    _instance_file_name = 'instance.pkl'
    _tensorboard_dir = 'tensorboard.log'

    def __init__(self, embedding_size=32, char_rnn_size=128, word_rnn_size=128, learning_rate=0.001,
                 embedding_dropout=0., char_rnn_dropout=0., word_rnn_dropout=0.):
        # hyper-parameters
        self._embedding_size = embedding_size
        self._char_rnn_size = char_rnn_size
        self._word_rnn_size = word_rnn_size
        self._learning_rate = learning_rate
        self._embedding_dropout = embedding_dropout
        self._char_rnn_dropout = char_rnn_dropout
        self._word_rnn_dropout = word_rnn_dropout

        # other instance variables
        self._nodes = None
        self._graph = None
        self._vocab_size = None
        self._encoder = CharEncoder()
        self._num_params = None
        self._session = None

    def train(self, names_train, y_train, names_valid, y_valid, model_path, batch_size=128,
              patience=1024000, stat_interval=1000, valid_interval=1000, summary_interval=1000,
              valid_batch_size=2048, profile=False):
        """Train a gender classifier on the name/gender pairs."""
        start_time = time()

        def add_metric_summaries(mode, iteration, name2metric):
            """Add summary for metric."""
            metric_summary = tf.Summary()
            for name, metric in name2metric.items():
                metric_summary.value.add(tag='{}_{}'.format(mode, name), simple_value=metric)
            summary_writer.add_summary(metric_summary, global_step=iteration)

        def show_train_stats(epoch, iteration, losses, y_cat, y_cat_pred):
            # compute mean statistics
            loss = np.mean(losses)
            accuracy = accuracy_score(y_cat, y_cat_pred)
            score = accuracy - loss

            _LOGGER.info('Epoch={}, Iter={:,}, Mean Training Loss={:.4f}, Accuracy={:.4f}, '
                         'Accuracy - Loss={:.4f}'.format(epoch, iteration, loss, accuracy, score))
            add_metric_summaries('train', iteration, {'cross_entropy': loss, 'accuracy': accuracy,
                                                      'accuracy - loss': score})
            _LOGGER.info('\n{}'.format(classification_report(y_cat, y_cat_pred, digits=3)))
            return list(), list(), list()

        def validate(epoch, iteration, X, y, best_score, patience):
            """Validate the model on validation set."""
            batch_generator = BatchGenerator(X, y, batch_size=valid_batch_size, valid=True)
            losses, y_cat, y_cat_pred = list(), list(), list()
            for X_batch, y_batch in batch_generator:
                X_batch, word_lens, char_lens = self._add_padding(X_batch)
                loss, y_pred = session.run(
                    [nodes['loss'], nodes['y_pred']],
                    feed_dict={nodes['X']: X_batch, nodes['y']: y_batch,
                               nodes['word_lens']: word_lens, nodes['char_lens']: char_lens,
                               nodes['is_train']: False},
                    options=run_options, run_metadata=run_metadata)
                losses.append(loss)
                y_cat.extend(self._categorize_y(y_batch))
                y_cat_pred.extend(self._categorize_y(y_pred))

            # compute mean statistics
            loss = np.mean(losses)
            accuracy = accuracy_score(y_cat, y_cat_pred)
            score = accuracy - loss

            _LOGGER.info('Epoch={}, Iter={:,}, Validation Loss={:.4f}, Accuracy={:.4f}, '
                         'Accuracy - Loss={:.4f}'.format(epoch, iteration, loss, accuracy, score))
            add_metric_summaries('valid', iteration, {'cross_entropy': loss, 'accuracy': accuracy,
                                                      'accuracy - loss': score})
            _LOGGER.info('\n{}'.format(classification_report(y_cat, y_cat_pred, digits=3)))

            if score > best_score:
                _LOGGER.info('Best score (Accuracy - Loss) so far, save the model.')
                self._save(model_path, session)
                best_score = score

                if iteration * 2 > patience:
                    patience = iteration * 2
                    _LOGGER.info('Increased patience to {:,}'.format(patience))

            if run_metadata:
                with open(_VALID_PROFILE_FILE, 'w') as file_:
                    file_.write(
                        timeline.Timeline(run_metadata.step_stats).generate_chrome_trace_format())

            return best_score, patience

        _LOGGER.info('Prepare inputs and other variables for the model...')
        self._fit_encoder(names_train + names_valid)
        X_train = self._encode_chars(names_train)
        X_valid = self._encode_chars(names_valid)
        train_size = len(X_train)
        train_batch_generator = BatchGenerator(X_train, y_train, batch_size)
        best_valid_score = np.float64('-inf')
        losses = list()
        y_cat = list()
        y_cat_pred = list()
        iteration = 0

        # profiler
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) if profile else None
        run_metadata = tf.RunMetadata() if profile else None

        _LOGGER.info('Building the tensorflow graph...')
        self._build_graph()
        nodes = self._nodes
        session = tf.Session(graph=self._graph)
        summary_writer = tf.summary.FileWriter(
            os.path.join(model_path, self._tensorboard_dir), session.graph)
        self._visualize_embedding(model_path, summary_writer)
        session.run(nodes['init'])
        _LOGGER.info('Start fitting a model...')

        # iterate over batches
        for batch_id, (X_batch, y_batch) in enumerate(train_batch_generator):
            epoch = 1 + iteration // train_size

            if batch_id % summary_interval == 0:
                summaries = session.run(nodes['summaries'])
                summary_writer.add_summary(summaries, global_step=iteration)

            X_batch, word_lens, char_lens = self._add_padding(X_batch)

            # Predict labels and update the parameters
            _, loss, y_pred = session.run(
                [nodes['optimizer'], nodes['loss'], nodes['y_pred']],
                feed_dict={nodes['X']: X_batch, nodes['y']: y_batch, nodes['word_lens']: word_lens,
                           nodes['char_lens']: char_lens, nodes['is_train']: True},
                options=run_options, run_metadata=run_metadata)

            losses.append(loss)
            y_cat.extend(self._categorize_y(y_batch))
            y_cat_pred.extend(self._categorize_y(y_pred))
            iteration += batch_size

            if run_metadata:
                with open(_TRAIN_PROFILE_FILE, 'w') as file_:
                    file_.write(
                        timeline.Timeline(run_metadata.step_stats).generate_chrome_trace_format())

            if batch_id % stat_interval == 0:
                losses, y_cat, y_cat_pred = show_train_stats(
                    epoch, iteration, losses, y_cat, y_cat_pred)

            if batch_id % valid_interval == 0:
                best_valid_score, patience = validate(
                    epoch, iteration, X_valid, y_valid, best_valid_score, patience)

            if iteration > patience:
                _LOGGER.info('Iteration is more than patience, finish training.')
                break

        _LOGGER.info('Finished fitting the model.')
        _LOGGER.info('Best Validation Score (Accuracy - Cross-entropy Loss): {:.4f}'
                     .format(best_valid_score))

        # close the session
        session.close()

        end_time = time()
        _LOGGER.info('Took {:,} seconds to train the model.'.format(int(end_time - start_time)))
        return best_valid_score

    @classmethod
    def load(cls, model_path):
        """
        Load the model from the saved model directory.

        :param model_path: path to the model directory you want to load the model from.
        :return: instance of the model
        """
        _LOGGER.debug('Started loading the model...')
        # load the instance, set _model_path appropriately
        with open(os.path.join(model_path, cls._instance_file_name), 'rb') as model_file:
            instance = pickle.load(model_file)

        # build the graph and restore the session
        instance._build_graph()
        instance._session = tf.Session(graph=instance._graph)
        instance._session.run(instance._nodes['init'])
        instance._nodes['saver'].restore(
            instance._session, os.path.join(model_path, instance._checkpoint_file_name))

        _LOGGER.debug('Finished loading the model.')
        return instance

    def predict(self, names: list, return_proba=True, return_attention=False,
                low_cutoff=CLASS2DEFAULT_CUTOFF[NEGATIVE_CLASS],
                high_cutoff=CLASS2DEFAULT_CUTOFF[POSITIVE_CLASS]):
        """
        Predict the genders of given names.

        :param names: list of names
        :param return_proba: output probability if set as True
        :param return_attention: if True, return attentions (weights for each time step)
        """
        nodes = self._nodes
        X = self._encode_chars(names)
        X, word_lens, char_lens = self._add_padding(X)
        y_pred, attentions = self._session.run(
            [nodes['y_pred'], nodes['attentions']],
            feed_dict={nodes['X']: X, nodes['word_lens']: word_lens, nodes['char_lens']: char_lens,
                       nodes['is_train']: False})

        # np.ndarray isn't returned when len(X) == 1
        if not isinstance(y_pred, np.ndarray):
            y_pred = [y_pred]

        if return_proba:
            return_value = [{POSITIVE_CLASS: float(proba), NEGATIVE_CLASS: float(1 - proba)}
                            for proba in y_pred]
        else:
            return_value = self._categorize_y(y_pred, low_cutoff, high_cutoff)

        if return_attention:
            return return_value, attentions.tolist()
        else:
            return return_value

    def _save(self, model_path, session):
        """Save the tensorflow session and the instance object of this Python class."""
        if not os.path.exists(model_path):
            os.makedirs(model_path)

        # save the session
        self._nodes['saver'].save(session, os.path.join(model_path, self._checkpoint_file_name))

        # save the instance
        instance = copy(self)
        instance._graph = None  # _graph is not picklable
        instance._nodes = None  # _nodes is not pciklable
        instance._session = None  # _session is not pciklable
        with open(os.path.join(model_path, self._instance_file_name), 'wb') as pickle_file:
            pickle.dump(instance, pickle_file)

    def _build_graph(self):
        """Build computational graph."""

        def get_num_params():
            """Count the number of trainable parameters."""
            num_params = 0
            for variable in tf.trainable_variables():
                shape = variable.get_shape()
                var_num_params = 1
                for dimension in shape:
                    var_num_params *= dimension.value
                num_params += var_num_params
            return num_params

        _LOGGER.debug('Building a computational graph...')
        graph = tf.Graph()
        nodes = dict()

        with graph.as_default():
            with tf.name_scope('inputs'):
                # inputs
                nodes['X'] = tf.placeholder(tf.int32, [None, None, None], name='X')
                nodes['y'] = tf.placeholder(tf.float32, [None], name='y')
                nodes['word_lens'] = tf.placeholder(tf.int32, [None], name='word_lens')
                nodes['char_lens'] = tf.placeholder(tf.int32, [None], name='char_lens')
                nodes['is_train'] = tf.placeholder(tf.bool, shape=[], name='is_train')

                # get the shape of the input
                X_shape = tf.shape(nodes['X'])
                batch_size = X_shape[0]
                max_word_len = X_shape[1]
                max_char_len = X_shape[2]

            with tf.name_scope('embedding_layer'):
                nodes['embeddings'] = tf.Variable(
                    tf.random_uniform([self._vocab_size, self._embedding_size], -1.0, 1.0),
                    trainable=True, name='embeddings')
                embedded = tf.nn.embedding_lookup(nodes['embeddings'], nodes['X'])
                embedded = dropout(
                    embedded, rate=self._embedding_dropout, training=nodes['is_train'])

            with tf.name_scope('char_rnn_layer') as scope:
                # reshape the embedded matrix in order to pass it to dynamic_rnn
                embedded = tf.reshape(
                    embedded, [batch_size * max_word_len, max_char_len, self._embedding_size])

                char_rnn_fw_cell = LSTMCell(num_units=self._char_rnn_size)
                char_rnn_bw_cell = LSTMCell(num_units=self._char_rnn_size)
                (char_output_fw, char_output_bw), states = tf.nn.bidirectional_dynamic_rnn(
                    char_rnn_fw_cell, char_rnn_bw_cell, embedded, dtype=tf.float32,
                    sequence_length=nodes['char_lens'], scope='{}bidirectional_rnn'.format(scope))

                char_rnn_outputs = tf.concat([char_output_fw, char_output_bw], axis=2)

                with tf.name_scope('char_pooling_layer'):
                    char_rnn_outputs = self._mean_pool(
                        char_rnn_outputs, batch_size, max_char_len, max_word_len,
                        nodes['char_lens'])

                char_rnn_outputs = dropout(
                    char_rnn_outputs, rate=self._char_rnn_dropout, training=nodes['is_train'])

            with tf.name_scope('word_rnn_layer') as scope:
                word_rnn_fw_cell = LSTMCell(num_units=self._word_rnn_size)
                word_rnn_bw_cell = LSTMCell(num_units=self._word_rnn_size)
                (char_output_fw, char_output_bw), states = tf.nn.bidirectional_dynamic_rnn(
                    word_rnn_fw_cell, word_rnn_bw_cell, char_rnn_outputs, dtype=tf.float32,
                    sequence_length=nodes['word_lens'], scope='{}bidirectional_rnn'.format(scope))
                word_rnn_outputs = tf.concat([char_output_fw, char_output_bw], axis=2)

                with tf.name_scope('word_pooling_layer'):
                    word_rnn_outputs, nodes['attentions'] = self._attention_pool(word_rnn_outputs)

                word_rnn_outputs = dropout(
                    word_rnn_outputs, rate=self._word_rnn_dropout, training=nodes['is_train'])

            with tf.variable_scope('softmax_layer'):
                nodes['W_s'] = tf.Variable(
                    tf.random_normal([self._word_rnn_size * 2, 1]), name='weight')
                nodes['b_s'] = tf.Variable(tf.random_normal([1]), name='bias')
                logits = tf.squeeze(tf.matmul(word_rnn_outputs, nodes['W_s']) + nodes['b_s'])
                nodes['y_pred'] = tf.nn.sigmoid(logits)

            with tf.variable_scope('optimizer'):
                nodes['loss'] = tf.reduce_mean(
                    tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=nodes['y']))
                nodes['optimizer'] = tf.train.AdamOptimizer(self._learning_rate).minimize(
                    nodes['loss'])

            # initialize the variables
            nodes['init'] = tf.global_variables_initializer()

            # count the number of parameters
            self._num_params = get_num_params()
            _LOGGER.debug('Total number of parameters = {:,}'.format(self._num_params))

            # generate summaries
            for variable in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
                # having ":" in the name is illegal, so replace to "/"
                tf.summary.histogram(variable.name.replace(':', '/'), variable)
            nodes['summaries'] = tf.summary.merge_all()

            # save the model to checkpoint
            nodes['saver'] = tf.train.Saver()

        self._graph = graph
        self._nodes = nodes

    def _add_padding(self, X):
        """
        Add padding to X in order to align the sequence lengths.

        :param X: list (each name) of list (each word) of character IDs
        :return: padded list of list of character IDs & list of word length before padding & list of
            character length before padding
        """

        def get_max(X):
            """Compute the maximum word length and maximum character length."""
            max_word_len, max_char_len = 0, 0
            for name in X:
                if max_word_len < len(name):
                    max_word_len = len(name)
                for word in name:
                    if max_char_len < len(word):
                        max_char_len = len(word)
            return max_word_len, max_char_len

        max_word_len, max_char_len = get_max(X)
        word_lens = list()
        char_lens = list()

        for name in X:
            word_lens.append(len(name))
            word_pad_len = max_word_len - len(name)
            name.extend([[] for _ in range(word_pad_len)])

            for word in name:
                char_lens.append(len(word))
                char_pad_len = max_char_len - len(word)
                word.extend([self._padding_id for _ in range(char_pad_len)])

        return X, word_lens, char_lens

    def _encode_chars(self, names, fit=False):
        """Encode list of names into list (each name) of list (each word) of character IDs."""
        if fit:
            name_id2word_id2char_ids = self._encoder.fit_encode(names)
            self._vocab_size = self._encoder.vocab_size
        else:
            name_id2word_id2char_ids = self._encoder.encode(names)
        return name_id2word_id2char_ids

    def _fit_encoder(self, names):
        """Fit the encoder to the given list of names."""
        self._encoder.fit(names)
        self._vocab_size = self._encoder.vocab_size

    def _decode_chars(self, name_id2word_id2char_ids):
        """Decode list (each name) of list (each word) of encoded character IDs into characters."""
        return self._encoder.decode(name_id2word_id2char_ids)

    @staticmethod
    def _categorize_y(y, low_cutoff=CLASS2DEFAULT_CUTOFF[NEGATIVE_CLASS],
                      high_cutoff=CLASS2DEFAULT_CUTOFF[POSITIVE_CLASS]):
        """Categorize a list of continuous values y into a list of male/neutral/female labels."""

        def categorize_label(label):
            if label < low_cutoff:
                return NEGATIVE_CLASS
            elif label <= high_cutoff:
                return NEUTRAL_CLASS
            else:
                return POSITIVE_CLASS

        return [categorize_label(label) for label in y]

    def _mean_pool(self, rnn_outputs, batch_size, max_char_len, max_word_len, char_lens):
        """
        Perform mean-pooling after the character-RNN layer.

        :param rnn_outputs: hidden states of all the time steps after the character-RNN layer
        :return: mean of the hidden states over every time step
        """
        # perform mean pooling over characters
        rnn_outputs = tf.reduce_mean(rnn_outputs, reduction_indices=1)

        # In order to avoid 0 padding affect the mean, multiply by `n / m` where `n` is
        # `max_char_len` and `m` is `char_lens`
        rnn_outputs = tf.multiply(rnn_outputs, tf.cast(max_char_len, tf.float32))  # multiply by `n`

        # swap the dimensions in order to divide by an appropriate value for each time step
        rnn_outputs = tf.transpose(rnn_outputs)

        rnn_outputs = tf.divide(rnn_outputs, tf.cast(char_lens, tf.float32))  # divide by `m`
        rnn_outputs = tf.transpose(rnn_outputs)  # shape back to the original shape

        # batch and word-len dimensions were merged before running character-RNN so shape it back
        rnn_outputs = tf.reshape(rnn_outputs, [batch_size, max_word_len, self._char_rnn_size * 2])

        # there are NaN due to padded words (with char_len=0) so convert those NaN to 0
        rnn_outputs = tf.where(tf.is_nan(rnn_outputs), tf.zeros_like(rnn_outputs), rnn_outputs)

        return rnn_outputs

    def _attention_pool(self, rnn_outputs):
        """
        Perform attention-pooling. Train an attention layer to soft search on hidden states to use
        and return weighted sum of the hidden states.

        :param rnn_outputs: hidden states of all the time steps after the word-RNN layer
        :return: weighted sum of the hidden states and attention weights for each time step
        """
        W = tf.Variable(tf.random_normal([2 * self._word_rnn_size]), name='weight_attention')
        b = tf.Variable(tf.random_normal([1]), name='bias_attention')

        # shape: batch_size * word_len
        attentions = tf.reduce_sum(tf.multiply(W, rnn_outputs), reduction_indices=2) + b
        attentions = tf.nn.softmax(attentions)  # convert to probability

        # swap the dimensions in order to multiply by attentions to each word (the 2nd dimension)
        rnn_outputs = tf.transpose(rnn_outputs, perm=[0, 2, 1])

        # expand the dimension in order to multiply outputs by attentions
        attentions = tf.expand_dims(attentions, axis=1)

        rnn_outputs = tf.multiply(attentions, rnn_outputs)
        rnn_outputs = tf.transpose(rnn_outputs, perm=[0, 2, 1])  # shape back to the original shape

        # pool hidden states of multiple words (after applying attention) into one hidden states
        rnn_outputs = tf.reduce_sum(rnn_outputs, reduction_indices=1)

        return rnn_outputs, tf.squeeze(attentions, axis=1)

    def _visualize_embedding(self, model_path, summary_writer):
        """Create metadata file (and its config file) for tensorboard's embedding visualization."""
        metadata_path = os.path.join(model_path, self._tensorboard_dir, _EMBEDDING_METADATA_FILE)

        # create the metadata config file
        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = self._nodes['embeddings'].name
        embedding.metadata_path = metadata_path
        projector.visualize_embeddings(summary_writer, config)

        # create metadata file
        with open(metadata_path, 'w', encoding='utf8') as metadata_file:
            metadata_file.write('Character\tID\n')
            for id_, char in enumerate(self._encoder.chars):
                metadata_file.write('{}\t{}\n'.format(char, id_))