#!/usr/bin/env python3
Performs entity extraction using NLTK
import os
import time
import json

from nltk import ne_chunk
from nltk.chunk import tree2conlltags

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Binarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# GPE is Geo-Political Entity, GSP is Geo-Socio-Political group

def identity(words):
    return words

class EntityExtractor(BaseEstimator, TransformerMixin):
    Perform entity extraction

    Output is saved
    def __init__(self, labels=GOODLABELS, **kwargs):
        self.labels = labels

    def get_entities(self, document):
        Extract entities from a single document using the
        nltk.tree.ne_chunk method

        This method is called multiple times by the tranform method

        :param document: a list of lists of tuples
        :return entities: a list of comma-separated strings
        entities = []
        for paragraph in document:
            for sentence in paragraph:
                # classifier chunk the sentences, adds category labels, e.g. PERSON
                trees = ne_chunk(sentence)
                # select only trees with the kinds of entities we want
                for tree in trees:
                    if hasattr(tree, 'label'):
                        if tree.label() in self.labels:
                            # entities is a list, each entry is a list of entities
                            # for a document
                                ' '.join([child[0].lower() for child in tree])
        return entities

    def fit(self, documents, labels=None):
        return self

    def transform(self, documents):
        Create a representation of the documents as a list of their entities
        for document in documents:
            yield self.get_entities(document[0])

def create_pipeline(estimator):
    steps = [
        ('extract', EntityExtractor()),
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        ('classifier', estimator)

    return Pipeline(steps)

def score_models(models, loader):
    for model in models:

        name = model.named_steps['classifier'].__class__.__name__
        scores = {
            'model': str(model),
            'name': name,
            'accuracy': [],
            'precision': [],
            'recall': [],
            'f1': [],
            'time': [],

        for X_train, X_test, y_train, y_test in loader:
            start = time.time()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            scores['time'].append(time.time() - start)
            scores['accuracy'].append(accuracy_score(y_test, y_pred))
            scores['precision'].append(precision_score(y_test, y_pred, average='weighted'))
            scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
            scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))

        yield scores

if __name__ == '__main__':
    from loader import CorpusLoader
    from reader import PickledCorpusReader

    reader = PickledCorpusReader('../corpus')
    # extractor = EntityExtractor()
    # print(list(extractor.fit_transform(reader.docs())))

    labels = ['books', 'cinema', 'cooking', 'gaming', 'sports', 'tech']
    loader = CorpusLoader(reader, 5, shuffle=True, categories=labels)

    forms = (LogisticRegression, SGDClassifier, MultinomialNB, GaussianNB)
    models = []
    for form in forms:

    for scores in score_models(models, loader):
        with open('results_with_ner.json', 'a') as f:
            f.write(json.dumps(scores) + "\n")