import nltk
import unicodedata
import time
import json

import numpy as np

from loader import CorpusLoader
from reader import PickledCorpusReader

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score
)


def identity(words):
    # Documents arrive already tokenized, so the vectorizer's tokenizer is a no-op.
    return words


class TextNormalizer(BaseEstimator, TransformerMixin):
    # Lowercases and lemmatizes part-of-speech tagged documents, dropping
    # punctuation-only tokens and stopwords.

    def __init__(self, language='english'):
        self.stopwords = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()

    def is_punct(self, token):
        return all(
            unicodedata.category(char).startswith('P') for char in token
        )

    def is_stopword(self, token):
        return token.lower() in self.stopwords

    def normalize(self, document):
        return [
            self.lemmatize(token, tag).lower()
            for paragraph in document
            for sentence in paragraph
            for (token, tag) in sentence
            if not self.is_punct(token) and not self.is_stopword(token)
        ]

    def lemmatize(self, token, pos_tag):
        # Map the Penn Treebank tag prefix to a WordNet part of speech,
        # defaulting to noun.
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(pos_tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

    def fit(self, X, y=None):
        return self

    def transform(self, documents):
        for document in documents:
            yield self.normalize(document[0])


def create_pipeline(estimator, reduction=False):
    # Build a normalize -> vectorize -> (optional SVD reduction) -> classify pipeline.
    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        ))
    ]

    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=10000)
        ))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)


labels = ["books", "cinema", "cooking", "gaming", "sports", "tech"]
reader = PickledCorpusReader('../corpus')
loader = CorpusLoader(reader, 5, shuffle=True, categories=labels)

models = []
for form in (LogisticRegression, SGDClassifier):
    models.append(create_pipeline(form(), True))
    models.append(create_pipeline(form(), False))

models.append(create_pipeline(MultinomialNB(), False))
models.append(create_pipeline(GaussianNB(), True))


def score_models(models, loader):
    # Fit and evaluate each model on every fold yielded by the loader,
    # collecting per-fold timing and weighted-average metrics.
    for model in models:

        name = model.named_steps['classifier'].__class__.__name__
        if 'reduction' in model.named_steps:
            name += " (TruncatedSVD)"

        scores = {
            'model': str(model),
            'name': name,
            'accuracy': [],
            'precision': [],
            'recall': [],
            'f1': [],
            'time': [],
        }

        for X_train, X_test, y_train, y_test in loader:
            start = time.time()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            scores['time'].append(time.time() - start)
            scores['accuracy'].append(accuracy_score(y_test, y_pred))
            scores['precision'].append(precision_score(y_test, y_pred, average='weighted'))
            scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
            scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))

        yield scores


if __name__ == '__main__':
    # Append one JSON record of cross-validated scores per model to results.json.
    for scores in score_models(models, loader):
        with open('results.json', 'a') as f:
            f.write(json.dumps(scores) + "\n")