#!/usr/bin/env python3
"""Scikit-Learn compatible transformers for text normalization and
Gensim-backed TF-IDF vectorization."""

import os
import nltk
import gensim
import unicodedata

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from gensim.matutils import sparse2full
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
# NOTE(review): gensim.sklearn_api appears to be unavailable in gensim >= 4.0;
# confirm the pinned gensim version before upgrading.
from gensim.sklearn_api import lsimodel, ldamodel


class TextNormalizer(BaseEstimator, TransformerMixin):
    """
    Transformer that turns POS-tagged documents into flat lists of
    lowercased lemmas, dropping punctuation and stopwords.
    """

    def __init__(self, language='english'):
        """
        Load the NLTK stopword list for ``language`` and create the
        WordNet lemmatizer.
        """
        # Constructor arguments must be stored under their own name so that
        # BaseEstimator.get_params(), clone() and repr() can find them.
        self.language = language
        self.stopwords = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()

    def is_punct(self, token):
        """
        Return True when every character of ``token`` belongs to a Unicode
        punctuation category (vacuously True for an empty token).
        """
        return all(
            unicodedata.category(char).startswith('P') for char in token
        )

    def is_stopword(self, token):
        """
        Return True when the lowercased ``token`` is in the stopword set.
        """
        return token.lower() in self.stopwords

    def normalize(self, document):
        """
        Flatten a document, iterated as paragraphs of sentences of
        ``(token, tag)`` pairs, into a list of lowercased lemmas, skipping
        punctuation and stopwords.
        """
        return [
            self.lemmatize(token, tag).lower()
            for paragraph in document
            for sentence in paragraph
            for (token, tag) in sentence
            if not self.is_punct(token) and not self.is_stopword(token)
        ]

    def lemmatize(self, token, pos_tag):
        """
        Lemmatize ``token`` using the WordNet part of speech selected by the
        first character of ``pos_tag``; unknown or empty tags fall back to
        noun.
        """
        # Slicing (rather than indexing) keeps an empty tag from raising
        # IndexError; '' simply misses the table and falls back to NOUN.
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(pos_tag[:1], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

    def fit(self, X, y=None):
        """
        No-op: there is nothing to learn. Returns self.
        """
        return self

    def transform(self, documents):
        """
        Normalize every document and return the results as a list.
        """
        return [
            self.normalize(document)
            for document in documents
        ]


class GensimTfidfVectorizer(BaseEstimator, TransformerMixin):
    """
    Transformer that wraps a Gensim Dictionary and TfidfModel, persisting
    both to disk when fitted and reloading them on construction if present.
    """

    def __init__(self, dirpath=".", tofull=False):
        """
        Pass in a directory that holds the lexicon in corpus.dict and the
        TFIDF model in tfidf.model (for now).

        Set tofull = True if the next thing is a Scikit-Learn estimator
        otherwise keep False if the next thing is a Gensim model.
        """
        # Stored under its own name so get_params()/clone()/repr() work.
        self.dirpath = dirpath
        self._lexicon_path = os.path.join(dirpath, "corpus.dict")
        self._tfidf_path = os.path.join(dirpath, "tfidf.model")

        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull

        self.load()

    def load(self):
        """
        Load the lexicon and the TF-IDF model from disk; each one is only
        loaded if its file exists, otherwise the attribute is left as is.
        """
        if os.path.exists(self._lexicon_path):
            self.lexicon = Dictionary.load(self._lexicon_path)

        if os.path.exists(self._tfidf_path):
            # load() is a classmethod; no need to build an empty model first.
            self.tfidf = TfidfModel.load(self._tfidf_path)

    def save(self):
        """
        Write the lexicon and the TF-IDF model to their paths under the
        configured directory.
        """
        self.lexicon.save(self._lexicon_path)
        self.tfidf.save(self._tfidf_path)

    def fit(self, documents, labels=None):
        """
        Build the lexicon and the TF-IDF model from the tokenized
        ``documents``, save both to disk, and return self.
        """
        self.lexicon = Dictionary(documents)
        self.tfidf = TfidfModel(
            [self.lexicon.doc2bow(doc) for doc in documents],
            id2word=self.lexicon
        )
        self.save()
        return self

    def transform(self, documents):
        """
        Return a list with one TF-IDF vector per document: sparse
        ``(id, weight)`` vectors by default, or dense arrays whose length is
        the lexicon size when ``tofull`` is set.
        """
        vectors = [
            self.tfidf[self.lexicon.doc2bow(document)]
            for document in documents
        ]

        if self.tofull:
            # sparse2full requires the output length; use the vocabulary
            # size so every dense row has the same width.
            num_terms = len(self.lexicon)
            vectors = [sparse2full(vec, num_terms) for vec in vectors]

        return vectors


if __name__ == '__main__':
    from reader import PickledCorpusReader

    corpus = PickledCorpusReader('../corpus')
    # Take the first item yielded by corpus.docs() for each file id.
    docs = [
        list(corpus.docs(fileids=fileid))[0]
        for fileid in corpus.fileids()
    ]

    model = Pipeline([
        ('norm', TextNormalizer()),
        ('vect', GensimTfidfVectorizer()),
        ('lda', ldamodel.LdaTransformer())])

    model.fit_transform(docs)

    print(model.named_steps['norm'])