import os
import re
import string

import nltk

# Characters allowed in normalized tokens; everything else is stripped.
_alphabet = set(string.ascii_lowercase + string.digits + ' ')
# Matches words of at least two letters/underscores; single-character tokens are dropped.
word_regexp = r"(?u)\b[a-zA-Z_][a-zA-Z_]+\b"


class NltkNormalizer:
    def __init__(self):
        self.install_nltk_corpora('stopwords', 'wordnet', 'punkt')
        self.lemmatizer = nltk.WordNetLemmatizer()
        self.lemmatizer.lemmatize('')  # Force the nltk lazy corpus loader to load WordNet now.
        self.tokenizer = self.make_tokenizer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.sent_tokenizer = None  # Loaded lazily in sent_tokenize().

    @staticmethod
    def make_tokenizer():
        token_pattern = re.compile(word_regexp)
        return lambda doc: token_pattern.findall(doc)

    def split_and_normalize(self, o):
        """Tokenize, drop stopwords and short tokens, lemmatize, and lowercase."""
        r = []
        for t in self.tokenizer(o):
            # Note: the stopword check runs before lowercasing, so capitalized
            # stopwords such as 'The' are not filtered out.
            if t not in self.stopwords and len(t) > 2:
                t = self.lemmatizer.lemmatize(t).lower()
                # Strip any character outside the allowed alphabet (e.g. underscores).
                t = ''.join([lc for lc in t if lc in _alphabet])
                r.append(t)
        return r

    def normalize(self, o):
        return ' '.join(self.split_and_normalize(o))

    def sent_tokenize(self, doc):
        if not self.sent_tokenizer:
            self.sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        return self.sent_tokenizer.tokenize(doc)

    @staticmethod
    def install_nltk_corpora(*packages):
        """Download the given NLTK packages unless they already appear to be installed."""
        nltk_packages = list(packages)
        try:
            installed = set(os.listdir(nltk.data.find("corpora"))) | \
                        set(os.listdir(nltk.data.find("taggers"))) | \
                        set(os.listdir(nltk.data.find("tokenizers")))
        except LookupError:
            installed = set()
        if not set(nltk_packages) <= installed:
            nltk.download(nltk_packages)
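
# Minimal usage sketch (not part of the original module; the sample text and the
# expected output shown in the comments are illustrative assumptions):
if __name__ == '__main__':
    normalizer = NltkNormalizer()
    # Lowercase stopwords ('the', 'were', 'through', 'those') are dropped and
    # plural nouns are lemmatized; this should print: cat running quickly garden
    print(normalizer.normalize('the cats were running quickly through those gardens'))
    # Should print: ['First sentence.', 'Second one follows.']
    print(normalizer.sent_tokenize('First sentence. Second one follows.'))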