"""Word-sense extraction built on NLTK's WordNet lemmatizer and Lesk algorithm."""
from typing import List

import nltk
from nltk import wsd
from nltk.corpus import wordnet as wn

from utils.nltk_normalization import NltkNormalizer


class SynsetAnalyzer:
    """Turn a document into a list of WordNet synset names.

    Each sentence is tokenized, POS-tagged and lemmatized; every retained
    lemma is then disambiguated with ``nltk.wsd.lesk`` using the other
    retained lemmas of the same sentence as context.
    """

    def __init__(self):
        # Presumably downloads the tagger model if it is missing -- the helper
        # is project-local, so confirm against utils.nltk_normalization.
        NltkNormalizer.install_nltk_corpora('averaged_perceptron_tagger')
        self.normalizer = NltkNormalizer()
        self.lem = nltk.WordNetLemmatizer()
        self.tagger = nltk.PerceptronTagger()
        # First letter of the tagger's POS tag -> WordNet POS constant.
        # Tags starting with any other letter have no entry and are dropped.
        self.translation_dict = {
            'J': wn.ADJ,
            'N': wn.NOUN,
            'R': wn.ADV,
            'V': wn.VERB,
        }

    def _lemmatize_content_words(self, tagged_tokens):
        """Return lemmas of the tokens whose tag maps to a WordNet POS."""
        lemmas = []
        for word, tag in tagged_tokens:
            # dict.get avoids raising/catching KeyError for every token that
            # is not an adjective, noun, adverb or verb.
            wn_pos = self.translation_dict.get(tag[:1])
            if wn_pos:
                lemmas.append(self.lem.lemmatize(word, wn_pos))
        return lemmas

    def analyze(self, doc: str) -> List[str]:
        """Return the synset names found for the content words of ``doc``.

        Args:
            doc: Raw text to analyze. An empty or ``None`` document yields an
                empty list.

        Returns:
            Synset names (the value of ``Synset.name()``) in order of
            appearance, one per lemma for which ``wsd.lesk`` found a sense.
            Repeated words produce repeated entries.
        """
        if not doc:
            return []

        synset_names = []
        for sentence in self.normalizer.sent_tokenize(doc):
            tokens = self.normalizer.split_and_normalize(sentence)
            lemmas = self._lemmatize_content_words(self.tagger.tag(tokens))
            for lemma in lemmas:
                # NOTE(review): the POS tag is used for lemmatization only;
                # lesk is called without ``pos``, so the chosen synset may
                # belong to a different part of speech. Kept as-is so that
                # existing outputs do not change.
                sense = wsd.lesk(lemmas, lemma)
                if sense:
                    synset_names.append(sense.name())
        return synset_names