Python nltk.WordNetLemmatizer() Examples

The following are 15 code examples showing how to use nltk.WordNetLemmatizer(). They are extracted from open source projects; the project, author, file, and license are noted above each example.

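All of the examples assume the WordNet data is available locally. As a baseline, here is a minimal self-contained usage sketch:

import nltk
nltk.download('wordnet')  # one-time download of the WordNet data

lemmatizer = nltk.WordNetLemmatizer()
print(lemmatizer.lemmatize('cats'))          # 'cat' (POS defaults to noun)
print(lemmatizer.lemmatize('ran', pos='v'))  # 'run'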

Example 1
Project: Quora   Author: KevinLiao159   File: nlp.py    License: MIT License
def lemmatize(tokens):
    """
    Lemmatize tokens, downloading the WordNet data on first use.
    """
    # WordNet is loaded lazily, so a missing-data LookupError is raised
    # by the first lemmatize() call rather than by the constructor.
    wnl = nltk.WordNetLemmatizer()
    try:
        return [wnl.lemmatize(t) for t in tokens]
    except LookupError:
        nltk.download('wordnet')
        return [wnl.lemmatize(t) for t in tokens]
Example 2
Project: normalise   Author: EFord36   File: detect.py    License: GNU General Public License v3.0
def cond2(w):
    """ Return word if its lemmatised form is not in the wordlist."""
    wnl = WordNetLemmatizer()
    return wnl.lemmatize(w.lower()) not in wordlist 
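
Here `wordlist` is a word list defined elsewhere in the normalise project. A minimal sketch of the same check, with a hypothetical stand-in wordlist:

from nltk.stem import WordNetLemmatizer

wordlist = {'cat', 'run', 'good'}  # hypothetical stand-in for the project's wordlist
wnl = WordNetLemmatizer()
print(wnl.lemmatize('Cats'.lower()) not in wordlist)     # False: 'cats' -> 'cat' is known
print(wnl.lemmatize('flibber'.lower()) not in wordlist)  # True: out-of-vocabulary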
Example 3
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-2-spam-classification.py    License: MIT License
def preprocess_sentence(sentence):
    lemmatizer = nltk.WordNetLemmatizer()
    # clearly list out our preprocessing pipeline
    processed_tokens = nltk.word_tokenize(sentence)
    processed_tokens = [w.lower() for w in processed_tokens]
    # find the nine least common words; most_common() returns
    # (word, count) pairs, so keep only the words themselves
    word_counts = collections.Counter(processed_tokens)
    uncommon_words = [w for w, _ in word_counts.most_common()[:-10:-1]]
    # remove stop words (a module-level set) and the uncommon words
    processed_tokens = [w for w in processed_tokens if w not in stop_words]
    processed_tokens = [w for w in processed_tokens if w not in uncommon_words]
    # lemmatize
    processed_tokens = [lemmatizer.lemmatize(w) for w in processed_tokens]
    return processed_tokens 
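
A quick sketch of driving this pipeline, assuming the module defines `stop_words` (e.g. the NLTK English stop word list) alongside its `collections` and `nltk` imports:

import collections
import nltk

stop_words = set(nltk.corpus.stopwords.words('english'))  # assumed module-level set

tokens = preprocess_sentence("Spam filters often lemmatize words so cats and cat count as one token.")

One caveat: on short inputs like this, the uncommon-word filter can drop most tokens, since the nine least common words may cover the whole vocabulary; the pipeline is intended for larger documents.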
Example 4
Project: MatchZoo-py   Author: NTMC-Community   File: lemmatization.py    License: Apache License 2.0
def transform(self, input_: list) -> list:
        """
        Lemmatize a sequence of tokens.

        :param input_: list of tokens to be lemmatized.

        :return tokens: list of lemmatized tokens.
        """
        lemmatizer = nltk.WordNetLemmatizer()
        return [lemmatizer.lemmatize(token, pos='v') for token in input_] 
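
The `pos='v'` argument matters here: WordNetLemmatizer treats every token as a noun by default, so verb inflections survive unless the verb POS is passed explicitly. A quick illustration:

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('running'))           # 'running' (noun reading)
print(lemmatizer.lemmatize('running', pos='v'))  # 'run'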
Example 5
Project: partisan-discourse   Author: DistrictDataLabs   File: learn.py    License: Apache License 2.0
def __init__(self, stopwords=None):
        self.stopwords  = set(stopwords or nltk.corpus.stopwords.words('english'))
        self.lemmatizer = nltk.WordNetLemmatizer() 
Example 6
Project: MatchZoo   Author: NTMC-Community   File: lemmatization.py    License: Apache License 2.0
def transform(self, input_: list) -> list:
        """
        Lemmatize a sequence of tokens.

        :param input_: list of tokens to be lemmatized.

        :return tokens: list of lemmatized tokens.
        """
        lemmatizer = nltk.WordNetLemmatizer()
        return [lemmatizer.lemmatize(token, pos='v') for token in input_] 
Example 7
Project: vec4ir   Author: lgalke   File: nltk_normalization.py    License: MIT License
def __init__(self):
        self.install_nltk_corpora('stopwords', 'wordnet', 'punkt')
        self.lemmatizer = nltk.WordNetLemmatizer()
        self.lemmatizer.lemmatize('')  # Force nltk lazy corpus loader to do something.
        self.tokenizer = self.make_tokenizer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.sent_tokenizer = None 
Example 8
Project: Quadflor   Author: quadflor   File: nltk_normalization.py    License: BSD 3-Clause "New" or "Revised" License
def __init__(self):
        self.install_nltk_corpora('stopwords', 'wordnet', 'punkt')
        self.lemmatizer = nltk.WordNetLemmatizer()
        self.lemmatizer.lemmatize('')  # Force nltk lazy corpus loader to do something.
        self.tokenizer = self.make_tokenizer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.sent_tokenizer = None 
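
`install_nltk_corpora` is a helper defined elsewhere in these two projects and is not shown here. A minimal sketch of what such a helper could look like, assuming it simply ensures the named NLTK data packages are installed (the body below is a guess based on the call sites, not the projects' actual code):

import nltk

def install_nltk_corpora(*packages):
    # Hypothetical reconstruction: nltk.download() skips packages
    # that are already up to date, so this is safe to call repeatedly.
    for package in packages:
        nltk.download(package, quiet=True)

The `self.lemmatizer.lemmatize('')` call afterwards is a small trick: it forces NLTK's lazy corpus loader to load WordNet immediately rather than on the first real lookup.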
Example 9
Project: Quadflor   Author: quadflor   File: synset_analysis.py    License: BSD 3-Clause "New" or "Revised" License
def __init__(self):
        NltkNormalizer.install_nltk_corpora('averaged_perceptron_tagger')
        self.normalizer = NltkNormalizer()
        self.lem = nltk.WordNetLemmatizer()
        self.tagger = nltk.PerceptronTagger()
        self.translation_dict = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB} 
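
`translation_dict` maps the first letter of a Penn Treebank tag (as produced by `PerceptronTagger`) to a WordNet POS constant. A sketch of how an analyzer like this would typically combine the pieces (the method below is an assumption; only the attributes in the constructor come from the source):

from nltk.corpus import wordnet as wn

def lemmatize_with_pos(self, tokens):
    # Tag the tokens, map e.g. 'NNS' -> wn.NOUN via the tag's first
    # letter, and fall back to the noun POS for unmapped tags.
    for token, tag in self.tagger.tag(tokens):
        pos = self.translation_dict.get(tag[0], wn.NOUN)
        yield self.lem.lemmatize(token, pos)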
Example 10
Project: natural-language-preprocessings   Author: Hironsan   File: normalization.py    License: MIT License
def lemmatize_term(term, pos=None):
    if pos is None:
        synsets = wordnet.synsets(term)
        if not synsets:
            return term
        pos = synsets[0].pos()
        if pos == wordnet.ADJ_SAT:
            pos = wordnet.ADJ
    return nltk.WordNetLemmatizer().lemmatize(term, pos=pos) 
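
Because `pos` is inferred from the word's first WordNet synset when omitted, common inflected forms resolve without an explicit tag:

print(lemmatize_term('dogs'))      # 'dog'   (first synset of 'dogs' is a noun)
print(lemmatize_term('ran', 'v'))  # 'run'   (explicit verb POS)
print(lemmatize_term('xyzzy'))     # 'xyzzy' (no synsets, returned unchanged)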
Example 11
Project: cpae   Author: tombosc   File: retrieval.py    License: MIT License
def add_from_lemma_definitions(self, vocab, try_lower=False):
        """Add lemma definitions for non-lemmas.

        This code covers the following scenario: suppose a dictionary is crawled,
        but only for word lemmas.

        """
        lemmatizer = nltk.WordNetLemmatizer()
        added = 0
        for word in vocab.words:
            word_list = [word, word.lower()] if try_lower else [word]

            for word_to_lemma in word_list:
                try:
                    for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                        lemma = lemmatizer.lemmatize(word_to_lemma, part_of_speech)
                        lemma_defs = self._data.get(lemma)
                        if lemma != word and lemma_defs:
                            # This can be quite slow. But this code will not be used
                            # very often.
                            for def_ in lemma_defs:
                                if def_ not in self._data[word]:
                                    added += 1
                                    self._data[word].append(def_)
                except Exception:
                    logger.error("lemmatizer crashed on {}".format(word))
                    logger.error(traceback.format_exc())
        logger.info("Added {} new defs in add_from_lemma_definitions".format(added))
        self.save() 
Example 12
Project: cpae   Author: tombosc   File: retrieval.py    License: MIT License
def crawl_lemmas(self, vocab):
        """Add Wordnet lemmas as definitions."""
        lemmatizer = nltk.WordNetLemmatizer()
        for word in vocab.words:
            definitions = []
            try:
                for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                    lemma = lemmatizer.lemmatize(word, part_of_speech)
                    if lemma != word and not [lemma] in definitions:
                        definitions.append([lemma])
            except Exception:
                logger.error("lemmatizer crashed on {}".format(word))
            if definitions:
                self._data[word] = definitions
        self.save() 
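
Both cpae snippets loop over all five WordNet POS tags ('a' adjective, 's' satellite adjective, 'r' adverb, 'n' noun, 'v' verb) because a surface form can lemmatize differently under each one. For instance:

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('better', 'a'))  # 'good'
print(lemmatizer.lemmatize('better', 'n'))  # 'better'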
Example 13
Project: Mastering-Machine-Learning-for-Penetration-Testing   Author: PacktPublishing   File: SpamDetection_NLTK.py    License: MIT License
def Process(data):
  lemmatizer = WordNetLemmatizer()
  return [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(unicode(data, errors='ignore'))]
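
Note that the `unicode` built-in exists only in Python 2. A Python 3 equivalent of the same function, assuming `data` arrives as bytes:

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

def process(data):
    lemmatizer = WordNetLemmatizer()
    text = data.decode('utf-8', errors='ignore')  # replaces unicode(data, errors='ignore')
    return [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(text)]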
Example 14
Project: document-qa   Author: allenai   File: text_features.py    License: Apache License 2.0
def __init__(self, require_unique_match, lemmatizer="word_net",
                 empty_question_features=False, stop_words=None):
        self.lemmatizer = lemmatizer
        self.stop_words = stop_words
        self.empty_question_features = empty_question_features
        if lemmatizer == "word_net":
            self._lemmatizer = WordNetLemmatizer()
        else:
            raise ValueError()
        self._cache = {}
        self.require_unique_match = require_unique_match 
Example 15
Project: document-qa   Author: allenai   File: text_utils.py    License: Apache License 2.0
def __init__(self, lower: bool = True, stemmer="port"):
        self.lower = lower
        self.stemmer = stemmer
        if stemmer == "port":
            self._stemmer = PorterStemmer()
            self._stem = self._stemmer.stem
        elif stemmer == "wordnet":
            self._stemmer = WordNetLemmatizer()
            self._stem = self._stemmer.lemmatize
        else:
            raise ValueError(stemmer)
        # stemming is slow, so we cache words as we go
        self.normalize_cache = {}
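
This class treats the lemmatizer as a drop-in "stemmer", which works because both expose a one-string-in, one-string-out call. Their outputs differ, though: the Porter stemmer clips suffixes, while the lemmatizer returns dictionary forms. For example:

from nltk.stem import PorterStemmer, WordNetLemmatizer

print(PorterStemmer().stem('studies'))           # 'studi' (not a word)
print(WordNetLemmatizer().lemmatize('studies'))  # 'study'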