Python nltk.WordNetLemmatizer() Examples

The following are code examples showing how to use nltk.WordNetLemmatizer(). They are taken from open source Python projects.
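
A minimal usage sketch before the project examples (it assumes the WordNet corpus has been downloaded; the sample words are illustrative):

import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')  # the lemmatizer needs the WordNet corpus

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('churches'))          # 'church'  (default pos is noun)
print(lemmatizer.lemmatize('running'))           # 'running' (noun reading, unchanged)
print(lemmatizer.lemmatize('running', pos='v'))  # 'run'     (verb reading)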

Example 1
Project: NL2Type   Author: sola-da   File: preprocess_raw_data.py    MIT License
def lemmatize(sentence):
    if (type(sentence) is not str and type(sentence) != unicode) or len(sentence) == 0: return sentence
    words = sentence.split(" ")
    words = [word for word in words if word != '']
    if len(words) == 0: return ""
    word_positions = pos_tag(words)
    lemmatizer = WordNetLemmatizer()  # create once and reuse for every word
    lemmatized = []
    for p in word_positions:
        word_pos = get_wordnet_pos(p[1])
        try:
            if word_pos:
                lemmatized.append(lemmatizer.lemmatize(p[0], pos=word_pos))
            else:
                lemmatized.append(lemmatizer.lemmatize(p[0]))
        except UnicodeDecodeError:
            print "ERROR", word_pos, p[0]
    return " ".join(lemmatized) 
Example 2
Project: ext2rdf   Author: Weissger   File: Lemmatizer.py    GNU General Public License v2.0
def lemmatize(self, words):
        """
        Return the given sentence (or part of a sentence) with each word lemmatized.
        The lemmatizer is nltk.WordNetLemmatizer, and the input is pos-tagged beforehand
        with the default pos-tagger from nltk.pos_tag.
        Note that the pos-tagger may give wrong results if the sentence is too short or is only one word.
        :param words: The words to lemmatize
        :type words: str
        :return: The lemmatized words joined into a single string
        """
        tokenized = []
        for (word, tag) in nltk.pos_tag(nltk.word_tokenize(words)):
            tokenized.append(self.__lemmatize(
                word=word,
                pos=self.__get_wordnet_tag(tag)
            ))
        return " ".join(tokenized) 
Example 3
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-2-spam-classification.py    MIT License
def preprocess_sentence(sentence):
    lemmatizer = nltk.WordNetLemmatizer()
    # clearly list out our preprocessing pipeline
    processed_tokens = nltk.word_tokenize(sentence)
    processed_tokens = [w.lower() for w in processed_tokens]
    # find least common elements
    word_counts = collections.Counter(processed_tokens)
    uncommon_words = [word for word, _ in word_counts.most_common()[:-10:-1]]  # keep just the words so the filter below matches
    # remove these tokens
    processed_tokens = [w for w in processed_tokens if w not in stop_words]
    processed_tokens = [w for w in processed_tokens if w not in uncommon_words]
    # lemmatize
    processed_tokens = [lemmatizer.lemmatize(w) for w in processed_tokens]
    return processed_tokens 
Example 4
Project: TFMTL   Author: felicitywang   File: data_prep.py    Apache License 2.0
def wordnet_stemmer(tokens):
    """TODO

    :param tokens:
    :return:
    """
    stemmer = WordNetLemmatizer()
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.lemmatize(item))
    return stemmed


# transform data['text'] (string) into an n-gram model using
# sklearn.feature_extraction.text.TfidfVectorizer 
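
A sketch of that vectorization step, assuming a pandas DataFrame with a 'text' column and reusing wordnet_stemmer from above as the token normalizer (the column name and parameters are illustrative, not the project's actual configuration):

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

def tokenize_and_lemmatize(text):
    # tokenize, lower-case, then reuse wordnet_stemmer defined above
    return wordnet_stemmer(word_tokenize(text.lower()))

vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize, ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(data['text'])  # sparse document-term matrix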
Example 5
Project: dockerizeme   Author: dockerizeme   File: snippet.py    Apache License 2.0
def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
        """
        Instantiates the preprocessor, which may load corpora, models, or do
        other time-intensive NLTK data loading.
        """
        self.lower      = lower
        self.strip      = strip
        self.stopwords  = set(stopwords) if stopwords else set(sw.words('english'))
        self.punct      = set(punct) if punct else set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer() 
Example 6
Project: MatchZoo-py   Author: NTMC-Community   File: lemmatization.py    Apache License 2.0
def transform(self, input_: list) -> list:
        """
        Lemmatize a sequence of tokens.

        :param input_: list of tokens to be lemmatized.

        :return tokens: list of lemmatized tokens.
        """
        lemmatizer = nltk.WordNetLemmatizer()
        return [lemmatizer.lemmatize(token, pos='v') for token in input_] 
Example 7
Project: partisan-discourse   Author: DistrictDataLabs   File: learn.py    Apache License 2.0
def __init__(self, stopwords=None):
        self.stopwords  = set(stopwords or nltk.corpus.stopwords.words('english'))
        self.lemmatizer = nltk.WordNetLemmatizer() 
Example 8
Project: DeepTriage   Author: huazhisong   File: prepocessing_bugs.py    GNU Affero General Public License v3.0
def read_lines(file_path):
    # open description file
    with open(file_path, encoding='latin2') as f:
        # remove last 5 lines
        lines_raw = f.readlines()
        # read lines specially
        selected_lines = clean_raw(lines_raw)
        # raw text
        raw_text = ' '.join(selected_lines)
        # decode utf8 coding
        raw_text = raw_text.encode('utf8').decode('utf8')
        # sentence tokenizer
        sentences = nltk.sent_tokenize(raw_text)
        tokens = []
        # set up lemmatizer and stop word list
        wnl = nltk.WordNetLemmatizer()
        english_stopwords = stopwords.words('english')
        for sentence in sentences:
            # clean raw sentence
            sentence = clean_raw_cnn(sentence)
            # word tokenizer
            raw_words = nltk.word_tokenize(sentence)
            # clean words
            tmp = clean_words(raw_words, wnl, english_stopwords)
            tokens.extend(tmp)

        assert len(tokens) > 0
        line = ' '.join(tokens)

    return line 
Example 9
Project: ext2rdf   Author: Weissger   File: Lemmatizer.py    GNU General Public License v2.0
def __init__(self):
        l = nltk.WordNetLemmatizer()
        self.__lemmatize = l.lemmatize 
Example 10
Project: minke   Author: DistrictDataLabs   File: normalize.py    MIT License
def __init__(self):
        self._wordnet = nltk.WordNetLemmatizer()
        self._cache   = {} 
Example 11
Project: uncertaintyDetection   Author: PAJEAN   File: formatage_sentences.py    MIT License
def formater_phrase(path_file):
	reload(sys)
	sys.setdefaultencoding('utf-8')

	wnl = nltk.WordNetLemmatizer()
	out = open("Data/Inputs/sentences.txt", "w")
	fichier = open(path_file,"r").readlines()
	for ligne in fichier:
		tokens = nltk.word_tokenize(ligne)

		tagged = nltk.pos_tag(tokens)
	
		lemmatisation = []
		for i in tagged:
			if i[1][0] == "J":
				lemmatisation.append(wnl.lemmatize(i[0], nltk.corpus.wordnet.ADJ))
			elif i[1][0] == "V":
				lemmatisation.append(wnl.lemmatize(i[0], nltk.corpus.wordnet.VERB))
			elif i[1][0] == "R":
				lemmatisation.append(wnl.lemmatize(i[0], nltk.corpus.wordnet.ADV))
			else:
				lemmatisation.append(wnl.lemmatize(i[0], nltk.corpus.wordnet.NOUN))
	
		if len(tokens) == len(tagged) and len(tokens) == len(lemmatisation):
			for i in range(len(tokens)):
				out.write(tokens[i]+"\t"+tagged[i][1]+"\t"+lemmatisation[i].lower()+"\n")
		out.write("\n") 
Example 12
Project: RealtimeSentimentAnalysis   Author: zHaytam   File: sentiment_analysis.py    MIT License
def __init__(self):
        self.__load_models()
        self.stopwords = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.sia = SentimentIntensityAnalyzer()
        self.__load_most_used_words() 
Example 13
Project: OpinionMining   Author: prasadpande1990   File: sentimentClassifier.py    GNU General Public License v2.0
def lemmatizeWord(word):
    wnl = nltk.WordNetLemmatizer()
    lemma = wnl.lemmatize(word)
    return lemma
Example 14
Project: MatchZoo   Author: NTMC-Community   File: lemmatization.py    Apache License 2.0
def transform(self, input_: list) -> list:
        """
        Lemmatize a sequence of tokens.

        :param input_: list of tokens to be lemmatized.

        :return tokens: list of lemmatized tokens.
        """
        lemmatizer = nltk.WordNetLemmatizer()
        return [lemmatizer.lemmatize(token, pos='v') for token in input_] 
Example 15
Project: rejection-qa   Author: becxer   File: text_features.py    Apache License 2.0
def __init__(self, require_unique_match, lemmatizer="word_net",
                 empty_question_features=False, stop_words=None):
        self.lemmatizer = lemmatizer
        self.stop_words = stop_words
        self.empty_question_features = empty_question_features
        if lemmatizer == "word_net":
            self._lemmatizer = WordNetLemmatizer()
        else:
            raise ValueError()
        self._cache = {}
        self.require_unique_match = require_unique_match 
Example 16
Project: rejection-qa   Author: becxer   File: text_utils.py    Apache License 2.0
def __init__(self, lower: bool = True, stemmer="port"):
        self.lower = lower
        self.stemmer = stemmer
        if stemmer == "port":
            self._stemmer = PorterStemmer()
            self._stem = self._stemmer.stem
        elif stemmer == "wordnet":
            self._stemmer = WordNetLemmatizer()
            self._stem = self._stemmer.lemmatize
        else:
            raise ValueError(stemmer)
        # stemming is slow, so we cache words as we go
        self.normalize_cache = {} 
Example 17
Project: pymake   Author: dtrckd   File: vocabulary.py    GNU General Public License v3.0
def __init__(self, exclude_stopwords=False, lemmatize=True):

        try:
            import nltk
            _NLTK_DISABLED = False
        except ImportError:
            _NLTK_DISABLED = True

        self.vocas = []        # id to word
        self.token2id = dict() # word to id
        self.docfreq = []      # id to document frequency
        self.exclude_stopwords = exclude_stopwords

        stopwords_list = []
        if exclude_stopwords:
            # Too strict
            #with open (os.path.join(os.path.dirname(__file__), 'stopwords.txt'), "r") as _f:
            #    stopwords_list = _f.read().replace('\n', '').split()
            if not _NLTK_DISABLED:
                stopwords_list += nltk.corpus.stopwords.words('english')
            stopwords_list = set(stopwords_list)
        self.stopwords_list = stopwords_list

        if lemmatize:
            if not _NLTK_DISABLED:
                self.wlemm = nltk.WordNetLemmatizer()
            else:
                print('Warning: no lemmatizer!')
Example 18
Project: Quora   Author: KevinLiao159   File: nlp.py    MIT License
def lemmatize(tokens):
    """
    lemmatize tokens
    """
    try:
        wnl = nltk.WordNetLemmatizer()
    except LookupError:
        nltk.download('wordnet')
        wnl = nltk.WordNetLemmatizer()
    return [wnl.lemmatize(t) for t in tokens] 
Example 19
Project: sentiment_analysis   Author: samzek   File: Preprocessing.py    Apache License 2.0
def Preprocess(tweet):

    #tokenize
    tokens = nltk.word_tokenize(tweet)
    wnl = nltk.WordNetLemmatizer()

    # punctuation removal (rebuild the list rather than removing items while iterating)
    tokens = [t for t in tokens if t not in {u'.', u',', u';', u':', u'!', u'?'}]

    # pos tagging
    tokens = nltk.pos_tag(tokens)

    # stop word removal and lemmatization
    tokens_no_stop = []

    for t, part in tokens:
        if t not in stopwords.words('english'):
            tokens_no_stop.append((wnl.lemmatize(t), part))


    tokens_stemmed = []

    porter = nltk.PorterStemmer()
    for t,part in tokens_no_stop:
        tokens_stemmed.append((porter.stem(t),part))

    return tokens_no_stop, tokens_stemmed 
Example 20
Project: honours_project   Author: JFriel   File: NLTKPreprocessor.py    GNU General Public License v3.0
def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower      = lower
        self.strip      = strip
        self.stopwords  = stopwords or set(sw.words('english'))
        self.punct      = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer() 
Example 21
Project: vec4ir   Author: lgalke   File: nltk_normalization.py    MIT License
def __init__(self):
        self.install_nltk_corpora('stopwords', 'wordnet', 'punkt')
        self.lemmatizer = nltk.WordNetLemmatizer()
        self.lemmatizer.lemmatize('')  # Force nltk lazy corpus loader to do something.
        self.tokenizer = self.make_tokenizer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.sent_tokenizer = None 
Example 22
Project: Quadflor   Author: quadflor   File: nltk_normalization.py    BSD 3-Clause "New" or "Revised" License
def __init__(self):
        self.install_nltk_corpora('stopwords', 'wordnet', 'punkt')
        self.lemmatizer = nltk.WordNetLemmatizer()
        self.lemmatizer.lemmatize('')  # Force nltk lazy corpus loader to do something.
        self.tokenizer = self.make_tokenizer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.sent_tokenizer = None 
Example 23
Project: Quadflor   Author: quadflor   File: synset_analysis.py    BSD 3-Clause "New" or "Revised" License
def __init__(self):
        NltkNormalizer.install_nltk_corpora('averaged_perceptron_tagger')
        self.normalizer = NltkNormalizer()
        self.lem = nltk.WordNetLemmatizer()
        self.tagger = nltk.PerceptronTagger()
        self.translation_dict = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB} 
Example 24
Project: natural-language-preprocessings   Author: Hironsan   File: normalization.py    MIT License
def lemmatize_term(term, pos=None):
    if pos is None:
        synsets = wordnet.synsets(term)
        if not synsets:
            return term
        pos = synsets[0].pos()
        if pos == wordnet.ADJ_SAT:
            pos = wordnet.ADJ
    return nltk.WordNetLemmatizer().lemmatize(term, pos=pos) 
Example 25
Project: cpae   Author: tombosc   File: retrieval.py    MIT License
def add_from_lemma_definitions(self, vocab, try_lower=False):
        """Add lemma definitions for non-lemmas.

        This code covers the following scenario: suppose a dictionary is crawled,
        but only for word lemmas.

        """
        lemmatizer = nltk.WordNetLemmatizer()
        added = 0
        for word in vocab.words:
            word_list = [word, word.lower()] if try_lower else [word]

            for word_to_lemma in word_list:
                try:
                    for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                        lemma = lemmatizer.lemmatize(word_to_lemma, part_of_speech)
                        lemma_defs = self._data.get(lemma)
                        if lemma != word and lemma_defs:
                            # This can be quite slow. But this code will not be used
                            # very often.
                            for def_ in lemma_defs:
                                if not def_ in self._data[word]:
                                    added += 1
                                    self._data[word].append(def_)
                except:
                    logger.error("lemmatizer crashed on {}".format(word))
                    logger.error(traceback.format_exc())
        logger.info("Added {} new defs in add_from_lemma_definitions".format(added))
        self.save() 
Example 26
Project: cpae   Author: tombosc   File: retrieval.py    MIT License
def crawl_lemmas(self, vocab):
        """Add Wordnet lemmas as definitions."""
        lemmatizer = nltk.WordNetLemmatizer()
        for word in vocab.words:
            definitions = []
            try:
                for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                    lemma = lemmatizer.lemmatize(word, part_of_speech)
                    if lemma != word and not [lemma] in definitions:
                        definitions.append([lemma])
            except:
                logger.error("lemmatizer crashed on {}".format(word))
            if definitions:
                self._data[word] = definitions
        self.save() 
Example 27
Project: primrose   Author: ww-tech   File: minimal_search_engine.py    Apache License 2.0
def __init__(self, configuration, instance_name):
        """instantiate the search engine

        Args:
            configuration (Configuration): Configuration instance
            instance_name (str): name of instance

        """
        AbstractSearchEngine.__init__(self, configuration, instance_name)
        self.lemmatizer = WordNetLemmatizer() 
Example 28
Project: Mastering-Machine-Learning-for-Penetration-Testing   Author: PacktPublishing   File: SpamDetection_NLTK.py    MIT License
def Process(data):
  lemmatizer = WordNetLemmatizer()
  return [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(unicode(data, errors='ignore'))]
Example 29
Project: foodkg.github.io   Author: foodkg   File: parse.py    Apache License 2.0
def read_name(ingredient, high_quality):

    if high_quality:
        parts = ingredient.split(",")

        kept = []

        for part in parts:
            count = 0
            tagged = nltk.pos_tag(nltk.word_tokenize(part))
            for tag in tagged:
                if "NN" in tag[1]:
                    count += 1
            if count > 0:
                kept.append(part)

        if len(kept) == 0:
            return ""

        ingredient = kept[0]

        tagged = nltk.pos_tag(nltk.word_tokenize(ingredient))

        # remove anything after a conjunction

        for x in range(len(tagged)):
            if tagged[x][1] == "CC":
                tagged = tagged[0:x]
                break

        tagged = list(
            filter(
                lambda x: (("RB" not in x[1] and x[1] != "JJ" and x[1][0] != "V"))
                or x[0].lower() in webcolors.CSS3_NAMES_TO_HEX,
                tagged,
            )
        )

        words = list(map(lambda x: x[0], tagged))

        p = nltk.PorterStemmer()
        w = nltk.WordNetLemmatizer()

        words = map(lambda x: w.lemmatize(x), words)
        return " ".join(words)
    else:
        return ingredient.split(",")[0]