Python nltk.stem.wordnet.WordNetLemmatizer() Examples

The following are 30 code examples showing how to use nltk.stem.wordnet.WordNetLemmatizer(). They are extracted from open source projects; the project, author, source file, and license are listed above each example.

You may also want to check out all available functions and classes of the module nltk.stem.wordnet.
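
Before the project examples, here is a minimal usage sketch (not taken from any of the projects below). It shows the basic API: lemmatize(word, pos) defaults to noun lemmatization ('n'), and it assumes the WordNet corpus has already been downloaded via nltk.download('wordnet').

from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize('apples'))           # 'apple' (default pos is 'n', noun)
print(lemmatizer.lemmatize('went', pos='v'))    # 'go'    (verbs need pos='v')
print(lemmatizer.lemmatize('better', pos='a'))  # 'good'  (adjectives use pos='a')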

Example 1
Project: lexsub   Author: orenmel   File: preprocess_lst_test.py    License: Apache License 2.0
def is_atomic_mwe(mwe, verb_lemma, complement_lemma, synsets):
    mwe_count = 0
    for synset in synsets:
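        # NB: this snippet uses the NLTK 2.x WordNet API; in NLTK 3.x, definition,
        # lemmas and name are methods (synset.definition(), synset.lemmas(), lemma.name()).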
        gloss_lemmas = set([WordNetLemmatizer().lemmatize(word) for word in synset.definition.split()])
        if verb_lemma in gloss_lemmas or complement_lemma in gloss_lemmas:
            return False
        for syn_lemma in synset.lemmas:
            if syn_lemma.name != mwe: 
                tokens = syn_lemma.name.split('_')
                for token in tokens:
                    if token == verb_lemma:
                        return False
                if len(tokens) == 2 and tokens[1] == complement_lemma:
                    return False
            else:
                mwe_count += syn_lemma.count()
    return True 
Example 2
Project: Sarcasm-Detection   Author: MirunaPislar   File: data_processing.py    License: MIT License
def ulterior_clean(tweets, filename):
    if not os.path.exists(filename):
        stopwords = get_stopwords_list()
        lemmatizer = WordNetLemmatizer()
        filtered_tweets = []
        for tw in tweets:
            filtered_tweet = []
            for t in tw.split():
                token = t.lower()
                if token in stopwords:
                    continue
                filtered_token = lemmatizer.lemmatize(token, 'v')
                filtered_token = lemmatizer.lemmatize(filtered_token)
                filtered_tweet.append(filtered_token)
            filtered_tweets.append(' '.join(filtered_tweet))
        utils.save_file(filtered_tweets, filename)
    # Load the filtered tokens
    filtered_tweets = utils.load_file(filename)
    return filtered_tweets 
Example 3
Project: mmfeat   Author: douwekiela   File: imagenet.py    License: BSD 3-Clause "New" or "Revised" License
def __init__(self, save_dir, config_path='./miner.yaml'):
        super(ImageNetMiner, self).__init__(save_dir, config_path)
        self.__engine__ = 'imagenet'
        self.format_url = 'http://www.image-net.org/api/text/imagenet.synset.geturls?wnid={}'

        # maximum number of synsets to retrieve - we don't need all images necessarily,
        # otherwise we get enormous amounts of synsets for words like 'entity' or 'animal'
        self.max_synsets = 10000

        self.wnl = WordNetLemmatizer()

        # url cache
        self.imgnet_url_cache = {}

        # whether we "level up" in hierarchy if no images found
        self.level_up_if_no_images = True 
Example 4
Project: convai-bot-1337   Author: sld   File: tokenizing.py    License: GNU General Public License v3.0
def convert_to_vw(text):
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    lmtzr = WordNetLemmatizer()
    tokens = [t.lower() for t in tokenizer.tokenize(text)]
    id_ = 13371337
    processed = []
    for t in tokens:
        l = lmtzr.lemmatize(t)
        processed.append(l)
    counted = Counter(processed)
    res_str = str(id_)
    for k, v in counted.items():
        if v != 1:
            res_str = res_str + " {}:{}".format(k, v)
        else:
            res_str = res_str + " {}".format(k)
    return res_str 
Example 5
Project: topics   Author: vladsandulescu   File: predict.py    License: Apache License 2.0
def extract_lemmatized_nouns(self, new_review):
        stopwords = self.load_stopwords()
        words = []

        sentences = nltk.sent_tokenize(new_review.lower())
        for sentence in sentences:
            tokens = nltk.word_tokenize(sentence)
            text = [word for word in tokens if word not in stopwords]
            tagged_text = nltk.pos_tag(text)

            for word, tag in tagged_text:
                words.append({"word": word, "pos": tag})

        lem = WordNetLemmatizer()
        nouns = []
        for word in words:
            if word["pos"] in ["NN", "NNS"]:
                nouns.append(lem.lemmatize(word["word"]))

        return nouns 
Example 6
Project: product-classifier   Author: georgetown-analytics   File: features.py    License: MIT License
def __init__(self, stoplist=None, punct=None, lemmatizer=None):
        # Load stopwords, punctuation, and lemmatizer
        # This takes a bit of work, so we only want to do it once!
        self.stopwords   = stoplist or stopwords.words('english')
        self.punctuation = punct or string.punctuation
        self.lemmatizer  = lemmatizer or WordNetLemmatizer() 
Example 7
Project: Snowball   Author: davidsbatista   File: ReVerb.py    License: GNU General Public License v3.0
def __init__(self):
        self.lmtzr = WordNetLemmatizer()
        self.aux_verbs = ['be'] 
Example 8
Project: comparable-text-miner   Author: motazsaad   File: textpro.py    License: Apache License 2.0
def getLemma(text, contextFlag=False):
	lemmatizer = WordNetLemmatizer()
	#'NN':wordnet.NOUN,'JJ':wordnet.ADJ,'VB':wordnet.VERB,'RB':wordnet.ADV
	wordnet_tag ={'NN':'n','JJ':'a','VB':'v','RB':'r'}
	result = None
	if len(text.split()) == 1: # one word
		tokenized = word_tokenize(text)
		tagged = pos_tag(tokenized)[0]
		lemma = ''
		try: lemma = lemmatizer.lemmatize(tagged[0],wordnet_tag[tagged[1][:2]])
		except: lemma = lemmatizer.lemmatize(tagged[0])
		result = lemma
	elif len(text.split()) > 1 and contextFlag == True: # multiple words, i.e. text, without considering the context
		resultList = []
		for t in text.split():
			tokenized = word_tokenize(t)
			tagged = pos_tag(tokenized)[0]
			lemma = ''
			try: lemma = lemmatizer.lemmatize(tagged[0],wordnet_tag[tagged[1][:2]])
			except: lemma = lemmatizer.lemmatize(tagged[0])
			resultList.append(lemma)
		result = ' '.join(resultList)
	else: # multiple words, i.e. text, considering the context
		resultList = []
		tokens = word_tokenize(text)
		tagged = pos_tag(tokens)
		for t in tagged:
			try: resultList.append(lemmatizer.lemmatize(t[0],wordnet_tag[t[1][:2]]))
			except: resultList.append(lemmatizer.lemmatize(t[0]))
		result = ' '.join(resultList)
	return result
###################################################################################

# Given a Naive Bayes classifier, classify a text with a given certainty
Example 9
Project: quantified-self   Author: DongjunLee   File: disintegrator.py    License: MIT License
def __init__(self):
        self.stopwords = set(stopwords.words("english"))
        self.lemmatizer = WordNetLemmatizer() 
Example 10
Project: ZeroShotVideoClassification   Author: bbrattoli   File: auxiliary_word2vec.py    License: Apache License 2.0
def verbs2basicform(words):
    ret = []
    for w in words:
        analysis = wn.synsets(w)
        if any([a.pos() == 'v' for a in analysis]):
            w = WordNetLemmatizer().lemmatize(w, 'v')
        ret.append(w)
    return ret 
Example 11
Project: Attention-Based-Aspect-Extraction   Author: madrugado   File: preprocess.py    License: Apache License 2.0
def parseSentence(line):
    lmtzr = WordNetLemmatizer()
    stop = stopwords.words('english')
    text_token = CountVectorizer().build_tokenizer()(line.lower())
    text_rmstop = [i for i in text_token if i not in stop]
    text_stem = [lmtzr.lemmatize(w) for w in text_rmstop]
    return text_stem 
Example 12
Project: atap   Author: foxbook   File: normalize.py    License: Apache License 2.0
def __init__(self, language='english'):
        self.stopwords  = frozenset(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer() 
Example 13
Project: atap   Author: foxbook   File: transformers.py    License: Apache License 2.0
def __init__(self, language='english'):
        self.stopwords  = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer() 
Example 14
Project: atap   Author: foxbook   File: transformer.py    License: Apache License 2.0
def __init__(self, language='english', minimum=2, maximum=200):
        self.min = minimum
        self.max = maximum
        self.stopwords  = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer() 
Example 15
Project: atap   Author: foxbook   File: transformers.py    License: Apache License 2.0
def __init__(self, language='english'):
        self.stopwords  = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer() 
Example 16
Project: atap   Author: foxbook   File: transformers.py    License: Apache License 2.0
def __init__(self, language='english'):
        self.stopwords  = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer() 
Example 17
Project: atap   Author: foxbook   File: transformer.py    License: Apache License 2.0
def __init__(self, language='english'):
        self.stopwords  = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer() 
Example 18
def unify_word(word):  # went -> go, apples -> apple, BIG -> big
    """unify verb tense and noun singular"""
    ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
    for wt in [ADJ, ADJ_SAT, ADV, NOUN, VERB]:
        try:
            word = WordNetLemmatizer().lemmatize(word, pos=wt)
        except:
            pass
    return word.lower() 
Example 19
Project: collection   Author: skywind3000   File: linguist.py    License: MIT License
def lemmatize (self, word, pos = 'n'):
		word = word.lower()
		if self.__lemmatizer is None:
			from nltk.stem.wordnet import WordNetLemmatizer
			self.__lemmatizer = WordNetLemmatizer()
		return self.__lemmatizer.lemmatize(word, pos)


#----------------------------------------------------------------------
# global
#---------------------------------------------------------------------- 
Example 20
Project: lexsub   Author: orenmel   File: cs_inferrer.py    License: Apache License 2.0
def generate_inferred(self, result_vec, target_word, target_lemma, pos):
    
        generated_results = {}
        min_weight = None
        if result_vec is not None:
            for word, weight in result_vec:
                if generated_word_re.match(word) != None: # make sure this is not junk
                    wn_pos = to_wordnet_pos[pos]
                    lemma = WordNetLemmatizer().lemmatize(word, wn_pos)
                    if word != target_word and lemma != target_lemma:
                        if lemma in generated_results:
                            weight = max(weight, generated_results[lemma])
                        generated_results[lemma] = weight
                        if min_weight is None:
                            min_weight = weight
                        else:
                            min_weight = min(min_weight, weight)
                            
        if min_weight is None:
            min_weight = 0.0
        i = 0.0                
        for lemma in default_generated_results:
            if len(generated_results) >= len(default_generated_results):
                break
            i -= 1.0
            generated_results[lemma] = min_weight + i
            
                
        return generated_results 
Example 21
Project: lexsub   Author: orenmel   File: cs_inferrer.py    License: Apache License 2.0
def filter_inferred(self, result_vec, candidates, pos):
    
        filtered_results = {}
        candidates_found = set()
        
        if result_vec != None:
            for word, weight in result_vec:
                wn_pos = to_wordnet_pos[pos]
                lemma = WordNetLemmatizer().lemmatize(word, wn_pos)
                if lemma in candidates:
                    self.add_inference_result(lemma, weight, filtered_results, candidates_found)
                if lemma.title() in candidates:
                    self.add_inference_result(lemma.title(), weight, filtered_results, candidates_found)
                if word in candidates: # there are some few cases where the candidates are not lemmatized
                    self.add_inference_result(word, weight, filtered_results, candidates_found)                    
                if word.title() in candidates: # there are some few cases where the candidates are not lemmatized
                    self.add_inference_result(word.title(), weight, filtered_results, candidates_found)
                    
        # assign negative weights for candidates with no score
        # they will appear last sorted according to their unigram count        
#        candidates_left = candidates - candidates_found
#        for candidate in candidates_left:            
#            count = self.w2counts[candidate] if candidate in self.w2counts else 1
#            score = -1 - (1.0/count) # between (-1,-2] 
#            filtered_results[candidate] = score   
         
        return filtered_results 
Example 22
Project: lexsub   Author: orenmel   File: preprocess_lst_test.py    License: Apache License 2.0
def lemmatize(pairs):
    triples = []
    for pair in pairs:
        word = pair[0]
        pos = pair[1]
        wordnet_pos = wordnet.NOUN
        if (len(pos)>=2):
            pos_prefix = pos[:2]
            if (pos_prefix in to_wordnet_pos):
                wordnet_pos = to_wordnet_pos[pos_prefix]
        lemma = WordNetLemmatizer().lemmatize(word, wordnet_pos).lower()
        triples.append([word, wordnet_pos, lemma])
    return triples 
Example 23
Project: lexsub   Author: orenmel   File: preprocess_lst_test.py    License: Apache License 2.0
def detect_mwe(text_tokens, target_ind, wordnet_pos):
    if (target_ind < len(text_tokens)-1):
        verb_lemma = WordNetLemmatizer().lemmatize(text_tokens[target_ind], wordnet_pos)
        complement_lemma = WordNetLemmatizer().lemmatize(text_tokens[target_ind+1])
        mwe = '_'.join([verb_lemma, complement_lemma])
        synsets = wordnet.synsets(mwe, wordnet.VERB) 
        if len(synsets) > 0:
            if (target_ind+1 < len(text_tokens)-1):
                mwe_right = '_'.join([WordNetLemmatizer().lemmatize(text_tokens[target_ind+1]), WordNetLemmatizer().lemmatize(text_tokens[target_ind+2])])
                if len(wordnet.synsets(mwe_right)) > 0:
                    return
            if is_atomic_mwe(mwe, verb_lemma, complement_lemma, synsets) == True:
                mwe = '='.join([text_tokens[target_ind], text_tokens[target_ind+1]])
                text_tokens[target_ind] = mwe
                del text_tokens[target_ind+1] 
Example 24
Project: Sarcasm-Detection   Author: MirunaPislar   File: extract_baseline_features.py    License: MIT License
def get_features2(tweets, subj_dict):
    print("Getting features type 2...")
    features = []
    tknzr = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)
    lemmatizer = WordNetLemmatizer()
    for tweet in tweets:
        feature_list = [0.0] * 5
        tokens = tknzr.tokenize(tweet)
        # Take the number of positive and negative words as features
        for word in tokens:
            stemmed = lemmatizer.lemmatize(word, 'v')
            stemmed = lemmatizer.lemmatize(stemmed)
            if stemmed in subj_dict:
                dictlist = []
                for word in subj_dict[stemmed]:
                    dictlist.extend(subj_dict[stemmed][word])
                if 'strongsubj' in dictlist:
                    value = 1.0
                else:
                    value = 0.5
                if 'positive' in dictlist:
                    feature_list[0] += value
                elif 'negative' in dictlist:
                    feature_list[1] += value
        # Take the ratio of positives to negatives as a feature
        if feature_list[0] != 0.0 and feature_list[1] != 0.0:
            feature_list[2] = feature_list[0] / feature_list[1]
        # Derive features from punctuation
        feature_list[2] += count_apparitions(tokens, helper.punctuation)
        # Take strong negations as a feature
        feature_list[3] += count_apparitions(tokens, helper.strong_negations)
        # Take strong affirmatives as a feature
        feature_list[4] += count_apparitions(tokens, helper.strong_affirmatives)
        features.append(feature_list)
    print("Done.")
    return features 
Example 25
Project: Sarcasm-Detection   Author: MirunaPislar   File: data_processing.py    License: MIT License
def extract_lemmatized_tweet(tokens, pos, use_verbs=True, use_nouns=True, use_all=False):
    lemmatizer = WordNetLemmatizer()
    clean_data = []
    for index in range(len(tokens)):
        if use_verbs and pos[index] == 'V':
            clean_data.append(lemmatizer.lemmatize(tokens[index].lower(), 'v'))
        if use_nouns and pos[index] == 'N':
            clean_data.append(lemmatizer.lemmatize(tokens[index].lower()))
        if use_all:
            lemmatized_word = lemmatizer.lemmatize(tokens[index].lower(), 'v')
            word = lemmatizer.lemmatize(lemmatized_word)
            if pos[index] not in ['^', ',', '$', '&', '!', '#', '@']:
                clean_data.append(word)
    return clean_data 
Example 26
Project: python-zpar   Author: EducationalTestingService   File: DepParser.py    License: MIT License
def __init__(self, modelpath, libptr, zpar_session_obj):
        super(DepParser, self).__init__()

        # save the zpar session object
        self._zpar_session_obj = zpar_session_obj

        # set up a logger
        self.logger = logging.getLogger(__name__)

        # get the library method that loads the parser models
        self._load_depparser = libptr.load_depparser
        self._load_depparser.restype = c.c_int
        self._load_depparser.argtypes = [c.c_void_p, c.c_char_p]

        # get the library methods that parse sentences and files
        self._dep_parse_sentence = libptr.dep_parse_sentence
        self._dep_parse_sentence.restype = c.c_char_p
        self._dep_parse_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_bool]

        self._dep_parse_file = libptr.dep_parse_file
        self._dep_parse_file.restype = None
        self._dep_parse_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_bool]

        self._dep_parse_tagged_sentence = libptr.dep_parse_tagged_sentence
        self._dep_parse_tagged_sentence.restype = c.c_char_p
        self._dep_parse_tagged_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_char]

        self._dep_parse_tagged_file = libptr.dep_parse_tagged_file
        self._dep_parse_tagged_file.restype = None
        self._dep_parse_tagged_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_char]

        if self._load_depparser(self._zpar_session_obj, modelpath.encode('utf-8')):
            raise OSError('Cannot find dependency parser model at {}\n'.format(modelpath))

        # set up the wordnet lemmatizer if we have it
        if _HAS_LEMMATIZER:
            self.lemmatizer = WordNetLemmatizer()
        else:
            self.lemmatizer = None 
Example 27
Project: ECDICT   Author: skywind3000   File: linguist.py    License: MIT License
def lemmatize (self, word, pos = 'n'):
		word = word.lower()
		if self.__lemmatizer is None:
			from nltk.stem.wordnet import WordNetLemmatizer
			self.__lemmatizer = WordNetLemmatizer()
		return self.__lemmatizer.lemmatize(word, pos)


#----------------------------------------------------------------------
# global
#---------------------------------------------------------------------- 
Example 28
Project: broca   Author: frnsys   File: lemma.py    License: MIT License
def __init__(self, n_jobs=1):
        self.lemmr = WordNetLemmatizer()
        self.stops = stopwords.words('english')
        self.n_jobs = n_jobs 
Example 29
Project: broca   Author: frnsys   File: overkill.py    License: MIT License
def tokenize(self, docs):
        if self.lemmatize:
            lem = WordNetLemmatizer()
        else:
            lem = None  # avoid a NameError below when lemmatization is disabled

        #print('RAKE tokenizing...')
        pre_tdocs = RAKETokenizer(n_jobs=self.n_jobs).tokenize(docs)

        for i, tdoc in enumerate(pre_tdocs):
            for t in tdoc:
                if t.startswith('one'):
                    print(t)
                    print(i)

        #print('Additional Tokenizing docs...')
        if self.n_jobs == 1:
            tdocs = [pre_tokenize(doc, tdoc, lem=lem) for doc, tdoc in zip(docs, pre_tdocs)]
        else:
            tdocs = parallel(partial(pre_tokenize, lem=lem), zip(docs, pre_tdocs), self.n_jobs, expand_args=True)

        #print('Training bigram...')
        if self.bigram is None:
            self.bigram = Phrases(tdocs,
                                  min_count=self.min_count,
                                  threshold=self.threshold,
                                  delimiter=b' ')
        else:
            self.bigram.add_vocab(tdocs)

        #print('Training trigram...')
        if self.trigram is None:
            self.trigram = Phrases(self.bigram[tdocs],
                                   min_count=self.min_count,
                                   threshold=self.threshold,
                                   delimiter=b' ')
        else:
            self.trigram.add_vocab(self.bigram[tdocs])

        return [tdoc for tdoc in self.trigram[self.bigram[tdocs]]] 
Example 30
Project: props   Author: gabrielStanovsky   File: tree.py    License: MIT License
def _VERBAL_PREDICATE_FEATURE_Lemma(self):
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr = WordNetLemmatizer()
        if self.pos in pos_penn_to_wordnet:
            return lmtzr.lemmatize(self.word, pos_penn_to_wordnet[self.pos])
        else:
            return False

    # TODO functions: