Python nltk.stem.WordNetLemmatizer() Examples

The following are 30 code examples of nltk.stem.WordNetLemmatizer(), drawn from open-source projects. Each example is preceded by the source file and project it comes from, so you can go to the original project for full context. You may also want to check out the other functions and classes available in the nltk.stem module.
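Before the project examples, here is a minimal standalone sketch of how the class is typically used (assuming the WordNet data has already been fetched with nltk.download('wordnet'); on newer NLTK versions you may also need the omw-1.4 corpus). lemmatize() treats its argument as a noun by default and accepts an optional part-of-speech tag ('n', 'v', 'a', 'r' or 's'):

import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet', quiet=True)  # one-time download of the WordNet data

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('cats'))           # 'cat'   (default pos='n')
print(lemmatizer.lemmatize('running', 'v'))   # 'run'   (pos='v' for verbs)
print(lemmatizer.lemmatize('better', 'a'))    # 'good'  (pos='a' for adjectives)
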
Example #1
Source File: pre_processing.py    From TextLevelGCN with GNU General Public License v3.0
def clean_text(text):
        # stop_words = stopwords.words('english')
        stop_words = []
        stop_words.extend(['!', ',' ,'.' ,'?' ,'-s' ,'-ly' ,'</s> ', 's'])
        stemmer = WordNetLemmatizer()  # despite the variable name, this is a lemmatizer, not a stemmer

        text = remove_short(text)
        text = clean_str(text)

        text = word_tokenize(text)

        text = [word for word in text if word not in stop_words]

        text = [stemmer.lemmatize(word) for word in text]

        return ' '.join(text) 
Example #2
Source File: Article.py    From find-all-the-new-words with MIT License
def real_word(self, word, LEMMATIZATION_flag=True):
        '''
        find the real word
        '''
        p_forword = re.compile('[a-z,A-Z,\',‘]')
        word_s = p_forword.findall(word)
        real_word = ''.join(word_s)#.lower()
        if LEMMATIZATION_flag and self.config['LEMMATIZATION_MODE'] in ['list', 'both']:
            try:
                real_word = self.fix_dic[real_word]
            except Exception as e:
                logger.debug(e)
                pass
        if LEMMATIZATION_flag and self.config['LEMMATIZATION_MODE'] in ['NLTK', 'both']:
            wordnet_lemmatizer = WordNetLemmatizer()
            real_word = wordnet_lemmatizer.lemmatize(real_word)
        logger.debug(word+'-->'+real_word)
        return real_word 
Example #3
Source File: Flair_Model.py    From bert-sense with MIT License
def __init__(self, device_number = 'cuda:2', use_cuda=True):
        
        self.device_number = device_number
        self.use_cuda = use_cuda
        self.sense_number_map = {'N':1, 'V':2, 'J':3, 'R':4}
        
        self.Flair_Model = Flair(device_number, use_cuda)
        self.lemmatizer = WordNetLemmatizer() 
Example #4
Source File: belford_tfidf.py    From yelp with GNU Lesser General Public License v2.1
def preprocess(
        docs, stopwords, min_df=3, min_term_length=2,
        ngram_range=(1, 1), apply_tfidf=True, apply_norm=True,
        lemmatize=False):
    """
    Preprocess a list containing text documents stored as strings.
    """
    token_pattern = re.compile(r"\b\w\w+\b", re.U)

    if lemmatize:
        from nltk.stem import WordNetLemmatizer
        wnl = WordNetLemmatizer()

    def normalize(x):
        x = x.lower()
        if lemmatize:
            return wnl.lemmatize(x)
        return x

    def custom_tokenizer(s):
        return [normalize(x) for x in token_pattern.findall(s) if (len(x) >= min_term_length and x[0].isalpha() ) ]

    # Build the Vector Space Model, apply TF-IDF and normalize lines to unit
    # length all in one call
    if apply_norm:
        norm_function = "l2"
    else:
        norm_function = None
    tfidf = TfidfVectorizer(
        stop_words=stopwords, lowercase=True, strip_accents="unicode",
        tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function,
        min_df=min_df, ngram_range=ngram_range)
    X = tfidf.fit_transform(docs)
    terms = []
    # store the vocabulary map
    v = tfidf.vocabulary_
    for i in range(len(v)):
        terms.append("")
    for term in v.keys():
        terms[v[term]] = term
    return (X, terms, tfidf) 
Example #5
Source File: pdtb2.py    From Deep_Enhanced_Repr_for_IDRR with MIT License
def __lemmatize(self, lemma):
        """
        Internal method used for applying the nltk.stem.WordNetLemmatizer() to the (word, pos) pair lemma.
        """
        string, tag = lemma
        if tag in ('a', 'n', 'r', 'v'):        
            wnl = WordNetLemmatizer()
            string = wnl.lemmatize(string, tag)
        return (string, tag)

    ######################################################################    
    # POSITIONING. 
Example #6
Source File: dictionary.py    From find-all-the-new-words with MIT License
def eudic(word):
    url = "https://dict.eudic.net/dicts/en/"+word
    headers = {"authority":"dict.eudic.net",
    "content-type":"application/x-www-form-urlencoded",
    "origin": "https://dict.eudic.net",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
            }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text,"lxml")
    try:
        a = soup.find_all(id="ExpFCChild")[0].get_text("\t\n")
    except Exception as e:
        logger.warning(word+"  "+str(e))
        from nltk.stem import WordNetLemmatizer
        wordnet_lemmatizer = WordNetLemmatizer()
        word = wordnet_lemmatizer.lemmatize(word)
        url = "https://dict.eudic.net/dicts/en/"+word
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text,"lxml")
        try:
            a = soup.find_all(id="ExpFCChild")[0].get_text("\t\n")
            a = word+"\n\n"+a
        except Exception as e:
            logger.warning("fix failed  "+word+"  "+str(e))
            return ""
    if a[0] == u"赞":
        a = a[a.index(")")+3:]
    a = fix_result(a)
    a = a.replace(".\t\n",".\t")
    return a 
Example #7
Source File: util.py    From topic-ensemble with Apache License 2.0
def preprocess( docs, stopwords, min_df = 3, min_term_length = 2, ngram_range = (1,1), apply_tfidf = True, apply_norm = True, lemmatize = False ):
	"""
	Preprocess a list containing text documents stored as strings.
	"""
	token_pattern = re.compile(r"\b\w\w+\b", re.U)

	if lemmatize:
		from nltk.stem import WordNetLemmatizer
		wnl = WordNetLemmatizer()

	def normalize( x ):
		x = x.lower()
		if lemmatize:
			return wnl.lemmatize(x)
		return x

	def custom_tokenizer( s ):
		return [normalize(x) for x in token_pattern.findall(s) if (len(x) >= min_term_length and x[0].isalpha() ) ]

	# Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
	if apply_norm:
		norm_function = "l2"
	else:
		norm_function = None
	tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode", tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function, min_df = min_df, ngram_range = ngram_range) 
	X = tfidf.fit_transform(docs)
	terms = []
	# store the vocabulary map
	v = tfidf.vocabulary_
	for i in range(len(v)):
		terms.append("")
	for term in v.keys():
		terms[ v[term] ] = term
	return (X,terms) 
Example #8
Source File: preprocessing.py    From TextRank with MIT License
def __init__(self):
        self.STOPWORDS = TextProcessor.__load_stopwords(path="../stopwords.txt")
        self.LEMMATIZER = WordNetLemmatizer()
        self.STEMMER = SnowballStemmer("english")
        self.PUNCTUATION = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
        self.NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
        self.PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE) 
Example #9
Source File: BERT_Model.py    From bert-sense with MIT License
def __init__(self, device_number = 'cuda:2', use_cuda=True):
        
        self.device_number = device_number
        self.use_cuda = use_cuda
        self.sense_number_map = {'N':1, 'V':2, 'J':3, 'R':4}
        
        self.Bert_Model = BERT(device_number, use_cuda)
        self.lemmatizer = WordNetLemmatizer() 
Example #10
Source File: ELMO_Model.py    From bert-sense with MIT License
def __init__(self):
        
        self.sense_number_map = {'N':1, 'V':2, 'J':3, 'R':4}
        
        self.Elmo_Model = ELMO()
        self.lemmatizer = WordNetLemmatizer() 
Example #11
Source File: LDAModel_English.py    From LDA_RecEngine with Apache License 2.0
def __tokenizeWholeCorpora(self, pathToCorpora):
    print('Start tokenizing the corpora: %s' % pathToCorpora)
    punct = re.compile('[%s]' % re.escape(string.punctuation))
    wnl = WordNetLemmatizer()
    doc_count = 0
    train_set = []
    doc_mapping = {}
    link_mapping = {}

    for f in glob(pathToCorpora + '/*'):
        filereader = open(f, 'r')
        article = filereader.readlines()
        filereader.close()
        text = ''
        try:
            link = article[0]
            title = article[1]
            text = article[2].lower()
        except IndexError:
            continue

        # Skip documents shorter than min_length
        if len(text) < self.min_length:
            continue
        text = punct.sub("", text)  # Remove all punctuation
        tokens = nltk.word_tokenize(text)  # Tokenize the whole text
        # Lemmatize every word and keep it if it is not a stopword
        train_set.append([wnl.lemmatize(word) for word in tokens if word not in self.stopword])
        # Build doc mapping
        doc_mapping[doc_count] = title
        link_mapping[doc_count] = link
        doc_count = doc_count + 1
        if doc_count % 10000 == 0:
            print('Have processed %i documents' % doc_count)

    print('Finished tokenizing the corpora: %s' % pathToCorpora)
    return doc_count, train_set, doc_mapping, link_mapping 
Example #12
Source File: recipe_cleanup.py    From Flavor-Network with GNU General Public License v3.0
def split_ingr(x):
    wnl=WordNetLemmatizer()
    cleanlist=[]
    lst = x.strip('[]').split(',')
    cleanlist=[' '.join(wnl.lemmatize(word.lower()) for word in word_tokenize(re.sub('[^a-zA-Z]',' ',item))) for item in lst]
    return cleanlist

#remove low-information words from ingredients, could use more 
Example #13
Source File: exact_match_indexer.py    From claf with MIT License
def __init__(self, tokenizer, lower=True, lemma=True):
        super(ExactMatchIndexer, self).__init__(tokenizer)

        self.param_key = "question"
        self.lemmatizer = WordNetLemmatizer()

        self.lower = lower
        self.lemma = lemma 
Example #14
Source File: word_sentence_utils.py    From resilient-community-apps with MIT License
def __init__(self):
        nltk.download("wordnet", quiet=True)
        nltk.download("stopwords", quiet=True)
        nltk.download('averaged_perceptron_tagger', quiet=True)
        self.remove_list = ", . ; ? ~ ! * ) ( { } $ # @ < > ] [".split()
        self.lem = WordNetLemmatizer() 
Example #15
Source File: Chapter 05_KNN n Naive Bayes.py    From Statistics-for-Machine-Learning with MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text 
Example #16
Source File: swda.py    From deep_disfluency with MIT License
def __wn_lemmatize(self, lemma):
        """
        Lemmatize lemma using nltk.stem.WordNetLemmatizer(). Always
        returns a (string, pos) pair.  Lemmatizes even when the tag
        isn't helpful, by ignoring it for stemming.
        """
        string, tag = lemma
        wnl = WordNetLemmatizer()
        if tag in ('a', 'n', 'r', 'v'):
            string = wnl.lemmatize(string, tag)
        else:
            string = wnl.lemmatize(string)
        return (string, tag)

###################################################################### 
Example #17
Source File: nltk_processors.py    From forte with Apache License 2.0
def __init__(self):
        super().__init__()
        self.token_component = None
        self.lemmatizer = WordNetLemmatizer() 
Example #18
Source File: intent_classification.py    From voice-enabled-chatbot with MIT License
def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.data = {}
        self.document = []
        self.flat_list = []

        self.read_files()

        """Getting the words from the data"""
        self.get_words()
        """Removes the **stop words** like ( ‘off’, ‘is’, ‘s’, ‘am’, ‘or’) and
               ***non alphabetical*** characters"""
        self.flat_list = self.remove_stop_words(self.flat_list)

        """**Lemmatization** i.e., tranforms different
                forms of words to a single one"""
        filtered_list = self.lemmatization(self.flat_list)

        """Getting the ***frequency*** of each word and extracting top 2000"""

        frequency_distribution = nltk.FreqDist(
            w.lower() for w in filtered_list
        )

        self.word_features = list(frequency_distribution)[:2000]

        """Training the model"""

        self.test_set = nltk.classify.apply_features(
            self.feature_extraction, self.document[:500]
        )
        self.train_set = nltk.classify.apply_features(
            self.feature_extraction, self.document[500:]
        )
        self.classifier = nltk.NaiveBayesClassifier.train(self.train_set) 
Example #19
Source File: nlp_preprocessing.py    From EventForecast with GNU Lesser General Public License v3.0
def stem_single_stop(content, stopWords):
    string = ''
    wnl = WordNetLemmatizer()
    for word in content.split(' '):
        if word == '$':
            string += 'dollar '
        elif len(word) > 1 and (word not in stopWords):
            word = wnl.lemmatize(word)
            if word not in stopWords:
                string = string + word + ' '
    return string

# split files into batches 
Example #20
Source File: 9.5 Skipgram_Keras.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text 
Example #21
Source File: reader.py    From ConvLab with MIT License
def __init__(self):
        super().__init__()

        self.entity_dict = {}
        self.abbr_dict = {}

        self.wn = WordNetLemmatizer()
        self.db = {}

        self.tokenized_data_path = './data/kvret/'
        self._construct(cfg.train, cfg.dev, cfg.test, cfg.entity) 
Example #22
Source File: pdtb2.py    From pdtb2 with GNU General Public License v2.0
def __lemmatize(self, lemma):
        """
        Internal method used for applying the nltk.stem.WordNetLemmatizer() to the (word, pos) pair lemma.
        """
        string, tag = lemma
        if tag in ('a', 'n', 'r', 'v'):        
            wnl = WordNetLemmatizer()
            string = wnl.lemmatize(string, tag)
        return (string, tag)

    ######################################################################    
    # POSITIONING. 
Example #23
Source File: helpers.py    From causal-text-embeddings with MIT License
def __init__(self):
		self.wnl = WordNetLemmatizer() 
Example #24
Source File: helpers.py    From causal-text-embeddings with MIT License
def __init__(self):
		self.wnl = WordNetLemmatizer() 
Example #25
Source File: helpers.py    From causal-text-embeddings with MIT License
def __init__(self):
		self.wnl = WordNetLemmatizer() 
Example #26
Source File: rank_knowledge_for_mc_qa.py    From OpenBookQA with Apache License 2.0
def __init__(self):
        self.wnl = WordNetLemmatizer() 
Example #27
Source File: swda.py    From swda with GNU General Public License v2.0
def __wn_lemmatize(self, lemma):
        """
        Lemmatize lemma using nltk.stem.WordNetLemmatizer(). Always
        returns a (string, pos) pair.  Lemmatizes even when the tag
        isn't helpful, by ignoring it for stemming.
        """
        string, tag = lemma
        wnl = WordNetLemmatizer()
        if tag in ('a', 'n', 'r', 'v'):
            string = wnl.lemmatize(string, tag)
        else:
            string = wnl.lemmatize(string)
        return (string, tag) 
Example #28
Source File: analyze.py    From oie-benchmark with MIT License
def __init__(self):
        """
        Intialize memebers:
        question_dist - generalized-question distribution of the assigned extraction
                        location.
        """
        self.question_dist = defaultdict(lambda : defaultdict(lambda : 0))
        self.lmtzr = WordNetLemmatizer() 
Example #29
Source File: reader.py    From SEDST with MIT License
def __init__(self):
        super().__init__()

        self.entity_dict = {}
        self.abbr_dict = {}

        self.wn = WordNetLemmatizer()

        self.tokenized_data_path = './data/kvret/'
        self._construct(cfg.train, cfg.dev, cfg.test, cfg.entity)
        #self.test = self.train 
Example #30
Source File: 9.2 Email_Classification.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]

    except:
        tokens = tokens
        
    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text