Python nltk.stem.WordNetLemmatizer() Examples

The following are 30 code examples showing how to use nltk.stem.WordNetLemmatizer(). These examples are extracted from open source projects; the originating project, author, file, and license are listed above each example.


You may also want to check out all available functions and classes of the nltk.stem module.
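Before looking at the project examples, here is a minimal usage sketch (not taken from any of the projects below) showing the two points most of the examples rely on: lemmatize() treats its input as a noun by default, and a WordNet POS tag ('n', 'v', 'a', 'r', 's') can be passed to change that. It assumes the WordNet corpus has already been downloaded.

from nltk.stem import WordNetLemmatizer

# Requires the WordNet data; download it once if missing:
# import nltk; nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# With no POS tag, the word is treated as a noun.
print(lemmatizer.lemmatize('cars'))          # car
print(lemmatizer.lemmatize('running'))       # running (noun reading)

# Supplying a WordNet POS tag changes the lookup.
print(lemmatizer.lemmatize('running', 'v'))  # run
print(lemmatizer.lemmatize('better', 'a'))   # good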

Example 1
Project: TextLevelGCN   Author: HuangLianzhe   File: pre_processing.py    License: GNU General Public License v3.0
def clean_text(text):
        # stop_words = stopwords.words('english')
        stop_words = []
        stop_words.extend(['!', ',' ,'.' ,'?' ,'-s' ,'-ly' ,'</s> ', 's'])
        stemmer = WordNetLemmatizer()

        text = remove_short(text)
        text = clean_str(text)

        text = word_tokenize(text)

        text = [word for word in text if word not in stop_words]

        text = [stemmer.lemmatize(word) for word in text]

        return ' '.join(text) 
Example 2
Project: find-all-the-new-words   Author: Steven-AA   File: Article.py    License: MIT License
def real_word(self, word, LEMMATIZATION_flag=True):
        '''
        find the real word
        '''
        p_forword = re.compile('[a-z,A-Z,\',‘]')
        word_s = p_forword.findall(word)
        real_word = ''.join(word_s)#.lower()
        if LEMMATIZATION_flag and self.config['LEMMATIZATION_MODE'] in ['list', 'both']:
            try:
                real_word = self.fix_dic[real_word]
            except Exception as e:
                logger.debug(e)
                pass
        if LEMMATIZATION_flag and self.config['LEMMATIZATION_MODE'] in ['NLTK', 'both']:
            wordnet_lemmatizer = WordNetLemmatizer()
            real_word = wordnet_lemmatizer.lemmatize(real_word)
        logger.debug(word+'-->'+real_word)
        return real_word 
Example 3
Project: SEDST   Author: AuCson   File: reader.py    License: MIT License
def __init__(self):
        super().__init__()

        self.entity_dict = {}
        self.abbr_dict = {}

        self.wn = WordNetLemmatizer()

        self.tokenized_data_path = './data/kvret/'
        self._construct(cfg.train, cfg.dev, cfg.test, cfg.entity)
        #self.test = self.train 
Example 4
Project: ConvLab   Author: ConvLab   File: reader.py    License: MIT License
def __init__(self):
        super().__init__()

        self.entity_dict = {}
        self.abbr_dict = {}

        self.wn = WordNetLemmatizer()
        self.db = {}

        self.tokenized_data_path = './data/kvret/'
        self._construct(cfg.train, cfg.dev, cfg.test, cfg.entity) 
Example 5
Project: pdtb2   Author: cgpotts   File: pdtb2.py    License: GNU General Public License v2.0
def __lemmatize(self, lemma):
        """
        Internal method used for applying nltk.stem.WordNetLemmatizer() to the (word, pos) pair lemma.
        """
        string, tag = lemma
        if tag in ('a', 'n', 'r', 'v'):        
            wnl = WordNetLemmatizer()
            string = wnl.lemmatize(string, tag)
        return (string, tag)

    ######################################################################    
    # POSITIONING. 
Example 6
Project: causal-text-embeddings   Author: blei-lab   File: helpers.py    License: MIT License
def __init__(self):
		self.wnl = WordNetLemmatizer() 
Example 7
Project: causal-text-embeddings   Author: blei-lab   File: helpers.py    License: MIT License
def __init__(self):
		self.wnl = WordNetLemmatizer() 
Example 8
Project: causal-text-embeddings   Author: blei-lab   File: helpers.py    License: MIT License
def __init__(self):
		self.wnl = WordNetLemmatizer() 
Example 9
Project: OpenBookQA   Author: allenai   File: rank_knowledge_for_mc_qa.py    License: Apache License 2.0
def __init__(self):
        self.wnl = WordNetLemmatizer() 
Example 10
Project: swda   Author: cgpotts   File: swda.py    License: GNU General Public License v2.0
def __wn_lemmatize(self, lemma):
        """
        Lemmatize lemma using nltk.stem.WordNetLemmatizer(). Always
        returns a (string, pos) pair.  Lemmatizes even when the tag
        isn't helpful, by ignoring it for stemming.
        """
        string, tag = lemma
        wnl = WordNetLemmatizer()
        if tag in ('a', 'n', 'r', 'v'):
            string = wnl.lemmatize(string, tag)
        else:
            string = wnl.lemmatize(string)
        return (string, tag) 
Example 11
Project: oie-benchmark   Author: gabrielStanovsky   File: analyze.py    License: MIT License
def __init__(self):
        """
        Initialize members:
        question_dist - generalized-question distribution of the assigned extraction
                        location.
        """
        self.question_dist = defaultdict(lambda : defaultdict(lambda : 0))
        self.lmtzr = WordNetLemmatizer() 
Example 12
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: 9.5 Skipgram_Keras.py    License: MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text 
Example 13
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]

    except:
        tokens = tokens
        
    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text 
Example 14
Project: TextLevelGCN   Author: HuangLianzhe   File: pre_processing.py    License: GNU General Public License v3.0
def stem_corpus():
    stemmer = WordNetLemmatizer()

    with open('data/mr/text_train.txt') as f:
        raw_text = f.read()

    with open('data/mr/label_train.txt') as f:
        raw_labels = f.read()

    labels = []
    for raw_label in raw_labels.split('\n'):
        if raw_label == '1':
            labels.append('pos')
        elif raw_label == '0':
            labels.append('neg')
        else:
            if len(raw_label) == 0:
                continue
            raise ValueError(raw_label)

    corpus = raw_text.split('\n')
    corpus = [clean_str(doc) for doc in corpus]
    corpus = [remove_short(doc) for doc in corpus]

    tokenized_corpus = [word_tokenize(doc) for doc in corpus]

    results = []

    for line in tokenized_corpus:
        results.append(' '.join([stemmer.lemmatize(word) for word in line]))

    results = list(zip(labels, results))
    results = ['\t'.join(line) for line in results]
    random.shuffle(results)

    with open('data/mr/mr-train-stemmed.txt', 'w') as f:
        f.write('\n'.join(results)) 
Example 15
Project: EntityDuetNeuralRanking   Author: thunlp   File: cmns.py    License: MIT License
def _phrase_stem(cls, phrase):
        wnl = WordNetLemmatizer()
        l_term = phrase.split()
        l_term = [wnl.lemmatize(term, 'n') for term in l_term]
        return ' '.join(l_term) 
Example 16
Project: ResumeParser   Author: OmkarPathak   File: utils.py    License: MIT License
def extract_experience(resume_text):
    '''
    Helper function to extract experience from resume text

    :param resume_text: Plain resume text
    :return: list of experience
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # word tokenization 
    word_tokens = nltk.word_tokenize(resume_text)

    # remove stop words and lemmatize  
    filtered_sentence = [w for w in word_tokens if not w in stop_words and wordnet_lemmatizer.lemmatize(w) not in stop_words] 
    sent = nltk.pos_tag(filtered_sentence)

    # parse regex
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cs = cp.parse(sent)
    
    # for i in cs.subtrees(filter=lambda x: x.label() == 'P'):
    #     print(i)
    
    test = []
    
    for vp in list(cs.subtrees(filter=lambda x: x.label()=='P')):
        test.append(" ".join([i[0] for i in vp.leaves() if len(vp.leaves()) >= 2]))

    # Search the word 'experience' in the chunk and then print out the text after it
    x = [x[x.lower().index('experience') + 10:] for i, x in enumerate(test) if x and 'experience' in x.lower()]
    return x 
Example 17
Project: pliers   Author: tyarkoni   File: text.py    License: BSD 3-Clause "New" or "Revised" License
def __init__(self, stemmer='porter', tokenize=True, case_sensitive=False,
                 *args, **kwargs):
        if isinstance(stemmer, str):
            if stemmer not in self._stemmers:
                valid = list(self._stemmers.keys())
                raise ValueError("Invalid stemmer '%s'; please use one of %s."
                                 % (stemmer, valid))
            stemmer = getattr(stem, self._stemmers[stemmer])(*args, **kwargs)
        elif not isinstance(stemmer, (stem.StemmerI, stem.WordNetLemmatizer)):
            raise ValueError("stemmer must be either a valid string, or an "
                             "instance of class StemmerI.")
        self.stemmer = stemmer
        self.tokenize = tokenize
        self.case_sensitive = case_sensitive
        super().__init__() 
Example 18
Project: pliers   Author: tyarkoni   File: text.py    License: BSD 3-Clause "New" or "Revised" License
def _filter(self, stim):
        pos_map = {
            'ADJ': 'a',
            'ADJ_SAT': 's',
            'ADV': 'r',
            'NOUN': 'n',
            'VERB': 'v'
        }

        def pos_wordnet(txt):
            pos_tagged = dict(nltk.pos_tag(txt, tagset='universal'))
            pos_tagged = {t: pos_map[tag] if tag in pos_map else 'n'
                          for t, tag in pos_tagged.items()}
            return pos_tagged

        tokens = [stim.text]
        if self.tokenize:
            tokens = nltk.word_tokenize(tokens[0])
        tokens = [t if self.case_sensitive else t.lower() for t in tokens]
        if not isinstance(self.stemmer, stem.WordNetLemmatizer):
            stemmed = ' '.join([self.stemmer.stem(t) for t in tokens])
        else:
            pos_tagged = pos_wordnet(tokens)
            stemmed = ' '.join([self.stemmer.lemmatize(t, pos=pos_tagged[t])
                                for t in tokens])
        return TextStim(stim.filename, stemmed, stim.onset, stim.duration,
                        stim.order, stim.url) 
Example 19
Project: tatk   Author: thu-coai   File: reader.py    License: Apache License 2.0
def __init__(self):
        super().__init__()

        self.entity_dict = {}
        self.abbr_dict = {}

        self.wn = WordNetLemmatizer()
        self.db = {}

        self.tokenized_data_path = './data/kvret/'
        self._construct(cfg.train, cfg.dev, cfg.test, cfg.entity) 
Example 20
Project: LDA_RecEngine   Author: easonchan1213   File: LDAModel_English.py    License: Apache License 2.0
def __tokenizeWholeCorpora(self,pathToCorpora):
	    print 'Start tokenizing the corpora: %s' % (pathToCorpora)
	    punct = re.compile('[%s]' % re.escape(string.punctuation))
	    wnl = WordNetLemmatizer()
	    doc_count=0
	    train_set = []
	    doc_mapping = {}
	    link_mapping = {}

	    for f in glob(pathToCorpora+'/*'):
	            filereader = open(f, 'r')
	            article = filereader.readlines();filereader.close()
	            text = ''
	            try:
	            	link = article[0]
	            	title = article[1]
	            	text = article[2].lower()
	            except IndexError:
	            	continue

	            # Skip document length < min_length
	            if len(text) < self.min_length:
	                continue
	            text = punct.sub("",text)  # Remove all punctuations
	            tokens = nltk.word_tokenize(text)  # Tokenize the whole text
	            # Lemmatize every word and add to tokens list if the word is not in stopword
	            train_set.append([wnl.lemmatize(word) for word in tokens if word not in self.stopword]) 
	            # Build doc-mapping
	            doc_mapping[doc_count] = title
	            link_mapping[doc_count] = link
	            doc_count = doc_count+1
	            if doc_count % 10000 == 0:
	            	print 'Have processed %i documents' % (doc_count)

	    print 'Finished tokenizing the corpora: %s' % (pathToCorpora)
	    return doc_count,train_set,doc_mapping,link_mapping 
Example 21
Project: yelp   Author: melqkiades   File: belford_tfidf.py    License: GNU Lesser General Public License v2.1
def preprocess(
        docs, stopwords, min_df=3, min_term_length=2,
        ngram_range=(1, 1), apply_tfidf=True, apply_norm=True,
        lemmatize=False):
    """
    Preprocess a list containing text documents stored as strings.
    """
    token_pattern = re.compile(r"\b\w\w+\b", re.U)

    if lemmatize:
        from nltk.stem import WordNetLemmatizer
        wnl = WordNetLemmatizer()

    def normalize(x):
        x = x.lower()
        if lemmatize:
            return wnl.lemmatize(x)
        return x

    def custom_tokenizer(s):
        return [normalize(x) for x in token_pattern.findall(s) if (len(x) >= min_term_length and x[0].isalpha() ) ]

    # Build the Vector Space Model, apply TF-IDF and normalize lines to unit
    # length all in one call
    if apply_norm:
        norm_function = "l2"
    else:
        norm_function = None
    tfidf = TfidfVectorizer(
        stop_words=stopwords, lowercase=True, strip_accents="unicode",
        tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function,
        min_df=min_df, ngram_range=ngram_range)
    X = tfidf.fit_transform(docs)
    terms = []
    # store the vocabulary map
    v = tfidf.vocabulary_
    for i in range(len(v)):
        terms.append("")
    for term in v.keys():
        terms[v[term]] = term
    return (X, terms, tfidf) 
Example 22
Project: Deep_Enhanced_Repr_for_IDRR   Author: hxbai   File: pdtb2.py    License: MIT License
def __lemmatize(self, lemma):
        """
        Internal method used for applying nltk.stem.WordNetLemmatizer() to the (word, pos) pair lemma.
        """
        string, tag = lemma
        if tag in ('a', 'n', 'r', 'v'):        
            wnl = WordNetLemmatizer()
            string = wnl.lemmatize(string, tag)
        return (string, tag)

    ######################################################################    
    # POSITIONING. 
Example 23
Project: find-all-the-new-words   Author: Steven-AA   File: dictionary.py    License: MIT License
def eudic(word):
    url = "https://dict.eudic.net/dicts/en/"+word
    headers = {"authority":"dict.eudic.net",
    "content-type":"application/x-www-form-urlencoded",
    "origin": "https://dict.eudic.net",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
            }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text,"lxml")
    try:
        a = soup.find_all(id="ExpFCChild")[0].get_text("\t\n")
    except Exception as e:
        logger.warning(word+"  "+str(e))
        from nltk.stem import WordNetLemmatizer
        wordnet_lemmatizer = WordNetLemmatizer()
        word = wordnet_lemmatizer.lemmatize(word)
        url = "https://dict.eudic.net/dicts/en/"+word
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text,"lxml")
        try:
            a = soup.find_all(id="ExpFCChild")[0].get_text("\t\n")
            a = word+"\n\n"+a
        except Exception as e:
            logger.warning("fix failed  "+word+"  "+str(e))
            return ""
    if a[0] == u"赞":
        a = a[a.index(")")+3:]
    a = fix_result(a)
    a = a.replace(".\t\n",".\t")
    return a 
Example 24
Project: topic-ensemble   Author: derekgreene   File: util.py    License: Apache License 2.0
def preprocess( docs, stopwords, min_df = 3, min_term_length = 2, ngram_range = (1,1), apply_tfidf = True, apply_norm = True, lemmatize = False ):
	"""
	Preprocess a list containing text documents stored as strings.
	"""
	token_pattern = re.compile(r"\b\w\w+\b", re.U)

	if lemmatize:
		from nltk.stem import WordNetLemmatizer
		wnl = WordNetLemmatizer()

	def normalize( x ):
		x = x.lower()
		if lemmatize:
			return wnl.lemmatize(x)
		return x

	def custom_tokenizer( s ):
		return [normalize(x) for x in token_pattern.findall(s) if (len(x) >= min_term_length and x[0].isalpha() ) ]

	# Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
	if apply_norm:
		norm_function = "l2"
	else:
		norm_function = None
	tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode", tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function, min_df = min_df, ngram_range = ngram_range) 
	X = tfidf.fit_transform(docs)
	terms = []
	# store the vocabulary map
	v = tfidf.vocabulary_
	for i in range(len(v)):
		terms.append("")
	for term in v.keys():
		terms[ v[term] ] = term
	return (X,terms) 
Example 25
Project: TextRank   Author: naiveHobo   File: preprocessing.py    License: MIT License
def __init__(self):
        self.STOPWORDS = TextProcessor.__load_stopwords(path="../stopwords.txt")
        self.LEMMATIZER = WordNetLemmatizer()
        self.STEMMER = SnowballStemmer("english")
        self.PUNCTUATION = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
        self.NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
        self.PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE) 
Example 26
Project: bert-sense   Author: uhh-lt   File: BERT_Model.py    License: MIT License
def __init__(self, device_number = 'cuda:2', use_cuda=True):
        
        self.device_number = device_number
        self.use_cuda = use_cuda
        self.sense_number_map = {'N':1, 'V':2, 'J':3, 'R':4}
        
        self.Bert_Model = BERT(device_number, use_cuda)
        self.lemmatizer = WordNetLemmatizer() 
Example 27
Project: bert-sense   Author: uhh-lt   File: ELMO_Model.py    License: MIT License
def __init__(self):
        
        self.sense_number_map = {'N':1, 'V':2, 'J':3, 'R':4}
        
        self.Elmo_Model = ELMO()
        self.lemmatizer = WordNetLemmatizer() 
Example 28
Project: bert-sense   Author: uhh-lt   File: Flair_Model.py    License: MIT License
def __init__(self, device_number = 'cuda:2', use_cuda=True):
        
        self.device_number = device_number
        self.use_cuda = use_cuda
        self.sense_number_map = {'N':1, 'V':2, 'J':3, 'R':4}
        
        self.Flair_Model = Flair(device_number, use_cuda)
        self.lemmatizer = WordNetLemmatizer() 
Example 29
Project: Flavor-Network   Author: lingcheng99   File: recipe_cleanup.py    License: GNU General Public License v3.0
def split_ingr(x):
    wnl=WordNetLemmatizer()
    cleanlist=[]
    lst = x.strip('[]').split(',')
    cleanlist=[' '.join(wnl.lemmatize(word.lower()) for word in word_tokenize(re.sub('[^a-zA-Z]',' ',item))) for item in lst]
    return cleanlist

#remove low-information words from ingredients, could use more 
Example 30
Project: claf   Author: naver   File: exact_match_indexer.py    License: MIT License
def __init__(self, tokenizer, lower=True, lemma=True):
        super(ExactMatchIndexer, self).__init__(tokenizer)

        self.param_key = "question"
        self.lemmatizer = WordNetLemmatizer()

        self.lower = lower
        self.lemma = lemma