Python nltk.stem.WordNetLemmatizer() Examples

The following are 30 code examples of nltk.stem.WordNetLemmatizer(), extracted from open source projects. Each example lists its source project, author, file, and license. You may also want to check out the other functions and classes available in the nltk.stem module.
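Before the project examples, here is a minimal usage sketch (not drawn from any of the projects below): lemmatize() treats its input as a noun by default and accepts an optional WordNet part-of-speech tag ('n', 'v', 'a', 'r'). It assumes the WordNet corpus has already been downloaded, e.g. via nltk.download('wordnet').

from nltk.stem import WordNetLemmatizer

# Requires the WordNet corpus: nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("corpora"))       # 'corpus'  (POS defaults to 'n', i.e. noun)
print(lemmatizer.lemmatize("running"))       # 'running' (still treated as a noun)
print(lemmatizer.lemmatize("running", "v"))  # 'run'     (explicit verb tag)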
Example #1
Source Project: TextLevelGCN   Author: HuangLianzhe   File: pre_processing.py   License: GNU General Public License v3.0
def clean_text(text):
        # stop_words = stopwords.words('english')
        stop_words = []
        stop_words.extend(['!', ',', '.', '?', '-s', '-ly', '</s> ', 's'])
        stemmer = WordNetLemmatizer()

        text = remove_short(text)
        text = clean_str(text)

        text = word_tokenize(text)

        text = [word for word in text if word not in stop_words]

        text = [stemmer.lemmatize(word) for word in text]

        return ' '.join(text) 
Example #2
Source Project: find-all-the-new-words   Author: Steven-AA   File: Article.py   License: MIT License
def real_word(self, word, LEMMATIZATION_flag=True):
        '''
        find the real word
        '''
        p_forword = re.compile('[a-z,A-Z,\',‘]')
        word_s = p_forword.findall(word)
        real_word = ''.join(word_s)#.lower()
        if LEMMATIZATION_flag and self.config['LEMMATIZATION_MODE'] in ['list', 'both']:
            try:
                real_word = self.fix_dic[real_word]
            except Exception as e:
                logger.debug(e)
                pass
        if LEMMATIZATION_flag and self.config['LEMMATIZATION_MODE'] in ['NLTK', 'both']:
            wordnet_lemmatizer = WordNetLemmatizer()
            real_word = wordnet_lemmatizer.lemmatize(real_word)
        logger.debug(word+'-->'+real_word)
        return real_word 
Example #3
Source Project: SEDST   Author: AuCson   File: reader.py   License: MIT License
def __init__(self):
        super().__init__()

        self.entity_dict = {}
        self.abbr_dict = {}

        self.wn = WordNetLemmatizer()

        self.tokenized_data_path = './data/kvret/'
        self._construct(cfg.train, cfg.dev, cfg.test, cfg.entity)
        #self.test = self.train 
Example #4
Source Project: ConvLab   Author: ConvLab   File: reader.py   License: MIT License
def __init__(self):
        super().__init__()

        self.entity_dict = {}
        self.abbr_dict = {}

        self.wn = WordNetLemmatizer()
        self.db = {}

        self.tokenized_data_path = './data/kvret/'
        self._construct(cfg.train, cfg.dev, cfg.test, cfg.entity) 
Example #5
Source Project: pdtb2   Author: cgpotts   File: pdtb2.py   License: GNU General Public License v2.0
def __lemmatize(self, lemma):
        """
        Internal method used for applying nltk.stem.WordNetLemmatizer() to the (word, pos) pair lemma.
        """
        string, tag = lemma
        if tag in ('a', 'n', 'r', 'v'):        
            wnl = WordNetLemmatizer()
            string = wnl.lemmatize(string, tag)
        return (string, tag)

Example #6
Source Project: causal-text-embeddings   Author: blei-lab   File: helpers.py   License: MIT License
def __init__(self):
		self.wnl = WordNetLemmatizer() 
Example #7
Source Project: causal-text-embeddings   Author: blei-lab   File: helpers.py   License: MIT License
def __init__(self):
		self.wnl = WordNetLemmatizer() 
Example #8
Source Project: causal-text-embeddings   Author: blei-lab   File: helpers.py   License: MIT License
def __init__(self):
		self.wnl = WordNetLemmatizer() 
Example #9
Source Project: OpenBookQA   Author: allenai   File: rank_knowledge_for_mc_qa.py   License: Apache License 2.0
def __init__(self):
        self.wnl = WordNetLemmatizer() 
Example #10
Source Project: swda   Author: cgpotts   File: swda.py   License: GNU General Public License v2.0
def __wn_lemmatize(self, lemma):
        """
        Lemmatize lemma using nltk.stem.WordNetLemmatizer(). Always
        returns a (string, pos) pair.  Lemmatizes even when the tag
        isn't helpful, by ignoring it for stemming.
        """
        string, tag = lemma
        wnl = WordNetLemmatizer()
        if tag in ('a', 'n', 'r', 'v'):
            string = wnl.lemmatize(string, tag)
        else:
            string = wnl.lemmatize(string)
        return (string, tag) 
Example #11
Source Project: oie-benchmark   Author: gabrielStanovsky   File: analyze.py   License: MIT License
def __init__(self):
        """
        Initialize members:
        question_dist - generalized-question distribution of the assigned extraction
                        location.
        """
        self.question_dist = defaultdict(lambda : defaultdict(lambda : 0))
        self.lmtzr = WordNetLemmatizer() 
Example #12
Source Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: 9.5 Skipgram_Keras.py   License: MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text 
Example #13
Source Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: 9.2 Email_Classification.py   License: MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]

    except:
        tokens = tokens
        
    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text 
Example #14
Source Project: TextLevelGCN   Author: HuangLianzhe   File: pre_processing.py   License: GNU General Public License v3.0
def stem_corpus():
    stemmer = WordNetLemmatizer()

    with open('data/mr/text_train.txt') as f:
        raw_text = f.read()

    with open('data/mr/label_train.txt') as f:
        raw_labels = f.read()

    labels = []
    for raw_label in raw_labels.split('\n'):
        if raw_label == '1':
            labels.append('pos')
        elif raw_label == '0':
            labels.append('neg')
        else:
            if len(raw_label) == 0:
                continue
            raise ValueError(raw_label)

    corpus = raw_text.split('\n')
    corpus = [clean_str(doc) for doc in corpus]
    corpus = [remove_short(doc) for doc in corpus]

    tokenized_corpus = [word_tokenize(doc) for doc in corpus]

    results = []

    for line in tokenized_corpus:
        results.append(' '.join([stemmer.lemmatize(word) for word in line]))

    results = list(zip(labels, results))
    results = ['\t'.join(line) for line in results]
    random.shuffle(results)

    with open('data/mr/mr-train-stemmed.txt', 'w') as f:
        f.write('\n'.join(results)) 
Example #15
Source Project: EntityDuetNeuralRanking   Author: thunlp   File: cmns.py   License: MIT License
def _phrase_stem(cls, phrase):
        wnl = WordNetLemmatizer()
        l_term = phrase.split()
        l_term = [wnl.lemmatize(term, 'n') for term in l_term]
        return ' '.join(l_term) 
Example #16
Source Project: ResumeParser   Author: OmkarPathak   File: utils.py   License: MIT License
def extract_experience(resume_text):
    '''
    Helper function to extract experience from resume text

    :param resume_text: Plain resume text
    :return: list of experience
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # word tokenization 
    word_tokens = nltk.word_tokenize(resume_text)

    # remove stop words and lemmatize  
    filtered_sentence = [w for w in word_tokens if not w in stop_words and wordnet_lemmatizer.lemmatize(w) not in stop_words] 
    sent = nltk.pos_tag(filtered_sentence)

    # parse regex
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cs = cp.parse(sent)
    
    # for i in cs.subtrees(filter=lambda x: x.label() == 'P'):
    #     print(i)
    
    test = []
    
    for vp in list(cs.subtrees(filter=lambda x: x.label()=='P')):
        test.append(" ".join([i[0] for i in vp.leaves() if len(vp.leaves()) >= 2]))

    # Search the word 'experience' in the chunk and then print out the text after it
    x = [x[x.lower().index('experience') + 10:] for i, x in enumerate(test) if x and 'experience' in x.lower()]
    return x 
Example #17
Source Project: pliers   Author: tyarkoni   File: text.py   License: BSD 3-Clause "New" or "Revised" License
def __init__(self, stemmer='porter', tokenize=True, case_sensitive=False,
                 *args, **kwargs):
        if isinstance(stemmer, str):
            if stemmer not in self._stemmers:
                valid = list(self._stemmers.keys())
                raise ValueError("Invalid stemmer '%s'; please use one of %s."
                                 % (stemmer, valid))
            stemmer = getattr(stem, self._stemmers[stemmer])(*args, **kwargs)
        elif not isinstance(stemmer, (stem.StemmerI, stem.WordNetLemmatizer)):
            raise ValueError("stemmer must be either a valid string, or an "
                             "instance of class StemmerI.")
        self.stemmer = stemmer
        self.tokenize = tokenize
        self.case_sensitive = case_sensitive
        super().__init__() 
Example #18
Source Project: pliers   Author: tyarkoni   File: text.py   License: BSD 3-Clause "New" or "Revised" License
def _filter(self, stim):
        pos_map = {
            'ADJ': 'a',
            'ADJ_SAT': 's',
            'ADV': 'r',
            'NOUN': 'n',
            'VERB': 'v'
        }

        def pos_wordnet(txt):
            pos_tagged = dict(nltk.pos_tag(txt, tagset='universal'))
            pos_tagged = {t: pos_map[tag] if tag in pos_map else 'n'
                          for t, tag in pos_tagged.items()}
            return pos_tagged

        tokens = [stim.text]
        if self.tokenize:
            tokens = nltk.word_tokenize(tokens[0])
        tokens = [t if self.case_sensitive else t.lower() for t in tokens]
        if not isinstance(self.stemmer, stem.WordNetLemmatizer):
            stemmed = ' '.join([self.stemmer.stem(t) for t in tokens])
        else:
            pos_tagged = pos_wordnet(tokens)
            stemmed = ' '.join([self.stemmer.lemmatize(t, pos=pos_tagged[t])
                                for t in tokens])
        return TextStim(stim.filename, stemmed, stim.onset, stim.duration,
                        stim.order, stim.url) 
Example #19
Source Project: tatk   Author: thu-coai   File: reader.py   License: Apache License 2.0
def __init__(self):
        super().__init__()

        self.entity_dict = {}
        self.abbr_dict = {}

        self.wn = WordNetLemmatizer()
        self.db = {}

        self.tokenized_data_path = './data/kvret/'
        self._construct(cfg.train, cfg.dev, cfg.test, cfg.entity) 
Example #20
Source Project: LDA_RecEngine   Author: easonchan1213   File: LDAModel_English.py   License: Apache License 2.0
def __tokenizeWholeCorpora(self,pathToCorpora):
	    print 'Start tokenizing the corpora: %s' % (pathToCorpora)
	    punct = re.compile('[%s]' % re.escape(string.punctuation))
	    wnl = WordNetLemmatizer()
	    doc_count=0
	    train_set = []
	    doc_mapping = {}
	    link_mapping = {}

	    for f in glob(pathToCorpora+'/*'):
	            filereader = open(f, 'r')
	            article = filereader.readlines();filereader.close()
	            text = ''
	            try:
	            	link = article[0]
	            	title = article[1]
	            	text = article[2].lower()
	            except IndexError:
	            	continue

	            # Skip document length < min_length
	            if len(text) < self.min_length:
	                continue
	            text = punct.sub("",text)  # Remove all punctuations
	            tokens = nltk.word_tokenize(text)  # Tokenize the whole text
	            # Lemmatize every word and add to tokens list if the word is not in stopword
	            train_set.append([wnl.lemmatize(word) for word in tokens if word not in self.stopword]) 
	            # Build doc-mapping
	            doc_mapping[doc_count] = title
	            link_mapping[doc_count] = link
	            doc_count = doc_count+1
	            if doc_count % 10000 == 0:
	            	print 'Have processed %i documents' % (doc_count)

	    print 'Finished tokenizing the corpora: %s' % (pathToCorpora)
	    return doc_count,train_set,doc_mapping,link_mapping 
Example #21
Source Project: yelp   Author: melqkiades   File: belford_tfidf.py   License: GNU Lesser General Public License v2.1
def preprocess(
        docs, stopwords, min_df=3, min_term_length=2,
        ngram_range=(1, 1), apply_tfidf=True, apply_norm=True,
        lemmatize=False):
    """
    Preprocess a list containing text documents stored as strings.
    """
    token_pattern = re.compile(r"\b\w\w+\b", re.U)

    if lemmatize:
        from nltk.stem import WordNetLemmatizer
        wnl = WordNetLemmatizer()

    def normalize(x):
        x = x.lower()
        if lemmatize:
            return wnl.lemmatize(x)
        return x

    def custom_tokenizer(s):
        return [normalize(x) for x in token_pattern.findall(s) if (len(x) >= min_term_length and x[0].isalpha() ) ]

    # Build the Vector Space Model, apply TF-IDF and normalize lines to unit
    # length all in one call
    if apply_norm:
        norm_function = "l2"
    else:
        norm_function = None
    tfidf = TfidfVectorizer(
        stop_words=stopwords, lowercase=True, strip_accents="unicode",
        tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function,
        min_df=min_df, ngram_range=ngram_range)
    X = tfidf.fit_transform(docs)
    terms = []
    # store the vocabulary map
    v = tfidf.vocabulary_
    for i in range(len(v)):
        terms.append("")
    for term in v.keys():
        terms[v[term]] = term
    return (X, terms, tfidf) 
Example #22
Source Project: Deep_Enhanced_Repr_for_IDRR   Author: hxbai   File: pdtb2.py   License: MIT License
def __lemmatize(self, lemma):
        """
        Internal method used for applying nltk.stem.WordNetLemmatizer() to the (word, pos) pair lemma.
        """
        string, tag = lemma
        if tag in ('a', 'n', 'r', 'v'):        
            wnl = WordNetLemmatizer()
            string = wnl.lemmatize(string, tag)
        return (string, tag)

Example #23
Source Project: find-all-the-new-words   Author: Steven-AA   File: dictionary.py   License: MIT License
def eudic(word):
    url = "https://dict.eudic.net/dicts/en/"+word
    headers = {"authority":"dict.eudic.net",
    "content-type":"application/x-www-form-urlencoded",
    "origin": "https://dict.eudic.net",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
            }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text,"lxml")
    try:
        a = soup.find_all(id="ExpFCChild")[0].get_text("\t\n")
    except Exception as e:
        logger.warning(word+"  "+str(e))
        from nltk.stem import WordNetLemmatizer
        wordnet_lemmatizer = WordNetLemmatizer()
        word = wordnet_lemmatizer.lemmatize(word)
        url = "https://dict.eudic.net/dicts/en/"+word
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text,"lxml")
        try:
            a = soup.find_all(id="ExpFCChild")[0].get_text("\t\n")
            a = word+"\n\n"+a
        except Exception as e:
            logger.warning("fix failed  "+word+"  "+str(e))
            return ""
    if a[0] == u"赞":
        a = a[a.index(")")+3:]
    a = fix_result(a)
    a = a.replace(".\t\n",".\t")
    return a 
Example #24
Source Project: topic-ensemble   Author: derekgreene   File: util.py   License: Apache License 2.0
def preprocess( docs, stopwords, min_df = 3, min_term_length = 2, ngram_range = (1,1), apply_tfidf = True, apply_norm = True, lemmatize = False ):
	"""
	Preprocess a list containing text documents stored as strings.
	"""
	token_pattern = re.compile(r"\b\w\w+\b", re.U)

	if lemmatize:
		from nltk.stem import WordNetLemmatizer
		wnl = WordNetLemmatizer()

	def normalize( x ):
		x = x.lower()
		if lemmatize:
			return wnl.lemmatize(x)
		return x

	def custom_tokenizer( s ):
		return [normalize(x) for x in token_pattern.findall(s) if (len(x) >= min_term_length and x[0].isalpha() ) ]

	# Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
	if apply_norm:
		norm_function = "l2"
	else:
		norm_function = None
	tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode", tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function, min_df = min_df, ngram_range = ngram_range) 
	X = tfidf.fit_transform(docs)
	terms = []
	# store the vocabulary map
	v = tfidf.vocabulary_
	for i in range(len(v)):
		terms.append("")
	for term in v.keys():
		terms[ v[term] ] = term
	return (X,terms) 
Example #25
Source Project: TextRank   Author: naiveHobo   File: preprocessing.py   License: MIT License
def __init__(self):
        self.STOPWORDS = TextProcessor.__load_stopwords(path="../stopwords.txt")
        self.LEMMATIZER = WordNetLemmatizer()
        self.STEMMER = SnowballStemmer("english")
        self.PUNCTUATION = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
        self.NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
        self.PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE) 
Example #26
Source Project: bert-sense   Author: uhh-lt   File: BERT_Model.py   License: MIT License
def __init__(self, device_number = 'cuda:2', use_cuda=True):
        
        self.device_number = device_number
        self.use_cuda = use_cuda
        self.sense_number_map = {'N':1, 'V':2, 'J':3, 'R':4}
        
        self.Bert_Model = BERT(device_number, use_cuda)
        self.lemmatizer = WordNetLemmatizer() 
Example #27
Source Project: bert-sense   Author: uhh-lt   File: ELMO_Model.py   License: MIT License
def __init__(self):
        
        self.sense_number_map = {'N':1, 'V':2, 'J':3, 'R':4}
        
        self.Elmo_Model = ELMO()
        self.lemmatizer = WordNetLemmatizer() 
Example #28
Source Project: bert-sense   Author: uhh-lt   File: Flair_Model.py   License: MIT License
def __init__(self, device_number = 'cuda:2', use_cuda=True):
        
        self.device_number = device_number
        self.use_cuda = use_cuda
        self.sense_number_map = {'N':1, 'V':2, 'J':3, 'R':4}
        
        self.Flair_Model = Flair(device_number, use_cuda)
        self.lemmatizer = WordNetLemmatizer() 
Example #29
Source Project: Flavor-Network   Author: lingcheng99   File: recipe_cleanup.py   License: GNU General Public License v3.0
def split_ingr(x):
    wnl=WordNetLemmatizer()
    cleanlist=[]
    lst = x.strip('[]').split(',')
    cleanlist=[' '.join(wnl.lemmatize(word.lower()) for word in word_tokenize(re.sub('[^a-zA-Z]',' ',item))) for item in lst]
    return cleanlist

Example #30
Source Project: claf   Author: naver   File: exact_match_indexer.py   License: MIT License
def __init__(self, tokenizer, lower=True, lemma=True):
        super(ExactMatchIndexer, self).__init__(tokenizer)

        self.param_key = "question"
        self.lemmatizer = WordNetLemmatizer()

        self.lower = lower
        self.lemma = lemma