Python nltk.stem.WordNetLemmatizer() Examples
The following are 30 code examples of nltk.stem.WordNetLemmatizer().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module nltk.stem, or try the search function.
Example #1
Source File: pre_processing.py From TextLevelGCN with GNU General Public License v3.0 | 7 votes |
def clean_text(text):
    """Normalise *text* and return its lemmatised tokens joined by spaces.

    The text is passed through ``remove_short`` and ``clean_str``, tokenised
    with ``word_tokenize``, stripped of a small fixed list of
    punctuation/suffix tokens, and every surviving token is lemmatised.
    """
    # stop_words = stopwords.words('english')
    ignored_tokens = ['!', ',', '.', '?', '-s', '-ly', '</s> ', 's']
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(clean_str(remove_short(text)))
    kept = [token for token in tokens if token not in ignored_tokens]
    return ' '.join(lemmatizer.lemmatize(token) for token in kept)
Example #2
Source File: Article.py From find-all-the-new-words with MIT License | 6 votes |
def real_word(self, word, LEMMATIZATION_flag=True):
    """Reduce *word* to its letters/apostrophes and optionally lemmatize it.

    Args:
        word: Raw token, possibly carrying punctuation or digits.
        LEMMATIZATION_flag: When false, only the character filtering is
            applied and both lemmatization stages are skipped.

    Returns:
        The filtered word, after the ``self.fix_dic`` lookup (modes
        ``'list'`` / ``'both'``) and/or WordNet lemmatization (modes
        ``'NLTK'`` / ``'both'``) selected by
        ``self.config['LEMMATIZATION_MODE']``.
    """
    # Commas inside a character class are literal characters, so the old
    # pattern '[a-z,A-Z,\',‘]' let commas through ("word," stayed "word,"),
    # which then missed the dictionary lookup and the lemmatizer. Keep only
    # ASCII letters, the apostrophe and the opening curly quote.
    kept_chars = re.findall("[a-zA-Z'‘]", word)
    real_word = ''.join(kept_chars)  # .lower()

    # The configuration is only consulted when lemmatization is requested.
    mode = self.config['LEMMATIZATION_MODE'] if LEMMATIZATION_flag else None

    if mode in ('list', 'both'):
        try:
            real_word = self.fix_dic[real_word]
        except Exception as e:
            # Best effort: a word missing from the fix dictionary is kept
            # unchanged and the failure is only logged.
            logger.debug(e)
    if mode in ('NLTK', 'both'):
        real_word = WordNetLemmatizer().lemmatize(real_word)

    logger.debug(word+'-->'+real_word)
    return real_word
Example #3
Source File: Flair_Model.py From bert-sense with MIT License | 5 votes |
def __init__(self, device_number = 'cuda:2', use_cuda=True):
    """Store the device settings and create the Flair model and lemmatizer.

    ``device_number`` and ``use_cuda`` are kept on the instance and passed
    unchanged to ``Flair``.
    """
    self.device_number = device_number
    self.use_cuda = use_cuda

    # Single-letter tag -> integer sense number
    # (presumably POS initials; confirm against the callers).
    self.sense_number_map = dict(N=1, V=2, J=3, R=4)

    self.Flair_Model = Flair(device_number, use_cuda)
    self.lemmatizer = WordNetLemmatizer()
Example #4
Source File: belford_tfidf.py From yelp with GNU Lesser General Public License v2.1 | 5 votes |
def preprocess(
        docs, stopwords, min_df=3, min_term_length=2, ngram_range=(1, 1),
        apply_tfidf=True, apply_norm=True, lemmatize=False):
    """Vectorise a list of document strings with ``TfidfVectorizer``.

    Returns:
        A tuple ``(X, terms, tfidf)``: the result of ``fit_transform(docs)``,
        the vocabulary as a list ordered by column index, and the fitted
        vectorizer itself.
    """
    word_pattern = re.compile(r"\b\w\w+\b", re.U)

    if lemmatize:
        from nltk.stem import WordNetLemmatizer
        wnl = WordNetLemmatizer()

    def normalize(x):
        lowered = x.lower()
        return wnl.lemmatize(lowered) if lemmatize else lowered

    def custom_tokenizer(s):
        return [
            normalize(candidate)
            for candidate in word_pattern.findall(s)
            if len(candidate) >= min_term_length and candidate[0].isalpha()
        ]

    # NOTE(review): custom_tokenizer is built above but the vectorizer is
    # given tokenizer=None, so min_term_length and lemmatize do not reach
    # the vectorizer -- confirm whether that is intentional.

    # Build the Vector Space Model, apply TF-IDF and normalize lines to unit
    # length all in one call
    norm_function = "l2" if apply_norm else None
    tfidf = TfidfVectorizer(
        stop_words=stopwords,
        lowercase=True,
        strip_accents="unicode",
        tokenizer=None,
        use_idf=apply_tfidf,
        norm=norm_function,
        min_df=min_df,
        ngram_range=ngram_range)
    X = tfidf.fit_transform(docs)

    # Invert the term -> column-index map into a list ordered by index.
    vocabulary = tfidf.vocabulary_
    terms = [""] * len(vocabulary)
    for term, column in vocabulary.items():
        terms[column] = term
    return (X, terms, tfidf)
Example #5
Source File: pdtb2.py From Deep_Enhanced_Repr_for_IDRR with MIT License | 5 votes |
def __lemmatize(self, lemma):
    """
    Internal method that applies nltk.stem.WordNetLemmatizer to the
    (word, pos) pair lemma.

    Pairs whose tag is not one of 'a', 'n', 'r', 'v' are returned
    unchanged, without creating a lemmatizer.
    """
    word, pos = lemma
    if pos not in ('a', 'n', 'r', 'v'):
        return (word, pos)
    return (WordNetLemmatizer().lemmatize(word, pos), pos)

######################################################################
# POSITIONING.
Example #6
Source File: dictionary.py From find-all-the-new-words with MIT License | 5 votes |
def eudic(word):
    """Look up *word* on dict.eudic.net and return the cleaned explanation.

    If the page for *word* has no element with id ``ExpFCChild``, the word
    is lemmatized with NLTK's ``WordNetLemmatizer`` and looked up once more;
    in that case the result is prefixed with the lemma and a blank line.

    Returns:
        The explanation text after ``fix_result`` post-processing, or ``""``
        when neither lookup yields an explanation.
    """
    base_url = "https://dict.eudic.net/dicts/en/"
    headers = {"authority":"dict.eudic.net",
               "content-type":"application/x-www-form-urlencoded",
               "origin": "https://dict.eudic.net",
               "upgrade-insecure-requests": "1",
               "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
               }

    def fetch_page(term):
        # Pass the headers by keyword: the second positional argument of
        # requests.get() is ``params``, which would put these values in the
        # query string instead of sending them as HTTP headers.
        response = requests.get(base_url + term, headers=headers)
        return BeautifulSoup(response.text, "lxml")

    def explanation_of(page):
        return page.find_all(id="ExpFCChild")[0].get_text("\t\n")

    # Network errors are deliberately left outside the try blocks so they
    # propagate to the caller; only a missing explanation triggers the retry.
    page = fetch_page(word)
    try:
        a = explanation_of(page)
    except Exception as e:
        logger.warning(word+" "+str(e))
        from nltk.stem import WordNetLemmatizer
        word = WordNetLemmatizer().lemmatize(word)
        page = fetch_page(word)
        try:
            a = word+"\n\n"+explanation_of(page)
        except Exception as e:
            logger.warning("fix failed "+word+" "+str(e))
            return ""

    # startswith() instead of a[0] so an empty explanation does not raise
    # IndexError.
    if a.startswith(u"赞"):
        a = a[a.index(")")+3:]
    a = fix_result(a)
    a = a.replace(".\t\n",".\t")
    return a
Example #7
Source File: util.py From topic-ensemble with Apache License 2.0 | 5 votes |
def preprocess( docs, stopwords, min_df = 3, min_term_length = 2, ngram_range = (1,1), apply_tfidf = True, apply_norm = True, lemmatize = False ):
    """
    Turn a list of document strings into a TF-IDF document-term matrix.

    Tokens are runs of two or more word characters; a token is kept when it
    has at least ``min_term_length`` characters and starts with a letter,
    and is lower-cased (and lemmatized when ``lemmatize`` is true).

    Returns the pair ``(X, terms)``: the result of ``fit_transform(docs)``
    and the vocabulary as a list ordered by column index.
    """
    word_pattern = re.compile(r"\b\w\w+\b", re.U)

    if lemmatize:
        from nltk.stem import WordNetLemmatizer
        wnl = WordNetLemmatizer()

    def normalize( x ):
        lowered = x.lower()
        if not lemmatize:
            return lowered
        return wnl.lemmatize(lowered)

    def custom_tokenizer( s ):
        kept = []
        for candidate in word_pattern.findall(s):
            if len(candidate) >= min_term_length and candidate[0].isalpha():
                kept.append(normalize(candidate))
        return kept

    # Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
    norm_function = "l2" if apply_norm else None
    tfidf = TfidfVectorizer(
        stop_words=stopwords,
        lowercase=True,
        strip_accents="unicode",
        tokenizer=custom_tokenizer,
        use_idf=apply_tfidf,
        norm=norm_function,
        min_df=min_df,
        ngram_range=ngram_range)
    X = tfidf.fit_transform(docs)

    # store the vocabulary map as a list indexed by column number
    vocabulary = tfidf.vocabulary_
    terms = [""] * len(vocabulary)
    for term, column in vocabulary.items():
        terms[column] = term
    return (X,terms)
Example #8
Source File: preprocessing.py From TextRank with MIT License | 5 votes |
def __init__(self):
    """Load the stop-word list and build the lemmatizer, stemmer and regexes."""
    self.STOPWORDS = TextProcessor.__load_stopwords(path="../stopwords.txt")
    self.LEMMATIZER = WordNetLemmatizer()
    self.STEMMER = SnowballStemmer("english")
    # One or more consecutive characters from string.punctuation.
    self.PUNCTUATION = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
    self.NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
    # Raw string: '\d' and '\w' are invalid escape sequences in a plain
    # literal and trigger a SyntaxWarning on recent Python versions. The
    # compiled pattern is unchanged: runs of word characters that are not
    # digits.
    self.PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)
Example #9
Source File: BERT_Model.py From bert-sense with MIT License | 5 votes |
def __init__(self, device_number = 'cuda:2', use_cuda=True):
    """Store the device settings and create the BERT model and lemmatizer.

    ``device_number`` and ``use_cuda`` are kept on the instance and passed
    unchanged to ``BERT``.
    """
    self.device_number = device_number
    self.use_cuda = use_cuda

    # Single-letter tag -> integer sense number
    # (presumably POS initials; confirm against the callers).
    self.sense_number_map = dict(N=1, V=2, J=3, R=4)

    self.Bert_Model = BERT(device_number, use_cuda)
    self.lemmatizer = WordNetLemmatizer()
Example #10
Source File: ELMO_Model.py From bert-sense with MIT License | 5 votes |
def __init__(self):
    """Create the tag-to-sense-number map, the ELMO model and the lemmatizer."""
    # Single-letter tag -> integer sense number
    # (presumably POS initials; confirm against the callers).
    self.sense_number_map = dict(N=1, V=2, J=3, R=4)

    self.Elmo_Model = ELMO()
    self.lemmatizer = WordNetLemmatizer()
Example #11
Source File: LDAModel_English.py From LDA_RecEngine with Apache License 2.0 | 5 votes |
def __tokenizeWholeCorpora(self,pathToCorpora):
    """Tokenize and lemmatize every document found under *pathToCorpora*.

    Each file is read as lines: the first is used as the link, the second as
    the title and the third (lower-cased) as the text. Files with fewer
    than three lines, or whose text is shorter than ``self.min_length``
    characters, are skipped.

    Returns:
        ``(doc_count, train_set, doc_mapping, link_mapping)`` where
        ``train_set`` holds one list of lemmatized non-stopword tokens per
        kept document, and the two mappings go from the document's running
        index to its title and link respectively.
    """
    # print() with a single pre-formatted argument behaves the same under
    # Python 2 and Python 3; the former print statements were Python 2 only.
    print('Start tokenzing the corpora: %s' % (pathToCorpora))
    punct = re.compile('[%s]' % re.escape(string.punctuation))
    wnl = WordNetLemmatizer()
    doc_count=0
    train_set = []
    doc_mapping = {}
    link_mapping = {}

    for f in glob(pathToCorpora+'/*'):
        # The context manager closes the file even if reading fails.
        with open(f, 'r') as filereader:
            article = filereader.readlines()
        try:
            link = article[0]
            title = article[1]
            text = article[2].lower()
        except IndexError:
            # Fewer than three lines: not a usable document.
            continue

        # Skip document length < min_length
        if len(text) < self.min_length:
            continue
        text = punct.sub("",text)  # Remove all punctuations
        tokens = nltk.word_tokenize(text)  # Tokenize the whole text
        # Lemmatize every word and add to tokens list if the word is not in stopword
        train_set.append([wnl.lemmatize(word) for word in tokens if word not in self.stopword])
        # Build doc-mapping
        doc_mapping[doc_count] = title
        link_mapping[doc_count] = link
        doc_count = doc_count+1
        if doc_count % 10000 == 0:
            print('Have processed %i documents' % (doc_count))

    print('Finished tokenzing the copora: %s' % (pathToCorpora))
    return doc_count,train_set,doc_mapping,link_mapping
Example #12
Source File: recipe_cleanup.py From Flavor-Network with GNU General Public License v3.0 | 5 votes |
def split_ingr(x):
    """Split a bracketed, comma-separated ingredient string into clean names.

    Surrounding ``[`` / ``]`` characters are stripped, the string is split
    on commas, and in every item each non-letter character is replaced by a
    space before the item is tokenised, lower-cased, lemmatised and
    re-joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    cleaned = []
    for raw_item in x.strip('[]').split(','):
        letters_only = re.sub('[^a-zA-Z]', ' ', raw_item)
        words = [lemmatizer.lemmatize(token.lower())
                 for token in word_tokenize(letters_only)]
        cleaned.append(' '.join(words))
    return cleaned

#remove low-information words from ingredients, could use more
Example #13
Source File: exact_match_indexer.py From claf with MIT License | 5 votes |
def __init__(self, tokenizer, lower=True, lemma=True):
    """Initialise the indexer with its tokenizer and matching options.

    ``tokenizer`` is forwarded to the parent initialiser; ``lower`` and
    ``lemma`` are stored on the instance as given.
    """
    super(ExactMatchIndexer, self).__init__(tokenizer)

    self.param_key = "question"
    self.lemmatizer = WordNetLemmatizer()

    # Option flags, stored as given.
    self.lower, self.lemma = lower, lemma
Example #14
Source File: word_sentence_utils.py From resilient-community-apps with MIT License | 5 votes |
def __init__(self):
    """Download the NLTK packages quietly, then build the helper objects."""
    for package in ("wordnet", "stopwords", 'averaged_perceptron_tagger'):
        nltk.download(package, quiet=True)

    # Punctuation tokens, one list entry per symbol.
    self.remove_list = ", . ; ? ~ ! * ) ( { } $ # @ < > ] [".split()
    self.lem = WordNetLemmatizer()
Example #15
Source File: Chapter 05_KNN n Naive Bayes.py From Statistics-for-Machine-Learning with MIT License | 5 votes |
def preprocessing(text):
    """Clean *text* and return it as a space-joined string of processed tokens.

    Steps, in order: replace punctuation with spaces and collapse
    whitespace; sentence- and word-tokenise; lower-case; drop English stop
    words and tokens shorter than three characters; Porter-stem; POS-tag;
    lemmatise each token as a verb when it carries a verb tag and as a noun
    otherwise.
    """
    without_punct = "".join(" " if ch in string.punctuation else ch for ch in text)
    text2 = " ".join(without_punct.split())

    tokens = []
    for sent in nltk.sent_tokenize(text2):
        tokens.extend(word.lower() for word in nltk.word_tokenize(sent))

    stopwds = stopwords.words('english')
    tokens = [tok for tok in tokens if tok not in stopwds and len(tok) >= 3]

    stemmer = PorterStemmer()
    tagged_corpus = pos_tag([stemmer.stem(tok) for tok in tokens])

    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        # Noun tags and every other non-verb tag are lemmatised as nouns.
        return lemmatizer.lemmatize(token, 'v' if tag in verb_tags else 'n')

    return " ".join(prat_lemmatize(token, tag) for token, tag in tagged_corpus)
Example #16
Source File: swda.py From deep_disfluency with MIT License | 5 votes |
def __wn_lemmatize(self, lemma):
    """
    Lemmatize a (string, pos) pair with WordNetLemmatizer and return the
    resulting (string, pos) pair.

    When the tag is one of 'a', 'n', 'r', 'v' it is passed to the
    lemmatizer; any other tag is left out of the lemmatizer call but is
    still returned unchanged.
    """
    word, pos = lemma
    lemmatizer = WordNetLemmatizer()
    pos_args = (pos,) if pos in ('a', 'n', 'r', 'v') else ()
    return (lemmatizer.lemmatize(word, *pos_args), pos)

######################################################################
Example #17
Source File: nltk_processors.py From forte with Apache License 2.0 | 5 votes |
def __init__(self):
    """Run the parent initialiser and create the lemmatizer."""
    super().__init__()

    # No token component is set at construction time.
    self.token_component = None
    self.lemmatizer = WordNetLemmatizer()
Example #18
Source File: intent_classification.py From voice-enabled-chatbot with MIT License | 5 votes |
def __init__(self):
    """Read the data, derive the word features and train the classifier.

    After construction the instance exposes ``word_features`` (at most 2000
    words), ``test_set`` / ``train_set`` (feature sets built from the first
    500 and the remaining entries of ``self.document``) and the trained
    ``classifier``.
    """
    self.lemmatizer = WordNetLemmatizer()
    self.data = {}
    self.document = []
    self.flat_list = []
    self.read_files()

    # Getting the words from the data
    self.get_words()

    # Removes the stop words like ('off', 'is', 's', 'am', 'or') and
    # non-alphabetical characters
    self.flat_list = self.remove_stop_words(self.flat_list)

    # Lemmatization, i.e. transforms different forms of words to a single one
    filtered_list = self.lemmatization(self.flat_list)

    # Getting the frequency of each word and extracting the top 2000.
    # Slicing list(frequency_distribution) would return the first 2000
    # distinct words in insertion order rather than the most frequent ones,
    # so ask the distribution for its most common entries explicitly.
    frequency_distribution = nltk.FreqDist(
        w.lower() for w in filtered_list
    )
    self.word_features = [
        word for word, _count in frequency_distribution.most_common(2000)
    ]

    # Training the model: the first 500 documents form the test set and the
    # rest the training set.
    self.test_set = nltk.classify.apply_features(
        self.feature_extraction, self.document[:500]
    )
    self.train_set = nltk.classify.apply_features(
        self.feature_extraction, self.document[500:]
    )
    self.classifier = nltk.NaiveBayesClassifier.train(self.train_set)
Example #19
Source File: nlp_preprocessing.py From EventForecast with GNU Lesser General Public License v3.0 | 5 votes |
def stem_single_stop(content, stopWords):
    """Lemmatize *content* word by word, dropping stop words and 1-char words.

    The text is split on single spaces. A lone ``$`` becomes ``dollar``;
    every other word is kept only if it is longer than one character and
    neither the word nor its lemma is in ``stopWords``. Each kept piece is
    followed by a space, so a non-empty result ends with a space.
    """
    lemmatizer = WordNetLemmatizer()
    pieces = []
    for word in content.split(' '):
        if word == '$':
            pieces.append('dollar ')
            continue
        if len(word) <= 1 or word in stopWords:
            continue
        lemma = lemmatizer.lemmatize(word)
        if lemma not in stopWords:
            pieces.append(lemma + ' ')
    return ''.join(pieces)

# split files into batches
Example #20
Source File: 9.5 Skipgram_Keras.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def preprocessing(text):
    """Return *text* as a cleaned, space-separated string of tokens.

    Pipeline: punctuation is replaced by spaces and whitespace collapsed;
    the text is split into sentences and words; tokens are lower-cased;
    English stop words and tokens under three characters are removed; the
    rest is Porter-stemmed, POS-tagged, and lemmatised as a verb for verb
    tags and as a noun for every other tag.
    """
    spaced = "".join(" " if ch in string.punctuation else ch for ch in text)
    text2 = " ".join(spaced.split())

    tokens = []
    for sent in nltk.sent_tokenize(text2):
        tokens.extend(word.lower() for word in nltk.word_tokenize(sent))

    stopwds = stopwords.words('english')
    tokens = [tok for tok in tokens if tok not in stopwds and len(tok) >= 3]

    stemmer = PorterStemmer()
    tagged_corpus = pos_tag([stemmer.stem(tok) for tok in tokens])

    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        # Everything that is not tagged as a verb is lemmatised as a noun.
        pos = 'v' if tag in verb_tags else 'n'
        return lemmatizer.lemmatize(token, pos)

    return " ".join(prat_lemmatize(token, tag) for token, tag in tagged_corpus)
Example #21
Source File: reader.py From ConvLab with MIT License | 5 votes |
def __init__(self):
    """Initialise the empty lookup tables and lemmatizer, then build the data.

    Construction ends with a call to ``self._construct`` using the train,
    dev, test and entity settings from ``cfg``.
    """
    super().__init__()

    # Lookup tables start empty.
    self.entity_dict, self.abbr_dict = {}, {}
    self.wn = WordNetLemmatizer()
    self.db = {}

    self.tokenized_data_path = './data/kvret/'
    self._construct(cfg.train, cfg.dev, cfg.test, cfg.entity)
Example #22
Source File: pdtb2.py From pdtb2 with GNU General Public License v2.0 | 5 votes |
def __lemmatize(self, lemma):
    """
    Internal method that applies nltk.stem.WordNetLemmatizer to the
    (word, pos) pair lemma.

    A pair is lemmatized only when its tag is one of 'a', 'n', 'r', 'v';
    otherwise it is handed back as-is and no lemmatizer is created.
    """
    word, pos = lemma
    if pos in ('a', 'n', 'r', 'v'):
        word = WordNetLemmatizer().lemmatize(word, pos)
    return (word, pos)

######################################################################
# POSITIONING.
Example #23
Source File: helpers.py From causal-text-embeddings with MIT License | 5 votes |
def __init__(self):
    """Create the WordNet lemmatizer held by this instance."""
    self.wnl = WordNetLemmatizer()
Example #24
Source File: helpers.py From causal-text-embeddings with MIT License | 5 votes |
def __init__(self):
    """Create the WordNet lemmatizer held by this instance."""
    self.wnl = WordNetLemmatizer()
Example #25
Source File: helpers.py From causal-text-embeddings with MIT License | 5 votes |
def __init__(self):
    """Create the WordNet lemmatizer held by this instance."""
    self.wnl = WordNetLemmatizer()
Example #26
Source File: rank_knowledge_for_mc_qa.py From OpenBookQA with Apache License 2.0 | 5 votes |
def __init__(self):
    """Create the WordNet lemmatizer held by this instance."""
    self.wnl = WordNetLemmatizer()
Example #27
Source File: swda.py From swda with GNU General Public License v2.0 | 5 votes |
def __wn_lemmatize(self, lemma):
    """
    Lemmatize a (string, pos) pair using WordNetLemmatizer. Always
    returns a (string, pos) pair.

    A tag in 'a', 'n', 'r', 'v' is handed to the lemmatizer; with any other
    tag the string is lemmatized without a tag argument, and the original
    tag is returned alongside the result.
    """
    word, pos = lemma
    lemmatizer = WordNetLemmatizer()
    if pos not in ('a', 'n', 'r', 'v'):
        return (lemmatizer.lemmatize(word), pos)
    return (lemmatizer.lemmatize(word, pos), pos)
Example #28
Source File: analyze.py From oie-benchmark with MIT License | 5 votes |
def __init__(self):
    """
    Initialize members:
    question_dist - generalized-question distribution of the assigned
                    extraction location.
    """
    # Two-level counter: any missing inner key starts at 0.
    self.question_dist = defaultdict(lambda: defaultdict(int))
    self.lmtzr = WordNetLemmatizer()
Example #29
Source File: reader.py From SEDST with MIT License | 5 votes |
def __init__(self):
    """Initialise the empty lookup tables and lemmatizer, then build the data.

    Construction ends with a call to ``self._construct`` using the train,
    dev, test and entity settings from ``cfg``.
    """
    super().__init__()

    # Lookup tables start empty.
    self.entity_dict, self.abbr_dict = {}, {}
    self.wn = WordNetLemmatizer()

    self.tokenized_data_path = './data/kvret/'
    self._construct(cfg.train, cfg.dev, cfg.test, cfg.entity)
    #self.test = self.train
Example #30
Source File: 9.2 Email_Classification.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def preprocessing(text):
    """Clean *text* and return it as a space-joined string of processed tokens.

    Steps, in order: replace punctuation with spaces and collapse
    whitespace; sentence- and word-tokenise; lower-case; drop English stop
    words and tokens shorter than three characters; Porter-stem (best
    effort); POS-tag; lemmatise each token as a verb when it carries a verb
    tag and as a noun otherwise.
    """
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word)>=3]

    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]
    except Exception:
        # Best effort: if stemming fails, continue with the unstemmed
        # tokens. ``except Exception`` replaces the former bare ``except:``
        # so KeyboardInterrupt and SystemExit are no longer swallowed.
        pass

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']
    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            # Any other tag falls back to noun lemmatisation.
            return lemmatizer.lemmatize(token,'n')

    pre_proc_text = " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])
    return pre_proc_text