Python nltk.stem.WordNetLemmatizer() Examples
The following are 30 code examples of nltk.stem.WordNetLemmatizer(), extracted from open source projects. Each example lists its original project, author, source file, and license. You may also want to check out all available functions/classes of the module nltk.stem.
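For orientation before the project examples, here is a minimal usage sketch of the lemmatizer itself; it assumes the WordNet data has already been fetched with nltk.download('wordnet'):

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
print(wnl.lemmatize('churches'))          # 'church' (default POS is noun)
print(wnl.lemmatize('running', pos='v'))  # 'run'    (verbs need pos='v')

Most of the examples below follow this same pattern: construct a WordNetLemmatizer once, then call lemmatize() on each token, optionally passing a WordNet POS tag ('n', 'v', 'a', 'r').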

Example #1
Source Project: TextLevelGCN | Author: HuangLianzhe | File: pre_processing.py | License: GNU General Public License v3.0

def clean_text(text):
    # stop_words = stopwords.words('english')
    stop_words = []
    stop_words.extend(['!', ',', '.', '?', '-s', '-ly', '</s> ', 's'])

    stemmer = WordNetLemmatizer()

    text = remove_short(text)
    text = clean_str(text)
    text = word_tokenize(text)
    text = [word for word in text if word not in stop_words]
    text = [stemmer.lemmatize(word) for word in text]

    return ' '.join(text)
Example #2
Source Project: find-all-the-new-words | Author: Steven-AA | File: Article.py | License: MIT License

def real_word(self, word, LEMMATIZATION_flag=True):
    '''
    find the real word
    '''
    p_forword = re.compile('[a-z,A-Z,\',‘]')
    word_s = p_forword.findall(word)
    real_word = ''.join(word_s)  # .lower()
    if LEMMATIZATION_flag and self.config['LEMMATIZATION_MODE'] in ['list', 'both']:
        try:
            real_word = self.fix_dic[real_word]
        except Exception as e:
            logger.debug(e)
            pass
    if LEMMATIZATION_flag and self.config['LEMMATIZATION_MODE'] in ['NLTK', 'both']:
        wordnet_lemmatizer = WordNetLemmatizer()
        real_word = wordnet_lemmatizer.lemmatize(real_word)
    logger.debug(word + '-->' + real_word)
    return real_word
Example #3
Source Project: SEDST | Author: AuCson | File: reader.py | License: MIT License

def __init__(self):
    super().__init__()
    self.entity_dict = {}
    self.abbr_dict = {}
    self.wn = WordNetLemmatizer()
    self.tokenized_data_path = './data/kvret/'
    self._construct(cfg.train, cfg.dev, cfg.test, cfg.entity)
    # self.test = self.train
Example #4
Source Project: ConvLab | Author: ConvLab | File: reader.py | License: MIT License

def __init__(self):
    super().__init__()
    self.entity_dict = {}
    self.abbr_dict = {}
    self.wn = WordNetLemmatizer()
    self.db = {}
    self.tokenized_data_path = './data/kvret/'
    self._construct(cfg.train, cfg.dev, cfg.test, cfg.entity)
Example #5
Source Project: pdtb2 | Author: cgpotts | File: pdtb2.py | License: GNU General Public License v2.0

def __lemmatize(self, lemma):
    """
    Internal method used for applying nltk.stem.WordNetLemmatizer() to the
    (word, pos) pair lemma.
    """
    string, tag = lemma
    if tag in ('a', 'n', 'r', 'v'):
        wnl = WordNetLemmatizer()
        string = wnl.lemmatize(string, tag)
    return (string, tag)
Example #6
Source Project: causal-text-embeddings | Author: blei-lab | File: helpers.py | License: MIT License

def __init__(self):
    self.wnl = WordNetLemmatizer()
Example #7
Source Project: causal-text-embeddings | Author: blei-lab | File: helpers.py | License: MIT License

def __init__(self):
    self.wnl = WordNetLemmatizer()
Example #8
Source Project: causal-text-embeddings | Author: blei-lab | File: helpers.py | License: MIT License

def __init__(self):
    self.wnl = WordNetLemmatizer()
Example #9
Source Project: OpenBookQA | Author: allenai | File: rank_knowledge_for_mc_qa.py | License: Apache License 2.0

def __init__(self):
    self.wnl = WordNetLemmatizer()
Example #10
Source Project: swda | Author: cgpotts | File: swda.py | License: GNU General Public License v2.0

def __wn_lemmatize(self, lemma):
    """
    Lemmatize lemma using nltk.stem.WordNetLemmatizer(). Always returns a
    (string, pos) pair. Lemmatizes even when the tag isn't helpful, by
    ignoring it for stemming.
    """
    string, tag = lemma
    wnl = WordNetLemmatizer()
    if tag in ('a', 'n', 'r', 'v'):
        string = wnl.lemmatize(string, tag)
    else:
        string = wnl.lemmatize(string)
    return (string, tag)
Example #11
Source Project: oie-benchmark | Author: gabrielStanovsky | File: analyze.py | License: MIT License

def __init__(self):
    """
    Initialize members:
    question_dist - generalized-question distribution of the assigned
    extraction location.
    """
    self.question_dist = defaultdict(lambda: defaultdict(lambda: 0))
    self.lmtzr = WordNetLemmatizer()
Example #12
Source Project: Natural-Language-Processing-with-Python-Cookbook | Author: PacktPublishing | File: 9.5 Skipgram_Keras.py | License: MIT License

def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text
Example #13
Source Project: Natural-Language-Processing-with-Python-Cookbook | Author: PacktPublishing | File: 9.2 Email_Classification.py | License: MIT License

def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]
    except:
        tokens = tokens

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text
Example #14
Source Project: TextLevelGCN | Author: HuangLianzhe | File: pre_processing.py | License: GNU General Public License v3.0

def stem_corpus():
    stemmer = WordNetLemmatizer()

    with open('data/mr/text_train.txt') as f:
        raw_text = f.read()

    with open('data/mr/label_train.txt') as f:
        raw_labels = f.read()

    labels = []
    for raw_label in raw_labels.split('\n'):
        if raw_label == '1':
            labels.append('pos')
        elif raw_label == '0':
            labels.append('neg')
        else:
            if len(raw_label) == 0:
                continue
            raise ValueError(raw_label)

    corpus = raw_text.split('\n')
    corpus = [clean_str(doc) for doc in corpus]
    corpus = [remove_short(doc) for doc in corpus]

    tokenized_corpus = [word_tokenize(doc) for doc in corpus]

    results = []
    for line in tokenized_corpus:
        results.append(' '.join([stemmer.lemmatize(word) for word in line]))

    results = list(zip(labels, results))
    results = ['\t'.join(line) for line in results]
    random.shuffle(results)

    with open('data/mr/mr-train-stemmed.txt', 'w') as f:
        f.write('\n'.join(results))
Example #15
Source Project: EntityDuetNeuralRanking | Author: thunlp | File: cmns.py | License: MIT License

def _phrase_stem(cls, phrase):
    wnl = WordNetLemmatizer()
    l_term = phrase.split()
    l_term = [wnl.lemmatize(term, 'n') for term in l_term]
    return ' '.join(l_term)
Example #16
Source Project: ResumeParser | Author: OmkarPathak | File: utils.py | License: MIT License

def extract_experience(resume_text):
    '''
    Helper function to extract experience from resume text

    :param resume_text: Plain resume text
    :return: list of experience
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # word tokenization
    word_tokens = nltk.word_tokenize(resume_text)

    # remove stop words and lemmatize
    filtered_sentence = [w for w in word_tokens
                         if w not in stop_words
                         and wordnet_lemmatizer.lemmatize(w) not in stop_words]
    sent = nltk.pos_tag(filtered_sentence)

    # parse regex
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cs = cp.parse(sent)

    # for i in cs.subtrees(filter=lambda x: x.label() == 'P'):
    #     print(i)

    test = []
    for vp in list(cs.subtrees(filter=lambda x: x.label() == 'P')):
        test.append(" ".join([i[0] for i in vp.leaves() if len(vp.leaves()) >= 2]))

    # Search the word 'experience' in the chunk and then print out the text after it
    x = [x[x.lower().index('experience') + 10:]
         for i, x in enumerate(test)
         if x and 'experience' in x.lower()]
    return x
Example #17
Source Project: pliers | Author: tyarkoni | File: text.py | License: BSD 3-Clause "New" or "Revised" License

def __init__(self, stemmer='porter', tokenize=True, case_sensitive=False,
             *args, **kwargs):
    if isinstance(stemmer, str):
        if stemmer not in self._stemmers:
            valid = list(self._stemmers.keys())
            raise ValueError("Invalid stemmer '%s'; please use one of %s."
                             % (stemmer, valid))
        stemmer = getattr(stem, self._stemmers[stemmer])(*args, **kwargs)
    elif not isinstance(stemmer, (stem.StemmerI, stem.WordNetLemmatizer)):
        raise ValueError("stemmer must be either a valid string, or an "
                         "instance of class StemmerI.")
    self.stemmer = stemmer
    self.tokenize = tokenize
    self.case_sensitive = case_sensitive
    super().__init__()
Example #18
Source Project: pliers | Author: tyarkoni | File: text.py | License: BSD 3-Clause "New" or "Revised" License

def _filter(self, stim):
    pos_map = {
        'ADJ': 'a',
        'ADJ_SAT': 's',
        'ADV': 'r',
        'NOUN': 'n',
        'VERB': 'v'
    }

    def pos_wordnet(txt):
        pos_tagged = dict(nltk.pos_tag(txt, tagset='universal'))
        pos_tagged = {t: pos_map[tag] if tag in pos_map else 'n'
                      for t, tag in pos_tagged.items()}
        return pos_tagged

    tokens = [stim.text]
    if self.tokenize:
        tokens = nltk.word_tokenize(tokens[0])
    tokens = [t if self.case_sensitive else t.lower() for t in tokens]
    if not isinstance(self.stemmer, stem.WordNetLemmatizer):
        stemmed = ' '.join([self.stemmer.stem(t) for t in tokens])
    else:
        pos_tagged = pos_wordnet(tokens)
        stemmed = ' '.join([self.stemmer.lemmatize(t, pos=pos_tagged[t])
                            for t in tokens])
    return TextStim(stim.filename, stemmed, stim.onset, stim.duration,
                    stim.order, stim.url)
Example #19
Source Project: tatk | Author: thu-coai | File: reader.py | License: Apache License 2.0

def __init__(self):
    super().__init__()
    self.entity_dict = {}
    self.abbr_dict = {}
    self.wn = WordNetLemmatizer()
    self.db = {}
    self.tokenized_data_path = './data/kvret/'
    self._construct(cfg.train, cfg.dev, cfg.test, cfg.entity)
Example #20
Source Project: LDA_RecEngine | Author: easonchan1213 | File: LDAModel_English.py | License: Apache License 2.0

def __tokenizeWholeCorpora(self, pathToCorpora):
    print 'Start tokenizing the corpora: %s' % (pathToCorpora)
    punct = re.compile('[%s]' % re.escape(string.punctuation))
    wnl = WordNetLemmatizer()
    doc_count = 0
    train_set = []
    doc_mapping = {}
    link_mapping = {}

    for f in glob(pathToCorpora + '/*'):
        filereader = open(f, 'r')
        article = filereader.readlines()
        filereader.close()
        text = ''
        try:
            link = article[0]
            title = article[1]
            text = article[2].lower()
        except IndexError:
            continue
        # Skip document length < min_length
        if len(text) < self.min_length:
            continue
        text = punct.sub("", text)           # Remove all punctuations
        tokens = nltk.word_tokenize(text)    # Tokenize the whole text
        # Lemmatize every word and add to tokens list if the word is not in stopword
        train_set.append([wnl.lemmatize(word) for word in tokens
                          if word not in self.stopword])
        # Build doc-mapping
        doc_mapping[doc_count] = title
        link_mapping[doc_count] = link
        doc_count = doc_count + 1
        if doc_count % 10000 == 0:
            print 'Have processed %i documents' % (doc_count)

    print 'Finished tokenizing the corpora: %s' % (pathToCorpora)
    return doc_count, train_set, doc_mapping, link_mapping
Example #21
Source Project: yelp | Author: melqkiades | File: belford_tfidf.py | License: GNU Lesser General Public License v2.1

def preprocess(docs, stopwords, min_df=3, min_term_length=2,
               ngram_range=(1, 1), apply_tfidf=True, apply_norm=True,
               lemmatize=False):
    """
    Preprocess a list containing text documents stored as strings.
    """
    token_pattern = re.compile(r"\b\w\w+\b", re.U)

    if lemmatize:
        from nltk.stem import WordNetLemmatizer
        wnl = WordNetLemmatizer()

    def normalize(x):
        x = x.lower()
        if lemmatize:
            return wnl.lemmatize(x)
        return x

    def custom_tokenizer(s):
        return [normalize(x) for x in token_pattern.findall(s)
                if (len(x) >= min_term_length and x[0].isalpha())]

    # Build the Vector Space Model, apply TF-IDF and normalize lines to unit
    # length all in one call
    if apply_norm:
        norm_function = "l2"
    else:
        norm_function = None
    tfidf = TfidfVectorizer(
        stop_words=stopwords, lowercase=True, strip_accents="unicode",
        tokenizer=None, use_idf=apply_tfidf, norm=norm_function,
        min_df=min_df, ngram_range=ngram_range)
    X = tfidf.fit_transform(docs)
    terms = []

    # store the vocabulary map
    v = tfidf.vocabulary_
    for i in range(len(v)):
        terms.append("")
    for term in v.keys():
        terms[v[term]] = term
    return (X, terms, tfidf)
Example #22
Source Project: Deep_Enhanced_Repr_for_IDRR | Author: hxbai | File: pdtb2.py | License: MIT License

def __lemmatize(self, lemma):
    """
    Internal method used for applying nltk.stem.WordNetLemmatizer() to the
    (word, pos) pair lemma.
    """
    string, tag = lemma
    if tag in ('a', 'n', 'r', 'v'):
        wnl = WordNetLemmatizer()
        string = wnl.lemmatize(string, tag)
    return (string, tag)
Example #23
Source Project: find-all-the-new-words | Author: Steven-AA | File: dictionary.py | License: MIT License

def eudic(word):
    url = "https://dict.eudic.net/dicts/en/" + word
    headers = {"authority": "dict.eudic.net",
               "content-type": "application/x-www-form-urlencoded",
               "origin": "https://dict.eudic.net",
               "upgrade-insecure-requests": "1",
               "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
    r = requests.get(url, headers)
    soup = BeautifulSoup(r.text, "lxml")
    try:
        a = soup.find_all(id="ExpFCChild")[0].get_text("\t\n")
    except Exception as e:
        logger.warning(word + " " + str(e))
        from nltk.stem import WordNetLemmatizer
        wordnet_lemmatizer = WordNetLemmatizer()
        word = wordnet_lemmatizer.lemmatize(word)
        url = "https://dict.eudic.net/dicts/en/" + word
        r = requests.get(url, headers)
        soup = BeautifulSoup(r.text, "lxml")
        try:
            a = soup.find_all(id="ExpFCChild")[0].get_text("\t\n")
            a = word + "\n\n" + a
        except Exception as e:
            logger.warning("fix failed " + word + " " + str(e))
            return ""
    if a[0] == u"赞":
        a = a[a.index(")") + 3:]
    a = fix_result(a)
    a = a.replace(".\t\n", ".\t")
    return a
Example #24
Source Project: topic-ensemble | Author: derekgreene | File: util.py | License: Apache License 2.0

def preprocess(docs, stopwords, min_df=3, min_term_length=2,
               ngram_range=(1, 1), apply_tfidf=True, apply_norm=True,
               lemmatize=False):
    """
    Preprocess a list containing text documents stored as strings.
    """
    token_pattern = re.compile(r"\b\w\w+\b", re.U)

    if lemmatize:
        from nltk.stem import WordNetLemmatizer
        wnl = WordNetLemmatizer()

    def normalize(x):
        x = x.lower()
        if lemmatize:
            return wnl.lemmatize(x)
        return x

    def custom_tokenizer(s):
        return [normalize(x) for x in token_pattern.findall(s)
                if (len(x) >= min_term_length and x[0].isalpha())]

    # Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
    if apply_norm:
        norm_function = "l2"
    else:
        norm_function = None
    tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True,
                            strip_accents="unicode", tokenizer=custom_tokenizer,
                            use_idf=apply_tfidf, norm=norm_function,
                            min_df=min_df, ngram_range=ngram_range)
    X = tfidf.fit_transform(docs)
    terms = []

    # store the vocabulary map
    v = tfidf.vocabulary_
    for i in range(len(v)):
        terms.append("")
    for term in v.keys():
        terms[v[term]] = term
    return (X, terms)
Example #25
Source Project: TextRank | Author: naiveHobo | File: preprocessing.py | License: MIT License

def __init__(self):
    self.STOPWORDS = TextProcessor.__load_stopwords(path="../stopwords.txt")
    self.LEMMATIZER = WordNetLemmatizer()
    self.STEMMER = SnowballStemmer("english")
    self.PUNCTUATION = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
    self.NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
    self.PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)
Example #26
Source Project: bert-sense | Author: uhh-lt | File: BERT_Model.py | License: MIT License

def __init__(self, device_number='cuda:2', use_cuda=True):
    self.device_number = device_number
    self.use_cuda = use_cuda
    self.sense_number_map = {'N': 1, 'V': 2, 'J': 3, 'R': 4}
    self.Bert_Model = BERT(device_number, use_cuda)
    self.lemmatizer = WordNetLemmatizer()
Example #27
Source Project: bert-sense | Author: uhh-lt | File: ELMO_Model.py | License: MIT License

def __init__(self):
    self.sense_number_map = {'N': 1, 'V': 2, 'J': 3, 'R': 4}
    self.Elmo_Model = ELMO()
    self.lemmatizer = WordNetLemmatizer()
Example #28
Source Project: bert-sense | Author: uhh-lt | File: Flair_Model.py | License: MIT License

def __init__(self, device_number='cuda:2', use_cuda=True):
    self.device_number = device_number
    self.use_cuda = use_cuda
    self.sense_number_map = {'N': 1, 'V': 2, 'J': 3, 'R': 4}
    self.Flair_Model = Flair(device_number, use_cuda)
    self.lemmatizer = WordNetLemmatizer()
Example #29
Source Project: Flavor-Network | Author: lingcheng99 | File: recipe_cleanup.py | License: GNU General Public License v3.0

def split_ingr(x):
    wnl = WordNetLemmatizer()
    cleanlist = []
    lst = x.strip('[]').split(',')
    cleanlist = [' '.join(wnl.lemmatize(word.lower())
                          for word in word_tokenize(re.sub('[^a-zA-Z]', ' ', item)))
                 for item in lst]
    return cleanlist
Example #30
Source Project: claf | Author: naver | File: exact_match_indexer.py | License: MIT License

def __init__(self, tokenizer, lower=True, lemma=True):
    super(ExactMatchIndexer, self).__init__(tokenizer)
    self.param_key = "question"
    self.lemmatizer = WordNetLemmatizer()
    self.lower = lower
    self.lemma = lemma