Python nltk.stem Examples

The following code examples show how to use the nltk.stem module. They are drawn from open-source Python projects.
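
The nltk.stem package provides stemmer and lemmatizer classes rather than a single callable, and most of the snippets below simply instantiate one of those classes and call its stem() (or lemmatize()) method. As a minimal, hedged sketch of the common entry points (independent of any project below):

from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, WordNetLemmatizer

porter = PorterStemmer()               # classic rule-based suffix stripping
snowball = SnowballStemmer("english")  # "Porter2"; takes a language argument
lancaster = LancasterStemmer()         # the most aggressive of the three
lemmatizer = WordNetLemmatizer()       # dictionary-based; needs nltk.download('wordnet')

for word in ("running", "generously", "studies"):
    print(word, porter.stem(word), snowball.stem(word), lancaster.stem(word),
          lemmatizer.lemmatize(word, pos="v"))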

Example 1
Project: BIOLITMAP   Author: inab   File: TopicModeling.py    GNU Lesser General Public License v2.1
def process_texts(texts):
    """
    Function to process texts. Following are the steps we take:
    
    1. Stopword Removal.
    2. Collocation detection.
    3. Lemmatization (not stemming, since stemming can reduce interpretability).
    
    Parameters:
    ----------
    texts: Tokenized texts.
    
    Returns:
    -------
    texts: Pre-processed tokenized texts.
    """
    texts = [[word for word in line if word not in stops] for line in texts]
    texts = [bigram[line] for line in texts]
    texts = [[word.split(b'/')[0] for word in lemmatize(' '.join(line), min_length=5)] for line in texts]
    return texts 
Example 2
Project: tweepy   Author: dalinhuang99   File: search.py    MIT License
def clean_tweet(self, tweet):
        """
        Utility function to classify sentiment of passed tweet
        using textblob's sentiment method
        """
        # punctuation = list(string.punctuation)
        swords = set(stopwords.words("english"))
        ps = PorterStemmer()

        # Step 1 - emoji to text
        tweet = emoji.demojize(tweet)

        # Step 2 - tokenizing tweet - list
        tweet = self.tokenizing_tweet(tweet)

        # Step 3 - reduce length (ex: awwwwsome)
        tweet = [self.reduce_lengthening(term) for term in tweet]

        # Step 4 - remove stopwords then stemming word
        tweet = [ps.stem(term) for term in tweet if term not in set(swords)]

        return tweet 
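
This method depends on helpers not shown here (tokenizing_tweet, reduce_lengthening) and on the third-party emoji package. A rough, self-contained sketch of the same pipeline using only NLTK, where the lengthening regex is an assumption rather than the project's code:

import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

def reduce_lengthening(term):
    # collapse runs of 3+ identical characters down to 2 ("awwwwsome" -> "awwsome")
    return re.sub(r'(.)\1{2,}', r'\1\1', term)

def clean_tweet(tweet):
    swords = set(stopwords.words("english"))
    ps = PorterStemmer()
    tokens = TweetTokenizer(preserve_case=False).tokenize(tweet)
    tokens = [reduce_lengthening(t) for t in tokens]
    return [ps.stem(t) for t in tokens if t not in swords]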
Example 3
Project: Gap   Author: andrewferlitsch   File: syntax.py    Apache License 2.0
def _nltkStemmer(self, name):
        """ NLTK Stemmer """
        if name == 'porter':
            stemmer = PorterStemmer()
        elif name == 'snowball':
            stemmer = SnowballStemmer("english")
        elif name == "lancaster":
            stemmer = LancasterStemmer()
        else:
            return
        
        length = len(self._words)
        for i in range(length):
            word = self._words[i]['word']
            l = len(word)

            # Don't stem short words or words already categorized
            if l < 4 or self._words[i]['tag'] != Vocabulary.UNTAG:
                continue
            
            self._words[i]['word'] = stemmer.stem(self._words[i]['word']) 
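
The name-to-stemmer dispatch above is a common pattern; a compact alternative (a sketch, not this project's code) that keeps the same "return None for unknown names" behaviour is a dictionary of factories:

from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

def make_stemmer(name):
    # hypothetical helper for illustration only
    factories = {
        'porter': PorterStemmer,
        'snowball': lambda: SnowballStemmer("english"),
        'lancaster': LancasterStemmer,
    }
    factory = factories.get(name)
    return factory() if factory else None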
Example 4
Project: exsto   Author: DerwenAI   File: TextRank.py    Apache License 2.0
def wrap_words (pair):
  """wrap each (word, tag) pair as an object with fully indexed metadata"""
  global STEMMER
  index = pair[0]
  result = []
  for word, tag in pair[1]:
    word = word.lower()
    stem = STEMMER.stem(word)
    if stem == "":
      stem = word
    keep = tag in ('JJ', 'NN', 'NNS', 'NNP',)
    result.append({ "id": 0, "index": index, "stem": stem, "word": word, "tag": tag, "keep": keep })
    index += 1
  return result


######################################################################
## build a graph from raw text 
Example 5
Project: RealtimeSentimentAnalysis   Author: zHaytam   File: models.py    MIT License
def extract_words(text, handle_negation=False):
    # words = word_tokenize(text)
    words = []
    for word in word_tokenize(text):
        word = word.lower()
        if word in NEGATE:
            words.append(word)
        elif word in string.punctuation:
            continue
        else:
            nw = reg.sub('', word)
            if len(nw) > 0:
                words.append(nw)

    if handle_negation:
        words = negation_handler(words)

    words = [word for word in words if word not in stopwords]
    words = [stemmer.stem(word) for word in words]
    # words = [lemmatizer.lemmatize(word.lower()) for word in words]
    return words 
Example 6
Project: Topic-Distance-and-Coherence   Author: Renata1995   File: MySentenceStemmer.py    Apache License 2.0
def stem(self, sentence):
        #print sentence
        list1 = nltk.word_tokenize(sentence)
        taglist = nltk.pos_tag(list1)
        #print taglist
        self.res = []
        for s in taglist:
            tag = s[1]
            pos0 = ' '
            if tag[0] == 'N':
                pos0 = 'n'
            elif tag[0] == 'V':
                pos0 = 'v'
            elif tag[0] == 'J':
                pos0 = 'a'
            elif tag[0] == 'R':
                pos0 = 'r'

            if s[0] not in self.st:
                if pos0.isalpha():
                    self.res.append(self.wstem.lemmatize(s[0], pos=pos0))
                else:
                    self.res.append(self.sstem.stem(s[0]))
        return self.res 
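
The tag[0] checks above implement the usual Penn Treebank to WordNet part-of-speech mapping. A small equivalent sketch written against the wordnet constants (the helper name is made up for illustration):

from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    # the first letter of the Penn tag decides the WordNet POS; None means "no match"
    if tag.startswith('N'):
        return wordnet.NOUN   # 'n'
    if tag.startswith('V'):
        return wordnet.VERB   # 'v'
    if tag.startswith('J'):
        return wordnet.ADJ    # 'a'
    if tag.startswith('R'):
        return wordnet.ADV    # 'r'
    return None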
Example 7
Project: aop-helpFinder   Author: jecarvaill   File: aop_data_module.py    GNU General Public License v3.0
def load_AOD_data(AO_filename=AO_FILE, diseases_filename=CTD_FILE):
    """Method to get a dict of adverse outcomes and / or diseases in
       function of input file(s).

    Return:
        (dict): a dict which contains strings corresponding to adverse
        outcomes or diseases or both (values). The keys of the dict
        correspond to stem of adverse outcomes or diseases.

    """
    if AO_filename:
        AO = AOD_file(filename=AO_filename, filetype='AO')
    if diseases_filename:
        D = AOD_file(filename=diseases_filename, filetype='D')
    if AO_filename and diseases_filename:
        AOD = AOD_merging(AO, D)
        return AOD
    elif AO_filename:
        return AO
    elif diseases_filename:
        return D 
Example 8
Project: aop-helpFinder   Author: jecarvaill   File: aop_data_module.py    GNU General Public License v3.0
def load_animals(animals_filename=ANIMALS_FILE):
    """Method to get a dict of common names of animals contributing generally
       in biology studies.

    Return:
        animals (dict): a dict which contains common names of animals
        (values). The keys of the dict correspond to stem of common names.

    """
    animals = {}
    filin = open(animals_filename, encoding='utf-8')
    species = filin.readlines()
    for common_name in species:
        animal = tm_module.clean_abstract(common_name, True)
        if animal not in animals.keys():
            animals[animal] = common_name.strip()
    return animals


######################################
# Main()
###################################### 
Example 9
Project: PYSHA   Author: shafaypro   File: __chatcheck.py    GNU General Public License v3.0
def respond(sentences):
    tokenized_sentence = sent_tokenize(sentences)
    stop_words = set(stopwords.words("english"))  # Getting the stop words from the Local DB
    if len(tokenized_sentence) > 1:  # if the length of the tokenized sentence is greater than one
        # for sentence in tokenized_sentence:
        #     words = word_tokenize(sentence)  # Each word is tokenized
        pos_tagged = parts_of_speechtag(sentences)
        print(tuple(pos_tagged))
        # filtered_words = [w for w in words if w not in stop_words]  # removing the additional stop words for
        # portStemer_object = PorterStemmer()
        # filtered_steam_words = [portStemer_object.stem(w) for w in filtered_words]
        # return filtered_steam_words
    else:
        pos_tagged = parts_of_speechtag(sentences)
        print(type(pos_tagged))
        # words = word_tokenize(sentences)
        # filtered_words = [w for w in words if w not in stop_words]
        # portStemer_object = PorterStemmer()
        # filtered_steam_words = [portStemer_object.stem(w) for w in filtered_words]
        # return filtered_steam_words 
Example 11
Project: RecSys2019_DeepLearning_Evaluation   Author: MaurizioFD   File: TagPreprocessing.py    GNU Affero General Public License v3.0
def tagFilterAndStemming(originalTag):

    # Remove non alphabetical character and split on spaces
    processedTag = re.sub("[^a-zA-Z0-9]", " ", originalTag)
    processedTag = re.sub(" +", " ", processedTag)

    processedTag = processedTag.split(" ")

    stopwords_set = set(stopwords.words('english'))

    stemmer = PorterStemmer()

    result = []

    for tag in processedTag:

        tag_stemmed = stemmer.stem(tag)

        if tag_stemmed not in stopwords_set:
            result.append(tag_stemmed)

    return result 
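
A hedged usage sketch (it assumes the NLTK stopwords corpus has been downloaded):

print(tagFilterAndStemming("The Lord-of-the-Rings Trilogy!!"))

Note that this function stems each tag before checking it against the stopword set, so a stopword whose stem no longer matches its surface form can slip through the filter; most other examples on this page remove stopwords first and stem afterwards.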
Example 12
Project: text-analysis-and-classification-gui-tool   Author: acrobotache   File: preprocessing.py    MIT License
def simple_stemming(text: str):
    stemmer = LancasterStemmer()
    text = " ".join([stemmer.stem(word.lower()) for word in text.split()])
    return text 
Example 13
Project: text-analysis-and-classification-gui-tool   Author: acrobotache   File: preprocessing.py    MIT License
def porter_stemming(text: str):
    stemmer = PorterStemmer()
    text = " ".join([stemmer.stem(word.lower()) for word in text.split()])
    return text 
Example 14
Project: text-analysis-and-classification-gui-tool   Author: acrobotache   File: preprocessing.py    MIT License
def snowball_stemming(text: str):
    stemmer = SnowballStemmer("english")
    text = " ".join([stemmer.stem(word.lower()) for word in text.split()])
    return text 
Example 15
Project: tulo-chatbot   Author: usriva2405   File: vectorizers.py    MIT License
def __call__(self, doc):
        tokens = [word for word in nltk.word_tokenize(doc) if len(word) > 1]
        return [self.stemmer.stem(item) for item in tokens] 
Example 16
Project: MOQA   Author: pprakhar30   File: documents.py    MIT License
def __init__(self, itemId, questionType, answerType, question, answer, V, WordIDMap):

		self.itemId 		= itemId
		self.questionType 	= questionType
		self.answerType 	= answerType
		self.question 		= question
		self.answer 		= answer
		self.Question 		= [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(question) if stemmer.stem(word) in WordIDMap]
		self.Answer 		= [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(answer) if stemmer.stem(word) in WordIDMap]
		self.qFeature 		= {}
		self.aFeature 		= {}
		self.create_QAFeature() 
Example 17
Project: MOQA   Author: pprakhar30   File: documents.py    MIT License
def __init__(self, itemId, Review, V, WordIDMap, ReviewObj):

		self.itemId 	= itemId
		self.sent 	= Review
		self.rObj 	= ReviewObj
		self.Sent 	= [WordIDMap[stemmer.stem(word)] for word in tokenizer.tokenize(Review) if stemmer.stem(word) in WordIDMap]
		self.sFeature 	= {} 
Example 18
Project: BIOLITMAP   Author: inab   File: TopicModeling.py    GNU Lesser General Public License v2.1
def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzernew(doc)) 
Example 19
Project: BIOLITMAP   Author: inab   File: TopicModeling.py    GNU Lesser General Public License v2.1
def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([english_stemmer.stem(w) for w in analyzer(doc)]) 
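
The build_analyzer override above is the standard way to bolt a stemmer onto scikit-learn's CountVectorizer. A self-contained sketch of the pattern, where english_stemmer is an assumption based on the snippet rather than the project's full source:

from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import CountVectorizer

english_stemmer = EnglishStemmer()

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        # stem every token produced by the stock analyzer
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: [english_stemmer.stem(w) for w in analyzer(doc)]

# vectorizer = StemmedCountVectorizer(stop_words='english')
# X = vectorizer.fit_transform(["stemming maps related words to one token"])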
Example 20
Project: search_relevance   Author: rmanak   File: nlp_utils.py    MIT License
def stem(self,word,pos=u'n'):
        return self.lemmatize(word,pos)


########  Wrapper for all  of the popular stemmers ########### 
Example 21
Project: search_relevance   Author: rmanak   File: nlp_utils.py    MIT License
def __init__(self,stemmer_type):
        self.stemmer_type = stemmer_type
        if (self.stemmer_type == 'porter'):
            self.stemmer = nltk.stem.PorterStemmer()
        elif (self.stemmer_type == 'snowball'):
            self.stemmer = nltk.stem.SnowballStemmer('english')
        elif (self.stemmer_type == 'lemmatize'):
            self.stemmer = WordNetStemmer()
        else:
            raise NameError("'"+stemmer_type +"'" + " not supported")
            


######## Simple wordreplacer object using a dictionary  ############ 
Example 22
Project: search_relevance   Author: rmanak   File: nlp_utils.py    MIT License
def normalize(self, text):
        return [self.stemmer.stem(token) 
                for token in self.tokenizer.tokenize(text.lower()) 
                if token not in self.stop_words]
                    
######### defining a default normalizer ########## 
Example 23
Project: search_relevance   Author: rmanak   File: nlp_utils.py    MIT License
def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))
        


########## Stemmer + CountVectorizer wrapper ############# 
Example 24
Project: search_relevance   Author: rmanak   File: nlp_utils.py    MIT License
def build_analyzer(self):
        analyzer = super(CountVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))
        
        
########## Defaults TF-IDF & Count Vectorizers ########
        
        
#======== TF-IDF Vectorizer =========# 
Example 25
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: 9.5 Skipgram_Keras.py    MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text 
Example 26
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: 9.2 Email_Classification.py    MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]

    except:
        tokens = tokens
        
    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text 
Example 27
Project: easy_learner   Author: lavizhao   File: statistics.py    Apache License 2.0
def clean_words(nce):
    nce = nce.replace('’','\'')
    nce = nce.replace('‘','\'')
    words = wt(nce)
    words = set([wnl.lemmatize(word) for word in words])
    words = set([stemmer.stem(word) for word in words])
    
    return set(words)

# analyze text-related features
Example 28
Project: texttk   Author: fmpr   File: texttk.py    GNU General Public License v3.0
def __init__(self, decode_error='strict', strip_accents='unicode', ignore_list=[], lowercase=True, \
						remove_html=True, join_urls=True, use_bigrams=True, use_ner=True, stanford_ner_path="", \
						use_lemmatizer=False, max_df=0.95, min_df=1, max_features=None):
		self.stanford_ner_path = stanford_ner_path		# path to stanford NER
		self.decode_error = decode_error				# options: {‘strict’, ‘ignore’, ‘replace’}
		self.strip_accents = strip_accents				# options: {‘ascii’, ‘unicode’, None}
		self.ignore_list = ignore_list
		self.lowercase = lowercase
		self.remove_html = remove_html
		self.join_urls = join_urls	
		self.use_bigrams = use_bigrams
		self.use_ner = use_ner
		self.use_lemmatizer = use_lemmatizer			# use lemmatizer instead of stemmer?
		self.max_df = max_df							# maximum document frequency
		self.min_df = min_df							# remove terms that occur in less than min_df documents
		self.max_features = max_features 				# keep only top-N words according to tf across corpus

		self.sentence_splitter = PunktSentenceTokenizer().tokenize 		# Punkt sentence splitter
		self.stemmer = SnowballStemmer("english").stem					# Snowball stemmer
		self.lemmatizer = WordNetLemmatizer().lemmatize 				# WordNet lemmatizer
		self.base_tokenizer = CountVectorizer().build_tokenizer()		# sklearn tokenizer works the best, I think...
		self.stop_words = stopwords.words("english")					# nltk list of 128 stopwords
		self.token_pattern = re.compile(r'(?u)\b(\w*[a-zA-Z_]\w+|\w+[a-zA-Z_]\w*)\b') 	# default value was r'(?u)\b\w\w+\b'
		self.numeric_pattern = re.compile(r'^[0-9]+$')					# number regex
		self.url_pattern = re.compile(r'((http://)?(www\..*?\.\w+).*?)\s')
		self.compound_pattern = re.compile(r'\w+(\-\w+)+')
		if self.use_lemmatizer:
			self.tokenizer = CustomTokenizer(self.base_tokenizer, self.lemmatizer, self.token_pattern, self.numeric_pattern)
		else:
			self.tokenizer = CustomTokenizer(self.base_tokenizer, self.stemmer, self.token_pattern, self.numeric_pattern) 
Example 29
Project: message-analyzer   Author: glasses-n-contacts   File: analyzer.py    MIT License
def word_tokenize(self, extend_list=False):
        all_tokens = []
        ps = PorterStemmer()
        if extend_list:
            for message in self.text:
                # all_tokens.extend(message.split())
                all_tokens.extend(nltk.word_tokenize(message.translate(string.punctuation)))
            all_tokens = [ps.stem(w) for w in all_tokens if w not in stop_words]  # and w.isalpha()]
        else:
            all_tokens = [nltk.word_tokenize(message.translate(string.punctuation)) for message in self.text]
        return all_tokens 
Example 30
Project: Machine-Learning   Author: LITDataScience   File: topic_modelling.py    Apache License 2.0
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v')) 
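
The one-liner above assumes a module-level stemmer object. A minimal sketch with that assumption made explicit (the choice of SnowballStemmer is a guess, not confirmed by the snippet):

from nltk.stem import SnowballStemmer, WordNetLemmatizer

stemmer = SnowballStemmer("english")

def lemmatize_stemming(text):
    # lemmatize the word as a verb first, then stem the lemma
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# lemmatize_stemming("went")  # lemmatizes to "go" before stemming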
Example 31
Project: Twitter-Personalities   Author: rohilshah95   File: pyPredict.py    GNU General Public License v3.0
def preproc(s):
	#s=emoji_pattern.sub(r'', s) # no emoji
	s= unidecode(s)
	POSTagger=preprocess(s)
	#print(POSTagger)

	tweet=' '.join(POSTagger)
	stop_words = set(stopwords.words('english'))
	word_tokens = word_tokenize(tweet)
	#filtered_sentence = [w for w in word_tokens if not w in stop_words]
	filtered_sentence = []
	for w in POSTagger:
	    if w not in stop_words:
	        filtered_sentence.append(w)
	#print(word_tokens)
	#print(filtered_sentence)
	stemmed_sentence=[]
	stemmer2 = SnowballStemmer("english", ignore_stopwords=True)
	for w in filtered_sentence:
		stemmed_sentence.append(stemmer2.stem(w))
	#print(stemmed_sentence)

	temp = ' '.join(c for c in stemmed_sentence if c not in string.punctuation) 
	preProcessed=temp.split(" ")
	final=[]
	for i in preProcessed:
		if i not in final:
			if i.isdigit():
				pass
			else:
				if 'http' not in i:
					final.append(i)
	temp1=' '.join(c for c in final)
	#print(preProcessed)
	return temp1 
Example 32
Project: Election-Meddling   Author: zadewg   File: deploy.py    MIT License
def data_preparation(tweet): #nltk.tag._POS_TAGGER #treebank tag set https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
	
	url_regex = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'

	clean = re.sub(url_regex, '', tweet, flags = re.MULTILINE)                                                # strip out urls. urls, ew, nasty.
	clean = clean.replace('\n', ' ').replace("'", " ").replace('"', ' ')

	try:	
		clean = clean.decode("utf-8-sig").replace(u"\ufffd", "?")                                         # strip out Byte Order Marks
		print("Detected BOS")
	except:
		pass
	
	clean = re.sub(r'[^a-zA-Z ]', '', clean, flags = re.MULTILINE)                                            # the "#" symbol is actually called octothorpe. bananas.
	
	tokens = splitter.split(clean)										  # Tokeniztion

	lemma_pos_token = lemmatization_using_pos_tagger.pos_tag(tokens)					  # Part of speech tagging.
	out = ' '.join([out[1] for out in lemma_pos_token[0]])
	return out

	''' #https://pypi.org/project/hunspell/ #Double tokenizing. hunspell for units, nltk for context.
	import hunspell

	hobj = hunspell.HunSpell('/usr/share/myspell/en_US.dic', '/usr/share/myspell/en_US.aff')
	hobj.spell('spookie')

	hobj.suggest('spookie')

	hobj.spell('spooky')

	hobj.analyze('linked')

	hobj.stem('linked')
	''' 
Example 33
Project: RealtimeSentimentAnalysis   Author: zHaytam   File: sentiment_analysis.py    MIT License
def extract_words(self, text):
        words = [word for word in word_tokenize(text)]
        words = self.remove_stopwords(words)
        # words = [stemmer.stem(word) for word in words]
        words = [self.stemmer.stem(word.lower()) for word in words]
        return words 
Example 34
Project: sunbird-ml-workbench   Author: project-sunbird   File: nlp.py    MIT License
def stem_lem(keyword_list, DELIMITTER):
    wordnet_lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    keyword_list = [item for item in keyword_list]
    keyword_list = [i.split(DELIMITTER) for i in keyword_list]
    lemma_ls_1 = [[wordnet_lemmatizer.lemmatize(
        item, pos="n") for item in words] for words in keyword_list]
    lemma_ls_2 = [[wordnet_lemmatizer.lemmatize(
        item, pos="v") for item in words] for words in lemma_ls_1]
    lemma_ls_3 = [[wordnet_lemmatizer.lemmatize(
        item, pos="a") for item in words] for words in lemma_ls_2]
    lemma_ls_4 = [[wordnet_lemmatizer.lemmatize(
        item, pos="r") for item in words] for words in lemma_ls_3]
    stemm_ls = [[stemmer.stem(item) for item in words] for words in lemma_ls_4]
    return [DELIMITTER.join(i) for i in stemm_ls] 
Example 35
Project: Fake_News_Detection   Author: nishitpatel01   File: DataPrep.py    MIT License
def stem_tokens(tokens, stemmer):
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

#process the data 
Example 36
Project: Fake_News_Detection   Author: nishitpatel01   File: DataPrep.py    MIT License
def process_data(data,exclude_stopword=True,stem=True):
    tokens = [w.lower() for w in data]
    tokens_stemmed = tokens
    tokens_stemmed = stem_tokens(tokens, eng_stemmer)
    tokens_stemmed = [w for w in tokens_stemmed if w not in stopwords ]
    return tokens_stemmed


#creating ngrams
#unigram 
Example 37
Project: Fake_News_Detection   Author: nishitpatel01   File: DataPrep.py    MIT License
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

#doc = ['runners like running and thus they run','this is a test for tokens']
#tokenizer([word for line in test_news.iloc[:,1] for word in line.lower().split()])

#show the distribution of labels in the train and test data 
Example 38
Project: allen-ai-science-qa   Author: arranger1044   File: preprocess.py    GNU General Public License v3.0
def stem_tokens(tokens, stemmer=None, stemmer_type='snowball'):

    if not stemmer:
        stemmer = get_stemmer(stemmer_type)

    return [stemmer.stem(t) for t in tokens] 
Example 39
Project: CounterfietAppClassify   Author: jumormt   File: text_preprocess.py    GNU General Public License v3.0
def lemmatize_string(pos_tags):
    '''Stem each word after lemmatizing it (lemmatization followed by stemming).

    :param pos_tags: list of (word, POS tag) pairs
    :return: list of the lemmatized and stemmed words
    '''

    res = []
    lemmatizer = WordNetLemmatizer()  # initialize the lemmatizer
    stemmer = SnowballStemmer("english")  # choose the language and initialize the stemmer
    for word, pos in pos_tags:
        wordnet_pos = get_wordnet_pos(pos) or wordnet.NOUN
        res.append(stemmer.stem(lemmatizer.lemmatize(word, pos=wordnet_pos)))
    return res 
Example 40
Project: Python-Forensics   Author: mpedzi03   File: _classNLTKQuery.py    GNU General Public License v3.0
def textCorpusInit(self, thePath):
        # Validate the path is a directory
        if not os.path.isdir(thePath):
            return "Path is not a Directory"
        # Validate the path is readable
        if not os.access(thePath, os.R_OK):
            return "Directory is not Readable"
        # Attempt to Create a corpus with all .txt files found in Directory
        try:
            self.Corpus = PlaintextCorpusReader(thePath, '.*')
            print "Processing Files:"
            print self.Corpus.fileids()
            print "Please wait...."
            self.rawText = self.Corpus.raw()
            self.tokens = nltk.word_tokenize(self.rawText)
            upperstop = [word.upper() for word in stopwords.words('english')]
            self.tokens_nostop = [t for t in self.tokens if t not in upperstop]
            
                                  
            self.TextCorpus = nltk.Text(self.tokens)
            
            self.TextCorpusNoStop = nltk.Text(self.tokens_nostop)

            self.stemmer = PorterStemmer()
            self.stemmedTokens = [self.stemmer.stem(t.lower()) for t in self.tokens_nostop]
            self.stemmedText = nltk.Text(self.stemmedTokens)
            
            self.PosTaggedCorpus =nltk.pos_tag(self.tokens)
            
        except:
            return "Corpus Creation Failed"

        self.ActiveTextCorpus = True
        return "Success" 
Example 41
Project: Python-Forensics   Author: mpedzi03   File: _classNLTKQuery.py    GNU General Public License v3.0
def searchStemmedKeyword(self):
        print
        wordToStem = raw_input("Enter a single token to stem: ")
        wordStemmed = self.stemmer.stem(wordToStem)
        print "The stemmed version of the word you input is " + wordStemmed
        print
        print "Searching Corpus for your word . ."
        if wordStemmed:
            wordCount = self.stemmedText.count(wordStemmed)
            print wordStemmed + " occured: ",
            print wordCount,
            print " times"
        else:
            print "Token entry is invalid" 
Example 42
Project: lam   Author: aghie   File: preprocess.py    MIT License
def lemmatize_words(docs):
    
    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    
    new_docs = OrderedDict({})
    for doc_id, doc in docs.items():
        new_docs[doc_id] = []
        for word,postag in doc:
            try:
                lemma = wordnet_lemmatizer.lemmatize(word,pos=postag.lower()[0])
                new_docs[doc_id].append((lemma.lower(),postag))
            except KeyError:
                lemma = word
                new_docs[doc_id].append((lemma.lower(),postag))
                
    return new_docs 
Example 43
Project: sentec   Author: jvas28   File: es_nlp.py    GNU General Public License v3.0
def stem(word):
    return stemmer.stem(word)

# function to extract lexical roots (stems) from a list
Example 44
Project: sentec   Author: jvas28   File: es_nlp.py    GNU General Public License v3.0
def stem_tokens(tokens):  
    stemmed = []
    for item in tokens:
        stemmed.append(stem(item))
    return stemmed
# function to break text into separate words
Example 45
Project: Plagiarism-Checker   Author: ieeecompsoc   File: trigram_check.py    MIT License
def pre_processing(text):
        sent_tokenize_list = sent_tokenize(text)
        #print sent_tokenize_list
        #print len(sent_tokenize_list)
        #tokenise words
        #print stop_words
        words=word_tokenize(text)
        stop_words = str(stopwords.words('english'))
        alpha=stop_words.replace("u'", "")
    #print words
        result = []
    #print alpha
    #remove stop words
        for item in words:
            if item not in alpha:
                result.append(item)
    #print "Filtered",result
        fil=str(result)
    #remove punctuation
        repstr=" " * 32
        table=string.maketrans(string.punctuation,repstr)
        s=fil.translate(table)
    #return s


    #lemmatizing
        lemmatizer=WordNetLemmatizer()
        h=lemmatizer.lemmatize(s)
    #print "Lemma",lemmatizer.lemmatize(s)
    #stemming
        wordss=word_tokenize(h)
        ps=PorterStemmer()
        list1=[]
        for i in wordss:
            k=(ps.stem(i))
            list1.append(k)
    #print list1
        final= '  '.join(list1)
        finall=str(final)
        return finall 
Example 46
Project: pbase   Author: Impavidity   File: features.py    MIT License
def addCorpus(self, corpusName, path, textIndex, dilimiter='\t', tokenize=True, stem=True):
        fin = open(path)
        for line in tqdm(fin.readlines()):
            items = line.strip().split(dilimiter)
            sentence = items[textIndex]
            if tokenize:
                tokens = tokenizer.tokenize(sentence)
            else:
                tokens = sentence.split()
            if stem:
                tokens = [stemmer.stem(token) for token in tokens]
            self.corpus[corpusName].append(" ".join(tokens))
        fin.close() 
Example 47
Project: retrieval-based-chatbot   Author: amycardoso   File: create_ubuntu_dataset_modificado.py    MIT License
def create_eval_dataset(args, file_list_csv):
        rng = random.Random(args.seed)
        # training dataset
        f = open(os.path.join("meta", file_list_csv), 'r')
        dialog_paths = map(lambda path: os.path.join(args.data_root, "dialogs", path), convert_csv_with_dialog_paths(f))

        data_set = create_examples(dialog_paths,
                                   len(dialog_paths),
                                   lambda context_dialog, candidates : create_single_dialog_test_example(context_dialog, candidates, rng,
                                                                     args.n, args.max_context_length, args.turn))
        # output the dataset
        w = unicodecsv.writer(open(args.output, 'w'), encoding='utf-8')
        # header
        header = ["Context", "Ground Truth Utterance"]
        header.extend(map(lambda x: "Distractor_{}".format(x), xrange(args.n)))
        w.writerow(header)

        stemmer = SnowballStemmer("english")
        lemmatizer = WordNetLemmatizer()

        for row in data_set:
            translated_row = [row[0], row[1]]
            translated_row.extend(row[2])
            
            if args.tokenize:
                translated_row = map(nltk.word_tokenize, translated_row)
                if args.stem:
                    translated_row = map(lambda sub: map(stemmer.stem, sub), translated_row)
                if args.lemmatize:
                    translated_row = map(lambda sub: map(lambda tok: lemmatizer.lemmatize(tok, pos='v'), sub), translated_row)
                    
                translated_row = map(lambda x: " ".join(x), translated_row)

            w.writerow(translated_row)
        print("Dataset stored in: {}".format(args.output)) 
Example 48
Project: retrieval-based-chatbot   Author: amycardoso   File: create_ubuntu_dataset_modificado.py    MIT License
def train_cmd(args):

        rng = random.Random(args.seed)
        # training dataset

        f = open(os.path.join("meta", "trainfiles.csv"), 'r')
        dialog_paths = map(lambda path: os.path.join(args.data_root, "dialogs", path), convert_csv_with_dialog_paths(f))

        # call the function that creates the training examples
        train_set = create_examples_train(dialog_paths,args.e, rng, args.p,  args.turn, max_context_length=args.max_context_length)

        stemmer = SnowballStemmer("english")
        lemmatizer = WordNetLemmatizer()

        # output the dataset
        w = unicodecsv.writer(open(args.output, 'w'), encoding='utf-8')
        # header
        w.writerow(["Context", "Utterance", "Label"])
        for row in train_set:
            translated_row = row

            if args.tokenize:
                translated_row = [nltk.word_tokenize(row[i]) for i in [0,1]]

                if args.stem:
                    translated_row = map(lambda sub: map(stemmer.stem, sub), translated_row)
                if args.lemmatize:
                    translated_row = map(lambda sub: map(lambda tok: lemmatizer.lemmatize(tok, pos='v'), sub), translated_row)

                translated_row = map(lambda x: " ".join(x), translated_row)
                translated_row.append(int(float(row[2])))

            w.writerow(translated_row)
        print("Train dataset stored in: {}".format(args.output)) 
Example 49
Project: hoot   Author: CatalystOfNostalgia   File: comment_processing.py    MIT License
def tokenizeDocument(document):
    # remove punctuation (otherwise we have a bunch of empty tokens at the end)
    translate_table = dict((ord(char), " ") for char in string.punctuation)
    document = document.translate(translate_table)
    # tokenize
    tokenized_doc = nltk.word_tokenize(document)
    # stem
    snowball = stem.snowball.EnglishStemmer()
    tokenized_doc = [snowball.stem(word) for word in tokenized_doc]
    # remove stop words
    tokenized_doc = [word for word in tokenized_doc if word not in stopwords.words('english')]
    return tokenized_doc

# given the dictionary, return an array of all the tokenized comments 
Example 50
Project: aop-helpFinder   Author: jecarvaill   File: aop_data_module.py    GNU General Public License v3.0
def AOD_file(filename, filetype):
    """Method to extract all diseases or adverse outcomes contain in a file.
       The file should be performed by CTD database or AOPwiki beforehand.

    Return:
        AOD (dict): a dict which contains strings corresponding to adverse
        outcomes or diseases. The keys of the dict correspond to stem of
        adverse outcomes or diseases.

    """
    AOD = {}
    f = open(filename, encoding='utf-8')
    lines = f.readlines()
    for line in lines:
        term = ''
        if filetype == 'D':
            if not line.startswith('#'):
                line = line.split('\t')[0]
            else:
                continue
        line = line.strip().lower()
        for word in STOPWORDS:
            line = line.replace(word, ' ')
        substr = line.split(',')[0]
        words = word_tokenize(line)
        words = tm_module.stem_process(words)
        stemline = ' '.join(words)
        term = stemline.split(',')[0]
        term = term.strip()
        stemline = stemline.replace(', ', ' ')
        stemline = stemline.replace(',', ' ')
        if stemline not in AOD.keys():
            AOD[stemline] = line
        if len(term.split()) > 1 and term not in AOD.keys():
            AOD[term] = substr

    f.close()
    return AOD 
Example 51
Project: aop-helpFinder   Author: jecarvaill   File: aop_data_module.py    GNU General Public License v3.0
def AOD_merging(AO, D):
    """Method to merge two dicts in one dict with non-redundant information.

    Return:
        AOD (dict): a dict which contains strings corresponding to adverse
        outcomes and diseases. The keys of the dict correspond to stem of
        adverse outcomes or diseases.

    """
    # AOD = set(AO + D)
    AOD = {**AO, **D}
    return AOD 
Example 52
Project: DSSG19-Cochrane-PUBLIC   Author: alan-turing-institute   File: text_processing.py    MIT License
def stemming(papers, col='tokens_no_stopwords'):

    """
    Reducing words to their root form (stem).

    Parameters
    ----------

    papers : DataFrame
        DataFrame containing the column that stemming should be applied to.

    col : str
        Name of the column (in papers) that stemming should be applied to.

    Returns
    -------

    column_list_stemmed : list
        List of stemmed strings, one entry per row of the input column.
    """

    stemmer = PorterStemmer()

    # turn column into list (for speed, rather than pandas apply)
    column_list = list(papers[col])

    # new list to add stemmed phrases to
    column_list_stemmed = []

    for paper in column_list:

        # stem phrase and add to list
        stemmed_paper = stem_phrase(paper, stemmer)
        column_list_stemmed.append(stemmed_paper)

    return column_list_stemmed 
Example 53
Project: DSSG19-Cochrane-PUBLIC   Author: alan-turing-institute   File: text_processing.py    MIT License
def stem_phrase(phrase, stemmer):

    """
    Helper function to create stemmed version of a phrase.

    Parameters
    ----------

    phrase : string
        Phrase to be stemmed.

    stemmer : nltk stemmer object

    Returns
    -------

    stemmed_phrase : string

    """

    split_phrase = phrase.split(" ")

    stemmed_split_phrase = [stemmer.stem(word) for word in split_phrase]

    stemmed_phrase = " ".join(stemmed_split_phrase)

    return stemmed_phrase 
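
A short usage sketch for the two helpers above (stemming and stem_phrase), assuming pandas is available and the column already holds whitespace-joined tokens:

import pandas as pd

papers = pd.DataFrame({"tokens_no_stopwords": ["randomised controlled trials",
                                               "stemming collapses related words"]})

stemmed_column = stemming(papers)  # list of stemmed strings, one entry per row
papers["tokens_no_stopwords_stemmed"] = stemmed_column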
Example 54
Project: PYSHA   Author: shafaypro   File: __NLPMODIFIED.py    GNU General Public License v3.0
def steam_words(self, word):
        ps_obj = PorterStemmer()  # creating the Porter stemmer
        steamed_word = ps_obj.stem(word)
        return steamed_word  # returns the stemmed word to the caller

    # Natural Language: displaying sentences.
Example 55
Project: merinjei_bot   Author: ivanmilevtues   File: PreprocessHateData.py    MIT License
def tokenize(tweet: str) -> list:
        stemmer = SnowballStemmer("english")
        tokens = tweet.split()
        tokens = [stemmer.stem(w) for w in tokens]
        return tokens 
Example 56
Project: NLP-Fake-News-Challenge   Author: sarahannnicholson   File: feature_generation.py    Apache License 2.0
def _named_entity_feature(self):
        """ Retrieves a list of Named Entities from the Headline and Body.
        Returns a list containing the cosine similarity between the counts of the named entities """
        stemmer = PorterStemmer()
        def get_tags(text):
            return pos_tag(word_tokenize(text.encode('ascii', 'ignore')))

        def filter_pos(named_tags, tag):
            return " ".join([stemmer.stem(name[0]) for name in named_tags if name[1].startswith(tag)])

        named_cosine = []
        tags = ["NN"]
        for stance in tqdm.tqdm(self._stances):
            stance_cosine = []
            head = get_tags(stance['originalHeadline'])
            body = get_tags(self._original_articles.get(stance['Body ID'])[:255])

            for tag in tags:
                head_f = filter_pos(head, tag)
                body_f = filter_pos(body, tag)

                if head_f and body_f:
                    vect = TfidfVectorizer(min_df=1)
                    tfidf = vect.fit_transform([head_f,body_f])
                    cosine = (tfidf * tfidf.T).todense().tolist()
                    if len(cosine) == 2:
                        stance_cosine.append(cosine[1][0])
                    else:
                        stance_cosine.append(0)
                else:
                    stance_cosine.append(0)
            named_cosine.append(stance_cosine)
        return named_cosine 
Example 57
Project: serverless-chatbots-workshop   Author: datteswararao   File: index.py    Apache License 2.0
def handler(event, context):
    stop = set(stopwords.words('english'))
    punctuations = ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']
    sentence_without_punctuation = [i for i in word_tokenize(event['sentence']) if i not in punctuations]
    sentence_without_stopwords = [i for i in sentence_without_punctuation if i not in stop]
    stemmer = SnowballStemmer("english")
    stemmed_sentence = [stemmer.stem(i) for i in sentence_without_stopwords]
    print stemmed_sentence
    return {
        'sentence'  :   event['sentence'],
        'stemmed_sentence'  : ' '.join(stemmed_sentence)
    } 
Example 58
Project: RMDL   Author: eric-erki   File: text_feature_extraction.py    GNU General Public License v3.0
def text_cleaner(text,
                 deep_clean=False,
                 stem= True,
                 stop_words=True,
                 translite_rate=True):
    rules = [
        {r'>\s+': u'>'},  # remove spaces after a tag opens or closes
        {r'\s+': u' '},  # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': u'\n'},  # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},  # newline after </p> and </div> and <h1/>...
        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # newline after </p> and </div> and <h1/>...
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},  # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of texts
        {r'[ \t]*<[^<]*?/?>': u''},  # remove remaining tags
        {r'^\s+': u''}  # remove spaces at the beginning

    ]

    if deep_clean:
        text = text.replace(".", "")
        text = text.replace("[", " ")
        text = text.replace(",", " ")
        text = text.replace("]", " ")
        text = text.replace("(", " ")
        text = text.replace(")", " ")
        text = text.replace("\"", "")
        text = text.replace("-", " ")
        text = text.replace("=", " ")
        text = text.replace("?", " ")
        text = text.replace("!", " ")

        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
            text = text.rstrip()
            text = text.strip()
        text = text.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')
        text = re.sub("(^|\W)\d+($|\W)", " ", text)
        if translite_rate:
            text = transliterate(text)
        if stem:
            text = PorterStemmer().stem(text)
        text = WordNetLemmatizer().lemmatize(text)
        if stop_words:
            stop_words = set(stopwords.words('english'))
            word_tokens = word_tokenize(text)
            text = [w for w in word_tokens if not w in stop_words]
            text = ' '.join(str(e) for e in text)
    else:
        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
            text = text.rstrip()
            text = text.strip()
    return text.lower() 
Example 59
Project: text-analysis-and-classification-gui-tool   Author: acrobotache   File: preprocessing.py    MIT License
def process(self, comment):
        self.preprocessing_step_dict["sentence"] = comment

        # function call to remove 'HTML' tags from the comment
        comment = strip_html_tags(comment)

        # function call to remove accented characters from the comment
        comment = remove_accented_chars(comment)

        # function call to remove numbers from the comment
        comment = remove_numbers_from_comment(comment)

        # expands remained contraction or abbreviations if any
        comment = expand_contractions(comment)

        # function call to remove punctuations from the comment
        comment = strip_punctuation(comment)
        self.preprocessing_step_dict["noise_removed"] = comment

        # function call to replace abbreviations in the comment
        comment = replace_abbreviations(comment)
        self.preprocessing_step_dict["replace_abbr"] = comment

        # function call to remove URL from the comment
        comment = remove_url(comment)

        # tokenization
        comment = tokenize_comment(comment)

        # lowercase
        comment = lower_case_comment(comment)

        # perform Rapid keywords extraction algorithm
        comment = self.extract_keywords_using_rake(comment)
        self.preprocessing_step_dict["rake_output"] = comment

        # extract only essential words
        comment = extract_meaningful_words(comment)
        self.preprocessing_step_dict["meaningful_words"] = comment

        # Replace synonyms with root word
        comment = replace_synonyms(comment)

        # stem the comment
        comment = simple_stemming(comment)
        self.preprocessing_step_dict["stemmed_out"] = comment

        return comment 
Example 60
Project: ancile   Author: ancile-project   File: rdl.py    GNU Affero General Public License v3.0
def rdl_usage_data_recurrence(data):
    """
    Returns top 10 words in frequency from google searches within date ranges
    :param data:
    :return:
    """
    from datetime import  timedelta
    from dateutil.parser import parse as parse_date
    import nltk
    nltk.download('wordnet')
    nltk.download('stopwords')
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import stopwords
    stops = set(stopwords.words("english"))
    filtered_data = data['data']['usage']
    lemmatizer = WordNetLemmatizer()

    word_week_dict = {}  # Words to weeks
    for search_term in filtered_data:
        dt = parse_date(search_term[1]).date()
        week_dt = dt - timedelta(days=dt.weekday())  # Get Sunday
        word_list = search_term[0].split(' ')
        for word in word_list:
            lem_word = lemmatizer.lemmatize(word)
            if lem_word in stops:
                continue
            if lem_word in word_week_dict:
                word_week_dict[lem_word].add(week_dt)
            else:
                word_week_dict[lem_word] = {week_dt}

    # Get Minimum and Maximum weeks
    total_weeks = 1
    if data['data']['extents']['min'] is not None and data['data']['extents']['max'] is not None:
        dt = parse_date(data['data']['extents']['min'])
        dt = dt - timedelta(days=dt.weekday())  # Get Sunday
        dt = dt.date()  # Truncate to date
        end = parse_date(data['data']['extents']['max'])
        end = end - timedelta(days=end.weekday())
        end = end.date()
        total_weeks = (end - dt).days / 7
        if total_weeks == 0:
            total_weeks = 1

    word_recurrences = []
    for word in word_week_dict:
        word_recurrences.append({'lemma': word, 'recurrence': len(word_week_dict[word]) / total_weeks})
    word_recurrences.sort(reverse=True, key=lambda x: x['recurrence'])
    top_ten = word_recurrences[:10]
    data['output'].append('RDL Top Ten Recurrence Words Transform.')
    data['items'] = top_ten
    # Delete raw data
    del data['data']
    return data 
Example 61
Project: KontroleDegerMi   Author: doruksahin   File: iterative_detection.py    Apache License 2.0
def writeFile(newsFile, num_topics, num_words, model, stem=""):
	sentence_array = []
	try:
		makedirs(newsFile[:-4])
	except:
		pass
	for i in range(1, num_topics + 1):
		print(model.print_topic((i - 1), num_words))
		sentence_array.append("({},\'{}\')".format(i - 1, model.print_topic(i - 1, num_words).encode("utf-8")).replace(" ", ""))

	file = open("./{}/{}num_topics={}-num_words={}.csv".format(newsFile[:-4], stem, num_topics, num_words), "w")
	for sentence in sentence_array:
		sentence_num = sentence.replace("[", "").replace("(", "")[0]
		for word in sentence[sentence.find("\'") + 1:sentence.find("\'", sentence.find("\'") + 1)].split("+"):
			score = word[0:word.find("*")]
			keyword = word[word.find("\"") + 1:len(word) - 1]
			file.write(sentence_num + ";" + keyword + ";" + score + "\n")

	topics = []
	for sentence in sentence_array:
		topic = {}
		sentence_num = sentence.replace("[", "").replace("(", "")[0]
		topic['topic_id'] = sentence_num
		topic['keywords'] = []
		for word in sentence[sentence.find("\'") + 1:sentence.find("\'", sentence.find("\'") + 1)].split("+"):
			word_info = {}
			score = word[0:word.find("*")]
			keyword = word[word.find("\"") + 1:len(word) - 1]
			word_info['keyword'] = keyword
			word_info['score'] = score
			topic['keywords'].append(word_info)
		topics.append(topic)
	with open("./{}/{}num_topics={}-num_words={}.json".format(newsFile[:-4], stem, num_topics, num_words).replace("\n", ""), "w+") as json_file:  # overwrites.
		json.dump(topics, json_file, indent=2, ensure_ascii=False)

	arr = []
	for sentence in sentence_array:
		sentenceArr = []
		words = sentence[sentence.find("\'") + 1:sentence.find("\'", sentence.find("\'") + 1)].split("+")
		for word in words:
			keyword = word[word.find("\"") + 1:len(word) - 1]
			sentenceArr.append(keyword)
		arr.append(sentenceArr)
	file = open("./{}/{}num_topics={}-num_words={}.txt".format(newsFile[:-4], stem, num_topics, num_words), 'wb')
	pickle.dump(arr, file)



# Add the given file to the stopwords.
Example 62
Project: Plagiarism-Checker   Author: ieeecompsoc   File: bigram_check.py    MIT License
def pre_processing(text):
        """ returns pre-processed text in text{0}

        :param text: text1
        :type text: str
        :returns: finall
        :rtype: str


"""
        sent_tokenize_list = sent_tokenize(text)
        #print sent_tokenize_list
        #print len(sent_tokenize_list)
        #tokenise words
        #print stop_words
        words=word_tokenize(text)
        stop_words = str(stopwords.words('english'))
        alpha=stop_words.replace("u'", "")
    #print words
        result = []
    #print alpha
    #remove stop words
        for item in words:
            if item not in alpha:
                result.append(item)
    #print "Filtered",result
        fil=str(result)
    #remove punctuation
        repstr=" " * 32
        table=string.maketrans(string.punctuation,repstr)
        s=fil.translate(table)
    #return s


    #lemmatizing
        lemmatizer=WordNetLemmatizer()
        h=lemmatizer.lemmatize(s)
    #print "Lemma",lemmatizer.lemmatize(s)
    #stemming
        wordss=word_tokenize(h)
        ps=PorterStemmer()
        list1=[]
        for i in wordss:
            k=(ps.stem(i))
            list1.append(k)
    #print list1
        final= '  '.join(list1)
        finall=str(final)
        return finall 
Example 63
Project: aop-helpFinder   Author: jecarvaill   File: tm_module.py    GNU General Public License v3.0
def clean_abstract(abstract, case):
    """Method to clean and simplify an abstract by text mining process.
        1. Split abstract by sentences
        2. Split sentences by words
        3. Remove sentences which contain a negation word
        4. Remove stopwords from words in a sentence
        5. Stem process

    Return:
        abstract (str): cleaned and simplified abstract

    """
    # set list of tools
    negations = ['never', 'neither', 'no', 'none', 'nor', 'not', 'ain',
                 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven',
                 'isn', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn',
                 'weren', 'won', 'wouldn']
    punctuation = list(string.punctuation)
    stop = nltk.corpus.stopwords.words('english') + punctuation + ['\'s']

    # 1. split abstract by sentences
    sents = sent_tokenize(abstract)

    sents = [sent for sent in sents if not (bool(re.search('\d', sent) and
             'body weight' in sent))]

    # 2. split sentences by words
    abstract = [word_tokenize(sent) for sent in sents]

    # 3. remove sentences which contain a negation word
    abstract = [sent for sent in abstract if not
                any(negation in sent for negation in negations)]

    # 4. remove stopwords in sentences
    for i in range(len(abstract)):
        abstract[i] = [word for word in abstract[i] if word not in stop]

    # 5. stem process
    for i in range(len(abstract)):
        abstract[i] = stem_process(abstract[i])
        abstract[i] = ' '.join(abstract[i])

    # 6. AOD or KEr search case
    if case is True:
        abstract = ' '.join(abstract)
        return abstract
    else:
        return abstract