Python nltk.stem.snowball.SnowballStemmer() Examples

The following code examples show how to use nltk.stem.snowball.SnowballStemmer(). All of them are taken from open source Python projects.
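Before the project examples, here is a minimal usage sketch of the class itself, following NLTK's documented API: construct the stemmer with a language name (optionally with ignore_stopwords=True), then call stem() on individual words.

from nltk.stem.snowball import SnowballStemmer

print(SnowballStemmer.languages)   # tuple of supported language names

stemmer = SnowballStemmer("english")
print(stemmer.stem("running"))     # 'run'

# With ignore_stopwords=True, stop words are returned unchanged:
stemmer2 = SnowballStemmer("english", ignore_stopwords=True)
print(stemmer.stem("having"))      # 'have'
print(stemmer2.stem("having"))     # 'having'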

Example 1
Project: atap   Author: foxbook   File: converter.py    License: Apache License 2.0
def conversion(source, dest):
    """
    :param source: the unit of measure you have
    :param dest: the unit of measure need to convert to
    :return:
    """
    stemmer = SnowballStemmer('english')
    source = stemmer.stem(source)
    dest = stemmer.stem(dest)

    try:
        units = conv_dict.get(source).get('Units')[
            conv_dict.get(source).get('Destination').index(dest)
        ]
    except (AttributeError, ValueError):
        # AttributeError if the source unit is unknown (get() returns None),
        # ValueError if the destination unit is not listed for that source.
        units = None

    return units, source, dest 
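
Stemming both unit names lets callers pass singular or plural forms ("mile" or "miles") and still hit the same keys in conv_dict, which is defined elsewhere in the project. A hypothetical dictionary, purely to illustrate the lookup:

conv_dict = {
    # keys are stemmed unit names; note that 'kilometers' stems to 'kilomet'
    'mile': {'Destination': ['kilomet'], 'Units': [1.60934]},
}
print(conversion('miles', 'kilometers'))  # (1.60934, 'mile', 'kilomet')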
Example 2
Project: themarketingtechnologist   Author: thomhopmans   File: run.py    License: Apache License 2.0
def tokenize(text):
        """
        Tokenizes sequences of text and stems the tokens.
        :param text: String to tokenize
        :return: List with stemmed tokens
        """
        tokens = nltk.WhitespaceTokenizer().tokenize(text)
        tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
        tokens = [word for word in tokens if word not in stopwords.words('english')]
        tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
        stems = []
        stemmer = SnowballStemmer("english")
        for token in tokens:
            token = stemmer.stem(token)
            if token != "":
                stems.append(token)
        return stems 
Example 3
Project: spice-hate_speech_detection   Author: futurice   File: texttools.py    License: MIT License
def stemming_message_snowball(message, stemmings_to_words=dict()):
    from nltk.stem.snowball import SnowballStemmer
    from nltk.tokenize import casual_tokenize
    stemmer = SnowballStemmer('finnish')

    # `type(message) == None` could never be true; test the value itself
    if message is None:
        return '', stemmings_to_words

    # str.replace returns a new string, so the result must be reassigned
    message = message.replace('#', '')

    stemmed_message = []

    for word in casual_tokenize(message):

        stemmed_word = stemmer.stem(word.lower())
        stemmed_message.append(stemmed_word)
        stemmings_to_words[stemmed_word] = word

    stemmed_message = ' '.join(stemmed_message)

    return stemmed_message, stemmings_to_words 
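
A hypothetical call (the exact Finnish stem strings are not asserted here):

stemmed, mapping = stemming_message_snowball('Tämä on #esimerkki')
# `stemmed` is the space-joined stemmed message; `mapping` records one
# original surface form per stem, so stems can later be mapped back to words.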
Example 4
Project: Artificial-Intelligence-with-Python   Author: PacktPublishing   File: topic_modeler.py    License: MIT License
def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer 
    stemmer = SnowballStemmer('english')

    # Get the list of stop words 
    stop_words = stopwords.words('english')
    
    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words 
    tokens = [x for x in tokens if not x in stop_words]
    
    # Perform stemming on the tokenized words 
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed 
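
For example, with NLTK's English stop word list:

print(process("Dogs are running quickly"))
# ['dog', 'run', 'quick']   ('are' is dropped as a stop word)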
Example 5
Project: Pesquisas   Author: danilopcarlotti   File: active_learning_logreg.py    License: Apache License 2.0
def normalize_texts(self,texts,one_text=False):
        normal_texts = []
        tk = RegexpTokenizer(r'\w+')
        # stopwords = nltk.corpus.stopwords.words('portuguese')
        # stemmer = nltk.stem.RSLPStemmer()
        stopwords = nltk.corpus.stopwords.words('english')
        stemmer = SnowballStemmer("english")
        if one_text:
            texts = [texts]
        for t in texts:
            raw_text = t.lower()
            tokens = tk.tokenize(raw_text)
            tokenized_text = []
            for tkn in tokens:
                tkn = stemmer.stem(tkn)
                if tkn not in stopwords:
                    try:
                        float(tkn)
                    except ValueError:
                        # keep only non-numeric tokens
                        tokenized_text.append(tkn)
            normal_texts.append(tokenized_text)
        # return normal_texts
        # tf-idf of the texts (TfidfVectorizer expects raw strings, so join the tokens)
        vect = TfidfVectorizer()
        docs = [' '.join(tokens) for tokens in normal_texts]
        self.vect_fit = vect.fit(docs)
        tfidf = self.vect_fit.transform(docs)
        return tfidf.A
Example 6
Project: razzy-spinner   Author: rafasashi   File: test_stem.py    License: GNU General Public License v3.0
def test_russian(self):
        # Russian words both consisting of Cyrillic
        # and Roman letters can be stemmed.
        stemmer_russian = SnowballStemmer("russian")
        assert stemmer_russian.stem("авантненькая") == "авантненьк"
        assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k" 
Example 7
Project: razzy-spinner   Author: rafasashi   File: test_stem.py    License: GNU General Public License v3.0
def test_german(self):
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

        assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
        assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'

        assert stemmer_german.stem("keinen") == 'kein'
        assert stemmer_german2.stem("keinen") == 'keinen' 
Example 8
Project: razzy-spinner   Author: rafasashi   File: test_stem.py    License: GNU General Public License v3.0
def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu' 
Example 9
Project: razzy-spinner   Author: rafasashi   File: test_stem.py    License: GNU General Public License v3.0
def test_short_strings_bug(self):
        stemmer = SnowballStemmer('english')
        assert stemmer.stem("y's") == 'y' 
Example 10
Project: OpenBottle   Author: xiaozhuchacha   File: test_stem.py    License: MIT License
def test_russian(self):
        # Russian words both consisting of Cyrillic
        # and Roman letters can be stemmed.
        stemmer_russian = SnowballStemmer("russian")
        assert stemmer_russian.stem("авантненькая") == "авантненьк"
        assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k" 
Example 11
Project: OpenBottle   Author: xiaozhuchacha   File: test_stem.py    License: MIT License
def test_german(self):
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

        assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
        assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'

        assert stemmer_german.stem("keinen") == 'kein'
        assert stemmer_german2.stem("keinen") == 'keinen' 
Example 12
Project: OpenBottle   Author: xiaozhuchacha   File: test_stem.py    License: MIT License
def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu' 
Example 13
Project: OpenBottle   Author: xiaozhuchacha   File: test_stem.py    License: MIT License
def test_short_strings_bug(self):
        stemmer = SnowballStemmer('english')
        assert stemmer.stem("y's") == 'y' 
Example 14
Project: OpenBottle   Author: xiaozhuchacha   File: test_stem.py    License: MIT License
def test_russian(self):
        # Russian words both consisting of Cyrillic
        # and Roman letters can be stemmed.
        stemmer_russian = SnowballStemmer("russian")
        assert stemmer_russian.stem("авантненькая") == "авантненьк"
        assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k" 
Example 15
Project: OpenBottle   Author: xiaozhuchacha   File: test_stem.py    License: MIT License
def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu' 
Example 16
Project: OpenBottle   Author: xiaozhuchacha   File: test_stem.py    License: MIT License
def test_short_strings_bug(self):
        stemmer = SnowballStemmer('english')
        assert stemmer.stem("y's") == 'y' 
Example 17
Project: Health-Checker   Author: KriAga   File: test_stem.py    License: MIT License
def test_arabic(self):
        """
        this unit testing for test the snowball arabic light stemmer
        this stemmer deals with prefixes and suffixes
        """
        ar_stemmer = SnowballStemmer("arabic")
        assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
        assert ar_stemmer.stem("العربية") == "عرب"
        assert ar_stemmer.stem("فقالوا") == "قال"
        assert ar_stemmer.stem("الطالبات") == "طالب"
        assert ar_stemmer.stem("فالطالبات") == "طالب"
        assert ar_stemmer.stem("والطالبات") == "طالب"
        assert ar_stemmer.stem("الطالبون") == "طالب" 
Example 18
Project: Health-Checker   Author: KriAga   File: test_stem.py    License: MIT License
def test_russian(self):
        # Russian words both consisting of Cyrillic
        # and Roman letters can be stemmed.
        stemmer_russian = SnowballStemmer("russian")
        assert stemmer_russian.stem("авантненькая") == "авантненьк"
        assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k" 
Example 19
Project: Health-Checker   Author: KriAga   File: test_stem.py    License: MIT License
def test_german(self):
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

        assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
        assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'

        assert stemmer_german.stem("keinen") == 'kein'
        assert stemmer_german2.stem("keinen") == 'keinen' 
Example 20
Project: Health-Checker   Author: KriAga   File: test_stem.py    License: MIT License
def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu' 
Example 21
Project: Health-Checker   Author: KriAga   File: test_stem.py    License: MIT License
def test_short_strings_bug(self):
        stemmer = SnowballStemmer('english')
        assert stemmer.stem("y's") == 'y' 
Example 22
Project: Chrono   Author: AmyOlex   File: Chrono_createMLTrainingMatrix.py    License: GNU General Public License v3.0
def extract_stem_feature(reftok, obs_dict, obs_list):
    my_str = reftok.getText().lower()
    for r in ["quarter","decades","decade","yesterday","yesterdays","today","todays","day","week","month","year","daily","weekly","monthly","yearly","century","minute","second","hour","hourly","days","weeks","months","years","centuries", "minutes","seconds","hours"]:
        idx = my_str.find(r)
        if(idx >= 0):
            obs_dict.update({r:0})
            obs_list.update({r:1})
            return(obs_list, obs_dict)

    stemmer = SnowballStemmer("english")
    #print(stemmer.stem(reftok.getText().lower()))
    obs_dict.update({stemmer.stem(reftok.getText().lower()): 0})
    obs_list.update({stemmer.stem(reftok.getText().lower()): 1})
    
    return(obs_list, obs_dict)
    
######
## END Function
######  


## This method determines if the target word has a numeric feature directly before or after it.
# @author Amy Olex
# @param reftok_list The list of reference tokens.
# @param reftok_idx The index of the target token that overlaps with the gold standard.
# @return Returns a boolean value of 1 if a numeric feature exists, 0 otherwise. 
Example 23
Project: chirp   Author: 9b   File: helpers.py    License: MIT License
def cleaned_tokens(tokens):
    """Clean the tokens by removing stop words and stemming."""
    # stemmer = SnowballStemmer("english")
    # stemmed = [stemmer.stem(token) for token in tokens]
    s = set(stopwords.words('english'))
    tokens = [x.lower() for x in tokens if not x.isdigit()]
    return filter(lambda w: not w.lower() in s, tokens) 
Example 24
Project: message-author-classifier   Author: IvayloAtanasov   File: stem.py    License: MIT License
def stem_message(message):
    # TODO: used a russian stemmer, english words remain unprocessed.
    # TODO: Pycache kicks in since we instantiate stemmer in a massive for loop. Fix?
    stemmer = SnowballStemmer("russian")
    # remove punctuation
    # ref 1: https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
    # ref 2: https://stackoverflow.com/questions/23175809/typeerror-translate-takes-one-argument-2-given-python
    message = message.translate({ord(c): None for c in string.punctuation})
    # stem each word from message and compose again
    stemmed_words = [stemmer.stem(word) for word in message.split()]
    return str.join(' ', stemmed_words) 
Example 25
Project: ijcai2019-relis   Author: UKPLab   File: ner_cos_vector.py    License: MIT License
def __init__(self,docs):
        self.documents = []
        self.sentences = []
        self.stoplist = list(stopwords.words(LANGUAGE))
        self.stemmer = SnowballStemmer(LANGUAGE)
        for doc in docs:
            self.documents.append(' '.join(doc[1]))
            self.sentences.extend(doc[1]) 
Example 26
Project: ijcai2019-relis   Author: UKPLab   File: state_type.py    License: MIT License
def __init__(self, sum_token_length, base_length, sent_num, block_num, language):
        # hyper parameters
        self.reward_lambda = 0.9

        # learning arguments
        if sum_token_length is not None:
            self.sum_token_length = sum_token_length
        else:
            self.sum_token_length = 99999

        self.state_length_computer = StateLengthComputer(block_num,base_length,sent_num)
        self.vec_length = self.state_length_computer.getTotalLength()
        self.summary_vector_length = self.state_length_computer.getStatesLength(block_num)
        self.language = language

        # stemmers and stop words list
        self.stemmer = SnowballStemmer(self.language)
        self.stoplist = set(stopwords.words(self.language))

        # class variables
        #self.draft_summary = ''
        self.draft_summary_list = []
        self.historical_actions = []
        self.available_sents = [i for i in range(0,sent_num+1)]
        self.terminal_state = 0 # 0 stands for non-terminal, and 1 stands for terminal
        self.draft_summary_length = 0

        #some flags/options
        self.newReward = False 
Example 27
Project: ijcai2019-relis   Author: UKPLab   File: cross_topic_sentence_vector.py    License: MIT License
def __init__(self,docs):
        self.documents = []
        self.sentences = []
        self.stoplist = list(stopwords.words(LANGUAGE))
        self.stemmer = SnowballStemmer(LANGUAGE)
        for doc in docs:
            self.documents.append(' '.join(doc[1]))
            self.sentences.extend(doc[1]) 
Example 28
Project: api   Author: CheckFake   File: models.py    License: GNU Affero General Public License v3.0
def tokens(text) -> List[str]:
        root_words = []
        stemmer = SnowballStemmer("french")
        for i in range(len(text)):
            root_words.append(stemmer.stem(text[i]))
        return root_words 
Example 29
Project: research-summarization   Author: blendle   File: stemmer.py    License: ISC License
def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.stemmer = SnowballStemmer('english') 
Example 30
Project: FancyWord   Author: EastonLee   File: test_stem.py    License: GNU General Public License v3.0
def test_russian(self):
        # Russian words both consisting of Cyrillic
        # and Roman letters can be stemmed.
        stemmer_russian = SnowballStemmer("russian")
        assert stemmer_russian.stem("авантненькая") == "авантненьк"
        assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k" 
Example 31
Project: FancyWord   Author: EastonLee   File: test_stem.py    License: GNU General Public License v3.0
def test_german(self):
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

        assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
        assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'

        assert stemmer_german.stem("keinen") == 'kein'
        assert stemmer_german2.stem("keinen") == 'keinen' 
Example 32
Project: FancyWord   Author: EastonLee   File: test_stem.py    License: GNU General Public License v3.0
def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu' 
Example 33
Project: FancyWord   Author: EastonLee   File: test_stem.py    License: GNU General Public License v3.0
def test_short_strings_bug(self):
        stemmer = SnowballStemmer('english')
        assert stemmer.stem("y's") == 'y' 
Example 34
Project: texttk   Author: fmpr   File: texttk.py    License: GNU General Public License v3.0
def __init__(self, decode_error='strict', strip_accents='unicode', ignore_list=[], lowercase=True, \
						remove_html=True, join_urls=True, use_bigrams=True, use_ner=True, stanford_ner_path="", \
						use_lemmatizer=False, max_df=0.95, min_df=1, max_features=None):
		self.stanford_ner_path = stanford_ner_path		# path to stanford NER
		self.decode_error = decode_error				# options: {‘strict’, ‘ignore’, ‘replace’}
		self.strip_accents = strip_accents				# options: {‘ascii’, ‘unicode’, None}
		self.ignore_list = ignore_list
		self.lowercase = lowercase
		self.remove_html = remove_html
		self.join_urls = join_urls	
		self.use_bigrams = use_bigrams
		self.use_ner = use_ner
		self.use_lemmatizer = use_lemmatizer			# use lemmatizer instead of stemmer?
		self.max_df = max_df							# maximum document frequency
		self.min_df = min_df							# remove terms that occur in less than min_df documents
		self.max_features = max_features 				# keep only top-N words according to tf across corpus

		self.sentence_splitter = PunktSentenceTokenizer().tokenize 		# Punkt sentence splitter
		self.stemmer = SnowballStemmer("english").stem					# Snowball stemmer
		self.lemmatizer = WordNetLemmatizer().lemmatize 				# WordNet lemmatizer
		self.base_tokenizer = CountVectorizer().build_tokenizer()		# sklearn tokenizer works the best, I think...
		self.stop_words = stopwords.words("english")					# nltk list of 128 stopwords
		self.token_pattern = re.compile(r'(?u)\b(\w*[a-zA-Z_]\w+|\w+[a-zA-Z_]\w*)\b') 	# default value was r'(?u)\b\w\w+\b'
		self.numeric_pattern = re.compile(r'^[0-9]+$')					# number regex
		self.url_pattern = re.compile(r'((http://)?(www\..*?\.\w+).*?)\s')
		self.compound_pattern = re.compile(r'\w+(\-\w+)+')
		if self.use_lemmatizer:
			self.tokenizer = CustomTokenizer(self.base_tokenizer, self.lemmatizer, self.token_pattern, self.numeric_pattern)
		else:
			self.tokenizer = CustomTokenizer(self.base_tokenizer, self.stemmer, self.token_pattern, self.numeric_pattern) 
Example 35
Project: Twitter-Personalities   Author: rohilshah95   File: pyPredict.py    License: GNU General Public License v3.0
def preproc(s):
	#s=emoji_pattern.sub(r'', s) # no emoji
	s= unidecode(s)
	POSTagger=preprocess(s)
	#print(POSTagger)

	tweet=' '.join(POSTagger)
	stop_words = set(stopwords.words('english'))
	word_tokens = word_tokenize(tweet)
	#filtered_sentence = [w for w in word_tokens if not w in stop_words]
	filtered_sentence = []
	for w in POSTagger:
	    if w not in stop_words:
	        filtered_sentence.append(w)
	#print(word_tokens)
	#print(filtered_sentence)
	stemmed_sentence=[]
	stemmer2 = SnowballStemmer("english", ignore_stopwords=True)
	for w in filtered_sentence:
		stemmed_sentence.append(stemmer2.stem(w))
	#print(stemmed_sentence)

	temp = ' '.join(c for c in stemmed_sentence if c not in string.punctuation) 
	preProcessed=temp.split(" ")
	final=[]
	for i in preProcessed:
		if i not in final:
			if i.isdigit():
				pass
			else:
				if 'http' not in i:
					final.append(i)
	temp1=' '.join(c for c in final)
	#print(preProcessed)
	return temp1 
Example 36
Project: support-tickets-classification   Author: karolzak   File: 2_train_and_eval_model.py    License: MIT License
def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)]) 
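
The method above overrides build_analyzer inside a CountVectorizer subclass whose definition is not shown in the snippet. A minimal self-contained sketch of the pattern (the usage lines are illustrative, not from the project):

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        # Wrap the default analyzer so every token is stemmed on the fly.
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        return lambda doc: [stemmer.stem(w) for w in analyzer(doc)]

vectorizer = StemmedCountVectorizer(stop_words='english')
vectorizer.fit_transform(["closing tickets", "the ticket was closed"])
print(sorted(vectorizer.vocabulary_))  # ['close', 'ticket']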
Example 37
Project: hugo_similar_posts   Author: elbaulp   File: similar_posts.py    License: Apache License 2.0
def tokenizer_snowball(text):
    stemmer = SnowballStemmer("spanish")
    return [stemmer.stem(word) for word in text.split() if word not in stop] 
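
The module-level `stop` is defined elsewhere in the project; assuming it is NLTK's Spanish stop word list, a call would look like this (output stems not asserted):

from nltk.corpus import stopwords
stop = set(stopwords.words('spanish'))  # assumed definition of the module-level name
print(tokenizer_snowball('Entradas similares en el blog'))  # stems of the non-stopword tokens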
Example 38
Project: Python-Machine-Learning-Cookbook-Second-Edition   Author: PacktPublishing   File: topic_modeling.py    License: MIT License
def __init__(self):
        # Create a regular expression tokenizer
        self.tokenizer = RegexpTokenizer(r'\w+')

        # get the list of stop words 
        self.stop_words_english = stopwords.words('english')

        # Create a Snowball stemmer 
        self.stemmer = SnowballStemmer('english')
        
    # Tokenizing, stop word removal, and stemming 
Example 39
Project: cvscan   Author: skcript   File: language_parser.py    License: MIT License
def clean_resume(resume_text):

  cleaned_resume = []

  # replacing newlines and punctuations with space
  resume_text = resume_text.replace('\t', ' ').replace('\n', ' ')
  for punctuation in string.punctuation:
    resume_text = resume_text.replace(punctuation, ' ')
  resume_text = resume_text.split()

  # removing stop words and digits (stemming is prepared but commented out)
  stemmer = SnowballStemmer("english")
  for word in resume_text:
    if word not in stopwords.words('english') and not word.isdigit():
      cleaned_resume.append(word.lower())  # stemmer.stem(word)

  cleaned_resume = ' '.join(cleaned_resume)
  return cleaned_resume 
Example 40
Project: variant2literature   Author: ailabstw   File: pygnormplus.py    License: GNU General Public License v3.0
def __init__(self):
        self.tagger = CRFPP.Tagger("-m /app/models/GNR.Model")
        self.normalizer = GeneNormalizer()
        self.stemmer = SnowballStemmer('english')
        self.gene_dict = self.load_gene_symbols() 
Example 41
Project: variant2literature   Author: ailabstw   File: pytmvar.py    License: GNU General Public License v3.0
def __init__(self):
        self.tagger = CRFPP.Tagger("-m /app/models/MentionExtractionUB.Model")
        self.stemmer = SnowballStemmer('english')
        # self.pos_tagger = PerceptronTagger()
        self.regex_dna_mutation_str = utils.readlines('/app/models/tmvar_regexes/DNAMutation.RegEx.txt')
        self.regex_protein_mutation_str = utils.readlines('/app/models/tmvar_regexes/ProteinMutation.RegEx.txt')
        self.regex_snp_mutation_str = utils.readlines('/app/models/tmvar_regexes/SNP.RegEx.txt') 
Example 42
Project: nearduplicate   Author: vedmathai   File: readFiles.py    License: GNU General Public License v3.0
def normalize(mypath):
    corpus={}
    corpus1={}
    corpus2={}
    corpus3={}
    corpus4={}
    corpus5={}
    for item in getFileNames(mypath):
        f=readFile(item[0]).split()
        stop = stopwords.words('english')
        stemmer = PorterStemmer()
        stemmer2 = SnowballStemmer("english")
        corpus[item[1]]=f
        corpus['desc']="Unfiltered direct corpus."
        corpus1[item[1]] = [stemmer2.stem(stemmer.stem(w)) for w in f]
        corpus1['desc']="Stopwords not removed and but stemmed first with Porter stemmer and then Snowball stemmer."
        corpus2[item[1]] = [w for w in f if w.lower() not in stop]
        corpus2['desc']="Stemming not done. But stopwords removed with nltk stopword list."
        corpus3[item[1]] = [stemmer2.stem(stemmer.stem(w)) for w in f if w.lower() not in stop]
        corpus3['desc']= "Stopwords removed and then both stemmers used."
        corpus4[item[1]] = [corpus3[item[1]][i]+" "+corpus3[item[1]][i+1] for i in range(len(corpus3[item[1]])-1)]
        corpus4['desc'] = "Stopwords removed and stemmed and then bigrams taken."
        corpus5[item[1]] = [corpus3[item[1]][i]+" "+corpus3[item[1]][i+1]+" " + corpus3[item[1]][i+2] for i in range(len(corpus3[item[1]])-2)]
        corpus5['desc'] = "Stopwords removed and stemmed and then trigrams taken."
        #print item, float(len(corpus[item]))/len(f)
    return [corpus, corpus1, corpus2, corpus3, corpus4, corpus5] 
Example 43
Project: Raspberry-Pi-3-Cookbook-for-Python-Programmers-Third-Edition   Author: PacktPublishing   File: topic_modeling.py    License: MIT License
def __init__(self):
        # Create a regular expression tokenizer
        self.tokenizer = RegexpTokenizer(r'\w+')

        # get the list of stop words
        self.english_stop_words= stopwords.words('english')

        # Create a Snowball stemmer
        self.snowball_stemmer = SnowballStemmer('english')

    # Tokenizing, stop word removal, and stemming 
Example 44
Project: honours_project   Author: JFriel   File: test_stem.py    License: GNU General Public License v3.0
def test_russian(self):
        # Russian words both consisting of Cyrillic
        # and Roman letters can be stemmed.
        stemmer_russian = SnowballStemmer("russian")
        assert stemmer_russian.stem("авантненькая") == "авантненьк"
        assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k" 
Example 45
Project: honours_project   Author: JFriel   File: test_stem.py    License: GNU General Public License v3.0
def test_german(self):
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

        assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
        assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'

        assert stemmer_german.stem("keinen") == 'kein'
        assert stemmer_german2.stem("keinen") == 'keinen' 
Example 46
Project: honours_project   Author: JFriel   File: test_stem.py    License: GNU General Public License v3.0
def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu' 
Example 47
Project: honours_project   Author: JFriel   File: test_stem.py    License: GNU General Public License v3.0
def test_short_strings_bug(self):
        stemmer = SnowballStemmer('english')
        assert stemmer.stem("y's") == 'y' 
Example 48
Project: honours_project   Author: JFriel   File: test_stem.py    License: GNU General Public License v3.0
def test_russian(self):
        # Russian words both consisting of Cyrillic
        # and Roman letters can be stemmed.
        stemmer_russian = SnowballStemmer("russian")
        assert stemmer_russian.stem("авантненькая") == "авантненьк"
        assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k" 
Example 49
Project: honours_project   Author: JFriel   File: test_stem.py    License: GNU General Public License v3.0
def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu' 
Example 50
Project: honours_project   Author: JFriel   File: test_stem.py    License: GNU General Public License v3.0
def test_short_strings_bug(self):
        stemmer = SnowballStemmer('english')
        assert stemmer.stem("y's") == 'y' 
Example 51
Project: nlppreprocess   Author: gaganmanku96   File: nlppreprocess.py    License: MIT License
def __init__(self, remove_stopwords=True, replace_words=True,
                 remove_numbers=True, remove_html_tags=True,
                 remove_punctuations=True, lemmatize=False,
                 lemmatize_method='wordnet'):
        """
        This package contains functions that can help during the
        preprocessing of text data.
        :param remove_stopwords: boolean
            default value = True
        :param replace_words: boolean
            default value = True
        """
        if (type(remove_stopwords) != bool or
            type(replace_words) != bool or
            type(remove_numbers) != bool or
            type(remove_html_tags) != bool or
            type(remove_punctuations) != bool or
            type(lemmatize) != bool):
            raise Exception("Error - expecting a boolean parameter")
        if lemmatize_method not in ['wordnet', 'snowball']:
            raise Exception("Error - lemmatizer method not supported")
        self.doc = None
        self.lemmatizer = None
        self.remove_stopwords = remove_stopwords
        self.replace_words = replace_words
        self.remove_numbers = remove_numbers
        self.remove_html_tags = remove_html_tags
        self.remove_punctations = remove_punctuations
        self.lemmatize_method = lemmatize_method
        self.lemmatize = lemmatize
        self.stopword_list = set(stopwords)
        self.replacement_list = to_replace
        if self.lemmatize_method == 'wordnet':
            self.lemmatizer = WordNetLemmatizer()
        if self.lemmatize_method == 'snowball':
            self.lemmatizer = SnowballStemmer('english') 
Example 52
Project: TextRank   Author: naiveHobo   File: preprocessing.py    License: MIT License
def __init__(self):
        self.STOPWORDS = TextProcessor.__load_stopwords(path="../stopwords.txt")
        self.LEMMATIZER = WordNetLemmatizer()
        self.STEMMER = SnowballStemmer("english")
        self.PUNCTUATION = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
        self.NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
        self.PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE) 
Example 53
Project: Topic-Distance-and-Coherence   Author: Renata1995   File: MySentenceStemmer.py    License: Apache License 2.0
def __init__(self, st):
		self.sstem = SnowballStemmer("english")
		self.wstem = WordNetLemmatizer()
		self.st = st
Example 54
Project: Topik   Author: saurabh-deochake   File: query_processing.py    License: GNU General Public License v3.0
def processQuery(self):
		stop = stopwords.words('english')
		stemmer = SnowballStemmer("english")
        
		words = [i for i in self.text.split() if i not in stop]
		
		stemwords = []
		for w in words:
			stemwords.append(stemmer.stem(w))
        
		self.processedText = " ".join(stemwords)
		
		return 
Example 55
Project: sentiment-analyzer   Author: avsek477   File: preprocessor.py    License: MIT License
def stemma(review):
    stemmer = SnowballStemmer("english")
    stemmedData = []
    word_tokens = word_tokenize(review)
    for word in word_tokens:
        stemmedData.append(stemmer.stem(word))
    return " ".join(stemmedData) 
Example 56
Project: aop-helpFinder   Author: jecarvaill   File: test_stem.py    License: GNU General Public License v3.0
def test_arabic(self):
        """
        this unit testing for test the snowball arabic light stemmer
        this stemmer deals with prefixes and suffixes
        """
        ar_stemmer = SnowballStemmer("arabic")
        assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
        assert ar_stemmer.stem("العربية") == "عرب"
        assert ar_stemmer.stem("فقالوا") == "قال"
        assert ar_stemmer.stem("الطالبات") == "طالب"
        assert ar_stemmer.stem("فالطالبات") == "طالب"
        assert ar_stemmer.stem("والطالبات") == "طالب"
        assert ar_stemmer.stem("الطالبون") == "طالب" 
Example 57
Project: aop-helpFinder   Author: jecarvaill   File: test_stem.py    License: GNU General Public License v3.0
def test_russian(self):
        # Russian words both consisting of Cyrillic
        # and Roman letters can be stemmed.
        stemmer_russian = SnowballStemmer("russian")
        assert stemmer_russian.stem("авантненькая") == "авантненьк"
        assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k" 
Example 58
Project: aop-helpFinder   Author: jecarvaill   File: test_stem.py    License: GNU General Public License v3.0
def test_german(self):
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

        assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
        assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'

        assert stemmer_german.stem("keinen") == 'kein'
        assert stemmer_german2.stem("keinen") == 'keinen' 
Example 59
Project: aop-helpFinder   Author: jecarvaill   File: test_stem.py    License: GNU General Public License v3.0
def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu' 
Example 60
Project: aop-helpFinder   Author: jecarvaill   File: test_stem.py    License: GNU General Public License v3.0
def test_short_strings_bug(self):
        stemmer = SnowballStemmer('english')
        assert stemmer.stem("y's") == 'y' 
Example 61
Project: CLEF2018-VQA-Med   Author: youngzhou97qz   File: unfinished_pytorch.py    License: MIT License
def compute_bleu(self, predictions):
        warnings.filterwarnings('ignore')
        nltk.download('punkt')
        nltk.download('stopwords')
        stops = set(stopwords.words("english"))
        stemmer = SnowballStemmer("english")
        translator = str.maketrans('', '', string.punctuation)
        candidate_pairs = self.readresult(predictions)
        gt_pairs = self.readresult(self.gt)
        max_score = len(gt_pairs)
        current_score = 0
        i = 0
        for image_key in candidate_pairs:
            candidate_caption = candidate_pairs[image_key]
            gt_caption = gt_pairs[image_key]
            if not VqaMedEvaluator.case_sensitive:
                candidate_caption = candidate_caption.lower()
                gt_caption = gt_caption.lower()
            candidate_words = nltk.tokenize.word_tokenize(candidate_caption.translate(translator))
            gt_words = nltk.tokenize.word_tokenize(gt_caption.translate(translator))
            if VqaMedEvaluator.remove_stopwords:
                candidate_words = [word for word in candidate_words if word.lower() not in stops]
                gt_words = [word for word in gt_words if word.lower() not in stops]
            if VqaMedEvaluator.stemming:
                candidate_words = [stemmer.stem(word) for word in candidate_words]
                gt_words = [stemmer.stem(word) for word in gt_words]
            try:
                if len(gt_words) == 0 and len(candidate_words) == 0:
                    bleu_score = 1
                else:
                    bleu_score = nltk.translate.bleu_score.sentence_bleu([gt_words], candidate_words, smoothing_function=SmoothingFunction().method0)
            except ZeroDivisionError:
                # ensure bleu_score is defined even when BLEU computation fails
                bleu_score = 0
            current_score += bleu_score
        return current_score / max_score 
Example 62
Project: serverless-chatbots-workshop   Author: datteswararao   File: test_stem.py    License: Apache License 2.0
def test_russian(self):
        # Russian words both consisting of Cyrillic
        # and Roman letters can be stemmed.
        stemmer_russian = SnowballStemmer("russian")
        assert stemmer_russian.stem("авантненькая") == "авантненьк"
        assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k" 
Example 63
Project: serverless-chatbots-workshop   Author: datteswararao   File: test_stem.py    License: Apache License 2.0
def test_german(self):
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

        assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
        assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'

        assert stemmer_german.stem("keinen") == 'kein'
        assert stemmer_german2.stem("keinen") == 'keinen' 
Example 64
Project: serverless-chatbots-workshop   Author: datteswararao   File: test_stem.py    License: Apache License 2.0
def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu' 
Example 65
Project: serverless-chatbots-workshop   Author: datteswararao   File: test_stem.py    License: Apache License 2.0
def test_short_strings_bug(self):
        stemmer = SnowballStemmer('english')
        assert stemmer.stem("y's") == 'y' 
Example 66
Project: serverless-chatbots-workshop   Author: datteswararao   File: test_stem.py    License: Apache License 2.0
def test_russian(self):
        # Russian words both consisting of Cyrillic
        # and Roman letters can be stemmed.
        stemmer_russian = SnowballStemmer("russian")
        assert stemmer_russian.stem("авантненькая") == "авантненьк"
        assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k" 
Example 67
Project: serverless-chatbots-workshop   Author: datteswararao   File: test_stem.py    License: Apache License 2.0
def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu' 
Example 68
Project: serverless-chatbots-workshop   Author: datteswararao   File: test_stem.py    License: Apache License 2.0
def test_short_strings_bug(self):
        stemmer = SnowballStemmer('english')
        assert stemmer.stem("y's") == 'y' 
Example 69
Project: Quora-Question-Pairs   Author: rupak-118   File: MaLSTM_train.py    License: MIT License
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = True, remove_stopwords = True):
    
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Use either stemming or lemmatization; there is no benefit to using both together
    
    Output : Returns the processed text corpus
    
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    ''' All stopwords except the 'wh-' words are removed '''
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    
    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language = 'english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else :
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    
    
    return corpus 
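
A hypothetical call, with cleaning disabled since text_clean is defined elsewhere in the project:

corpus = ["what movies are worth watching"]
print(preprocess(corpus, keep_list=[], cleaning=False,
                 stemming=True, stem_type='snowball',
                 lemmatization=False))
# [['what', 'movi', 'worth', 'watch']]   (wh-words survive stop word removal)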
Example 70
Project: Quora-Question-Pairs   Author: rupak-118   File: test.py    License: MIT License
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = True, remove_stopwords = True):
    
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Use either stemming or lemmatization; there is no benefit to using both together
    
    Output : Returns the processed text corpus
    
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    ''' All stopwords except the 'wh-' words are removed '''
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    
    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language = 'english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else :
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    
        
    return corpus 
Example 71
Project: dd-genomics   Author: HazyResearch   File: hpoterms2mentions.py    License: Apache License 2.0
def main():
    # Load the dictionaries we need
    stopwords_dict = load_dict("stopwords")
    hpoterm_phenotype_abnormalities = load_dict(
        "hpoterm_phenotype_abnormalities")
    # Load the stemmer from NLTK
    stemmer = SnowballStemmer("english")
    if len(sys.argv) != 2:
        sys.stderr.write("USAGE: {} DICT\n".format(sys.argv[0]))
        sys.exit(1)
    with open(sys.argv[1], 'rt') as dict_file:
        for line in dict_file:
            # Skip empty lines
            if line.strip() == "":
                continue
            hpo_id, name, definition = line.strip().split("\t")
            # Skip if this is not a phenotypic abnormality
            if hpo_id not in hpoterm_phenotype_abnormalities:
                continue
            tokens = name.split()
            if len(tokens) == 1:
                name_stems = [tokens[0].casefold(), ]
            else:
                # Compute the stems of the name
                name_stems = set()
                for word in tokens:
                    # Remove parenthesis and commas and colons
                    if word[0] == "(":
                        word = word[1:]
                    if word[-1] == ")":
                        word = word[:-1]
                    if word[-1] == ",":
                        word = word[:-1]
                    if word[-1] == ":":
                        word = word[:-1]
                    # Only process non stop-words AND single letters
                    if (word.casefold() not in stopwords_dict and word not in
                            ORDINALS) or len(word) == 1:
                        # split words that contain a "/"
                        if word.find("/") != - 1:
                            for part in word.split("/"):
                                name_stems.add(stemmer.stem(part))
                        else:
                            name_stems.add(stemmer.stem(word))
            print("\t".join([hpo_id, name, "|".join(name_stems)])) 
Example 72
Project: Identificador-Fraude-Enron   Author: luisneto98   File: parse_out_email_text.py    License: MIT License
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words
        in the email (space-separated) 
        
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        
        """


    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        ### remove punctuation (string.maketrans with two arguments is the
        ### Python 2 API; on Python 3 use str.maketrans / str.translate)
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

        ### project part 2: comment out the line below
        #words = text_string


        ### split the text string into individual words, stem each word,
        ### and append the stemmed word to words (make sure there's a single
        ### space between each stemmed word)
        text_vector = text_string.split()
        stemmer = SnowballStemmer("english")
        words = ""
        sw = stopwords.words("english")
        
        for i in range(len(text_vector)):
            text_vector[i] = stemmer.stem(text_vector[i])

        for i in range(len(text_vector)):
            words = words + text_vector[i]
            if(i != len(text_vector)-1):
                words = words + " "
    return words.encode() 
Example 73
Project: HOTT   Author: IBM   File: data.py    License: MIT License
def reduce_vocab(bow_data, vocab, embed_vocab, embed_aggregate='mean'):
    """Reduce vocabulary size by stemming and removing stop words.
    """
    vocab = np.array(vocab)
    short = np.array([len(w) > 2 for w in vocab])
    stop_words = set(stopwords.words('english'))
    stop = np.array([w not in stop_words for w in vocab])
    reduced_vocab = vocab[np.logical_and(short, stop)]
    reduced_bow_data = bow_data[:, np.logical_and(short, stop)]
    stemmer = SnowballStemmer("english")
    stemmed_dict = {}
    stemmed_idx_mapping = {}
    stemmed_vocab = []
    for i, w in enumerate(reduced_vocab):
        stem_w = stemmer.stem(w)
        if stem_w in stemmed_vocab:
            stemmed_dict[stem_w].append(w)
            stemmed_idx_mapping[stemmed_vocab.index(stem_w)].append(i)
        else:
            stemmed_dict[stem_w] = [w]
            stemmed_vocab.append(stem_w)
            stemmed_idx_mapping[stemmed_vocab.index(stem_w)] = [i]

    stemmed_bow_data = np.zeros((bow_data.shape[0], len(stemmed_vocab)),
                                dtype=int)  # np.int is removed in newer NumPy; plain int works everywhere
    for i in range(len(stemmed_vocab)):
        stemmed_bow_data[:, i] = reduced_bow_data[:, stemmed_idx_mapping[i]].sum(axis=1).flatten()

    word_counts = stemmed_bow_data.sum(axis=0)
    stemmed_reduced_vocab = np.array(stemmed_vocab)[word_counts > 2].tolist()
    stemmed_reduced_bow_data = stemmed_bow_data[:, word_counts > 2]

    stemmed_reduced_embed_vocab = {}
    for w in stemmed_reduced_vocab:
        old_w_embed = [embed_vocab[w_old] for w_old in stemmed_dict[w]]
        if embed_aggregate == 'mean':
            new_w_embed = np.mean(old_w_embed, axis=0)
        elif embed_aggregate == 'first':
            new_w_embed = old_w_embed[0]
        else:
            print('Unknown embedding aggregation')
            break
        stemmed_reduced_embed_vocab[w] = new_w_embed

    return (stemmed_reduced_vocab,
            stemmed_reduced_embed_vocab,
            stemmed_reduced_bow_data) 
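
A toy call illustrating the expected inputs and outputs (all data here is made up):

import numpy as np
vocab = ['running', 'runs', 'the', 'quickly']
embed_vocab = {w: np.ones(5) * i for i, w in enumerate(vocab)}
bow = np.array([[3, 2, 5, 1], [1, 4, 2, 0], [2, 1, 3, 2]])
stemmed_vocab, stemmed_embed, stemmed_bow = reduce_vocab(bow, vocab, embed_vocab)
print(stemmed_vocab)  # ['run', 'quick']: 'the' is a stop word; 'running'/'runs' merge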
Example 74
Project: Movie-Recommendation   Author: pncnmnp   File: content_based.py    License: MIT License
def make_keywords(self, df):
		"""
            param: df - movies pandas DataFrame

            return: pandas DataFrame with attribute 'all_keys', 
                    which combines crew and cast members, movie keywords, and genres.
		"""
		stemmer = SnowballStemmer("english")
		df["keywords"] = (
			df["keywords"]
			.apply(literal_eval)
			.apply(
				lambda keywords: [stemmer.stem(k["name"]) for k in keywords]
				if isinstance(keywords, list)
				else list()
			)
		)
		df = df.merge(self.md_credits, on="id")
		df["cast"] = (
			df["cast"]
			.apply(literal_eval)
			.apply(
				lambda actors: [
					# To count actor name as one word like 'tomcruise'
					actor["name"].lower().replace(" ", "")
					for actor in actors[:ACTOR_LIMIT]
				]
				if isinstance(actors, list)
				else list()
			)
		)
		df["crew"] = (
			df["crew"]
			.apply(literal_eval)
			.apply(
				lambda crews: [
					# To count director name as one word like "stanleykubrick"
					crew["name"].lower().replace(" ", "")
					for crew in crews
					if crew["job"] in CREW
				]
				if isinstance(crews, list)
				else list()
			)
		)

		df["all_keys"] = (
			df["keywords"] + df["cast"] * CAST_WT + df["crew"] * CREW_WT + df["genres"]
		)
		df["all_keys"] = df["all_keys"].apply(
			lambda keywords: " ".join(keywords) if isinstance(keywords, list) else str()
		)

		return df