Python nltk.stem.snowball.SnowballStemmer() Examples

The following are 30 code examples of nltk.stem.snowball.SnowballStemmer(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk.stem.snowball, or try the search function.
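Before the project examples, here is a minimal, self-contained sketch of the basic API (the outputs in the comments are typical of the English Snowball stemmer; the constructor arguments mirror those used in the examples below):

from nltk.stem.snowball import SnowballStemmer

# Supported languages are exposed as a class attribute.
print(SnowballStemmer.languages)  # ('arabic', 'danish', 'dutch', 'english', ...)

stemmer = SnowballStemmer('english')
stemmer_sw = SnowballStemmer('english', ignore_stopwords=True)  # leaves stop words unstemmed

print(stemmer.stem('running'))     # 'run'
print(stemmer.stem('generously'))  # 'generous'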
Example #1
Source File: topic_modeler.py    From Artificial-Intelligence-with-Python with MIT License 6 votes
def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer 
    stemmer = SnowballStemmer('english')

    # Get the list of stop words 
    stop_words = stopwords.words('english')
    
    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words 
    tokens = [x for x in tokens if x not in stop_words]
    
    # Perform stemming on the tokenized words 
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed 
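A hypothetical call of process() (assuming the RegexpTokenizer, SnowballStemmer and stopwords imports of the surrounding module, and that the NLTK stopwords corpus is downloaded) shows the shape of the result: lower-cased, stop-word-free, stemmed tokens.

# Hypothetical usage of process() defined above.
tokens = process("The children were running towards the old houses")
print(tokens)  # a list of lower-cased, stemmed tokens with English stop words removed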
Example #2
Source File: texttools.py    From spice-hate_speech_detection with MIT License 6 votes
def stemming_message_snowball(message, stemmings_to_words=dict()):
    from nltk.stem.snowball import SnowballStemmer
    from nltk.tokenize import casual_tokenize
    stemmer = SnowballStemmer('finnish')

    if message is None:
        return '', stemmings_to_words

    message = message.replace('#', '')  # replace() returns a new string, so reassign to actually drop the '#'

    stemmed_message = []

    for word in casual_tokenize(message):

        stemmed_word = stemmer.stem(word.lower())
        stemmed_message.append(stemmed_word)
        stemmings_to_words[stemmed_word] = word

    stemmed_message = ' '.join(stemmed_message)

    return stemmed_message, stemmings_to_words 
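The helper returns both the stemmed message and a stem-to-original-word mapping, so the same dictionary can be passed back in to accumulate mappings across many messages; a small usage sketch (the Finnish input text is an arbitrary example):

# Hypothetical usage: reuse the dict across messages to accumulate stem -> word mappings.
mapping = {}
stemmed, mapping = stemming_message_snowball("Hyvää huomenta #kaikille", mapping)
print(stemmed)  # space-joined Finnish stems
print(mapping)  # keys are stems, values are the last original word that produced them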
Example #3
Source File: test_stem.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 6 votes
def test_german(self):
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

        assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
        assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'

        assert stemmer_german.stem("keinen") == 'kein'
        assert stemmer_german2.stem("keinen") == 'keinen' 
Example #4
Source File: test_stem.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 6 votes
def test_russian(self):
        stemmer_russian = SnowballStemmer("russian")
        assert stemmer_russian.stem("авантненькая") == "авантненьк" 
Example #5
Source File: test_stem.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 6 votes
def test_arabic(self):
        """
        Unit test for the Snowball Arabic light stemmer.
        This stemmer deals with prefixes and suffixes.
        """
        # Test where the ignore_stopwords=True.
        ar_stemmer = SnowballStemmer("arabic", True)
        assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
        assert ar_stemmer.stem("العربية") == "عرب"
        assert ar_stemmer.stem("فقالوا") == "قال"
        assert ar_stemmer.stem("الطالبات") == "طالب"
        assert ar_stemmer.stem("فالطالبات") == "طالب"
        assert ar_stemmer.stem("والطالبات") == "طالب"
        assert ar_stemmer.stem("الطالبون") == "طالب"
        assert ar_stemmer.stem("اللذان") == "اللذان"
        assert ar_stemmer.stem("من") == "من"
        # Test where the ignore_stopwords=False.
        ar_stemmer = SnowballStemmer("arabic", False)
        assert ar_stemmer.stem("اللذان") == "اللذ"  # this is a stop word
        assert ar_stemmer.stem("الطالبات") == "طالب"
        assert ar_stemmer.stem("الكلمات") == "كلم"
        # Test where the Arabic stemmer is created without passing ignore_stopwords.
        ar_stemmer = SnowballStemmer("arabic")
        assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
        assert ar_stemmer.stem("العربية") == "عرب"
        assert ar_stemmer.stem("فقالوا") == "قال"
        assert ar_stemmer.stem("الطالبات") == "طالب"
        assert ar_stemmer.stem("الكلمات") == "كلم" 
Example #6
Source File: stemmer.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 6 votes
def nltk_stemmer(stemmer, token, i=None, tokens=None):
    """Wrapper around a NLTK SnowballStemmer, which includes stop words for
    each language.

    Args:
        stemmer (SnowballStemmer): Stemmer instance that performs the stemming.
        token (lunr.Token): The token to stem.
        i (int): The index of the token in a set.
        tokens (list): A list of tokens representing the set.
    """

    def wrapped_stem(token, metadata=None):
        return stemmer.stem(token)

    return token.update(wrapped_stem) 
Example #7
Source File: stemmer.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 6 votes
def get_language_stemmer(language):
    """Retrieves the SnowballStemmer for a particular language.

    Args:
        language (str): ISO-639-1 code of the language.
    """
    from lunr.languages import SUPPORTED_LANGUAGES
    from nltk.stem.snowball import SnowballStemmer

    return SnowballStemmer(SUPPORTED_LANGUAGES[language]) 
Example #8
Source File: concept_based.py    From acl2017-interactive_summarizer with Apache License 2.0 6 votes
def __init__(self, input_directory, language):
        """
        Args:
            input_directory (str): the directory from which text documents to
              be summarized are loaded.
            language (str): the language of the documents, used to pick the
              stemmer and the stop word list.
        """
        self.input_directory = input_directory
        self.sentences = []
        self.weights = {}
        self.c2s = defaultdict(set)
        self.concept_sets = defaultdict(frozenset)
        self.LANGUAGE = language
        # type: str

        self.stoplist = set(stopwords.words(self.LANGUAGE))
        self.stemmer = SnowballStemmer(self.LANGUAGE)

        self.word_frequencies = defaultdict(int)
        self.w2s = defaultdict(set) 
Example #9
Source File: run.py    From themarketingtechnologist with Apache License 2.0 6 votes
def tokenize(text):
        """
        Tokenizes sequences of text and stems the tokens.
        :param text: String to tokenize
        :return: List with stemmed tokens
        """
        tokens = nltk.WhitespaceTokenizer().tokenize(text)
        tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
        tokens = [word for word in tokens if word not in stopwords.words('english')]
        tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))
        stems = []
        stemmer = SnowballStemmer("english")
        for token in tokens:
            token = stemmer.stem(token)
            if token != "":
                stems.append(token)
        return stems 
Example #10
Source File: language_parser.py    From cvscan with MIT License 6 votes
def clean_resume(resume_text):

  cleaned_resume = []

  # replacing newlines and punctuations with space
  resume_text = resume_text.replace('\t', ' ').replace('\n', ' ')
  for punctuation in string.punctuation:
    resume_text = resume_text.replace(punctuation, ' ')
  resume_text = resume_text.split()

  # removing stop words (stemming is set up but currently disabled)
  stemmer = SnowballStemmer("english")
  for word in resume_text:
    if word not in stopwords.words('english') and not word.isdigit():
      cleaned_resume.append(word.lower())  # swap in stemmer.stem(word) to enable stemming

  cleaned_resume = ' '.join(cleaned_resume)
  return cleaned_resume 
Example #11
Source File: converter.py    From atap with Apache License 2.0 6 votes
def conversion(source, dest):
    """
    :param source: the unit of measure you have
    :param dest: the unit of measure you need to convert to
    :return: a tuple (conversion factor or None, stemmed source, stemmed dest)
    """
    stemmer = SnowballStemmer('english')
    source = stemmer.stem(source)
    dest = stemmer.stem(dest)

    try:
        units = conv_dict.get(source).get('Units')[
            conv_dict.get(source).get('Destination').index(dest)
        ]
    except (AttributeError, ValueError):
        # unknown source unit, or dest not reachable from source
        units = None

    return units, source, dest 
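conv_dict itself is not part of this snippet; the lookups imply a mapping from a stemmed source unit to parallel 'Destination' and 'Units' lists. A hypothetical structure consistent with those lookups:

# Hypothetical structure (not from the original source): keys are stemmed unit names,
# 'Destination' and 'Units' are parallel lists of target units and conversion factors.
conv_dict = {
    'meter': {'Destination': ['feet', 'inch'], 'Units': [3.28084, 39.3701]},
    'feet':  {'Destination': ['meter'], 'Units': [0.3048]},
}

units, source, dest = conversion('meters', 'feet')
print(units, source, dest)  # 3.28084 meter feet -- 'meters' is stemmed to 'meter' before the lookup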
Example #12
Source File: upper_bound_ilp.py    From acl2017-interactive_summarizer with Apache License 2.0 5 votes
def __init__(self, language):
        self.sentences = []
        self.docs = []
        self.models = []
        self.doc_sent_dict = {}
        self.ref_ngrams = []
        self.LANGUAGE = language
        self.stemmer = SnowballStemmer(self.LANGUAGE)
        self.stoplist = set(stopwords.words(self.LANGUAGE)) 
Example #13
Source File: sume_wrap.py    From acl2017-interactive_summarizer with Apache License 2.0 5 votes
def __init__(self, language):
        self.s = sume.ConceptBasedILPSummarizer(" ", language)
        self.LANGUAGE = language
        self.stoplist = set(stopwords.words(self.LANGUAGE))
        self.stemmer = SnowballStemmer(self.LANGUAGE) 
Example #14
Source File: simulated_feedback.py    From acl2017-interactive_summarizer with Apache License 2.0 5 votes
def __init__(self, language, rouge, embeddings={}, fvector=[], ngrams_size=2, top_n=100, dump_base_dir=tempfile.mkdtemp(prefix="simufee-")):
        '''
        Initialize the docs and models structure
        '''
        self.Oracle = Oracle()  # oracle
        self.SumeWrap = SumeWrap(language) # only used to load the sentences and push them into self.summarizer
        self.summarizer = sume.ConceptBasedILPSummarizer(" ", language)
        self.N = ngrams_size # how many words the ngrams should consist of
        self.top_n = top_n  # currently unused
        self.ref_ngrams = set() # set of ngrams that are in the reference summaries (for the feedback to peek)
        self.ref_phrases = set() # set of phrases that are in the reference summaries (for the feedback to peek)

        self.flight_recorder = FlightRecorder()  # The flight-recorder stores all interactions w.r.t. concepts (e.g. accepted and rejected)

        self.info_data = [] # stats for the pipeline. The only thing that leaves this class
        self.initial_weights = {} # oracle reweighting
        self.language = language # document language. relevant for stemmer, embeddings, stopwords, parsing
        #self.stemmer = SnowballStemmer(self.language)
        if self.language == "english":
            self.stemmer = SnowballStemmer(self.language)
            #self.stemmer = WordNetLemmatizer()
        else:
            self.stemmer = SnowballStemmer(self.language)
        self.stoplist = set(stopwords.words(self.language))
        self.rouge = rouge
        self.cluster_size = 0.0
        self.embeddings = embeddings # word2vec embeddings
        self.fvector = fvector # List of support vectors for active learning SVM
        self.pos_hash = {} # active learning // SVM
        self.concept_vec_idx = {} # active learning // SVM
        self.index_vec_concept = {} # active learning // SVM

        ### previously uninitialized fields...
        self.data = None # np.array(self.fvector)   # active learning // SVM TODO rename self.data to something that contains svm...
        self.labels = None # active learning // SVM
        self.MAX_WEIGHT = None # int with # of documents (i.e. largest possible DF value)
        self.models = None # reference summaries, only needed for rouge score (as they are merged into one large summary)
        self.parse_type = None # None or "parse"
        self.prev_score = None # rouge scores of previous iteration.
        self.score = None # rouge scores of current iteration.
        self.summary_length = None # target summary length.
        self.ub_score = None # rouge scores of upper bound
        self.uncertainity = {} # active learning // SVM

        # graph based propagation settings
        self.graph = PageRankFeedbackGraph(self.stemmer, self.language)
        # self.graph = SimpleNgramFeedbackGraph(self.stemmer, self.language, N=5)
        self.debug_dump_target_dir = dump_base_dir
        self.allowed_number_of_feedback_per_iteration=5 
Example #15
Source File: preprocessing.py    From TextRank with MIT License 5 votes
def __init__(self):
        self.STOPWORDS = TextProcessor.__load_stopwords(path="../stopwords.txt")
        self.LEMMATIZER = WordNetLemmatizer()
        self.STEMMER = SnowballStemmer("english")
        self.PUNCTUATION = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
        self.NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
        self.PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)
Example #16
Source File: data.py    From HOTT with MIT License 5 votes
def reduce_vocab(bow_data, vocab, embed_vocab, embed_aggregate='mean'):
    """Reduce vocabulary size by stemming and removing stop words.
    """
    vocab = np.array(vocab)
    short = np.array([len(w) > 2 for w in vocab])
    stop_words = set(stopwords.words('english'))
    stop = np.array([w not in stop_words for w in vocab])
    reduced_vocab = vocab[np.logical_and(short, stop)]
    reduced_bow_data = bow_data[:, np.logical_and(short, stop)]
    stemmer = SnowballStemmer("english")
    stemmed_dict = {}
    stemmed_idx_mapping = {}
    stemmed_vocab = []
    for i, w in enumerate(reduced_vocab):
        stem_w = stemmer.stem(w)
        if stem_w in stemmed_vocab:
            stemmed_dict[stem_w].append(w)
            stemmed_idx_mapping[stemmed_vocab.index(stem_w)].append(i)
        else:
            stemmed_dict[stem_w] = [w]
            stemmed_vocab.append(stem_w)
            stemmed_idx_mapping[stemmed_vocab.index(stem_w)] = [i]

    stemmed_bow_data = np.zeros((bow_data.shape[0], len(stemmed_vocab)),
                                dtype=int)  # np.int is deprecated/removed in recent NumPy
    for i in range(len(stemmed_vocab)):
        stemmed_bow_data[:, i] = reduced_bow_data[:, stemmed_idx_mapping[i]].sum(axis=1).flatten()

    word_counts = stemmed_bow_data.sum(axis=0)
    stemmed_reduced_vocab = np.array(stemmed_vocab)[word_counts > 2].tolist()
    stemmed_reduced_bow_data = stemmed_bow_data[:, word_counts > 2]

    stemmed_reduced_embed_vocab = {}
    for w in stemmed_reduced_vocab:
        old_w_embed = [embed_vocab[w_old] for w_old in stemmed_dict[w]]
        if embed_aggregate == 'mean':
            new_w_embed = np.mean(old_w_embed, axis=0)
        elif embed_aggregate == 'first':
            new_w_embed = old_w_embed[0]
        else:
            print('Unknown embedding aggregation')
            break
        stemmed_reduced_embed_vocab[w] = new_w_embed

    return (stemmed_reduced_vocab,
            stemmed_reduced_embed_vocab,
            stemmed_reduced_bow_data) 
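A small synthetic usage sketch of reduce_vocab (assuming the snippet's module-level imports of numpy, the NLTK stopwords and SnowballStemmer; the embedding vectors are random placeholders and only their shape matters):

import numpy as np

# Toy inputs: 2 documents over a 4-word vocabulary.
vocab = ['running', 'runs', 'the', 'catlike']
bow_data = np.array([[2, 1, 3, 1],
                     [1, 2, 0, 4]])
embed_vocab = {w: np.random.rand(5) for w in vocab}  # placeholder 5-d embeddings

new_vocab, new_embed, new_bow = reduce_vocab(bow_data, vocab, embed_vocab)
print(new_vocab)      # stems such as 'run'; 'the' is dropped as a stop word
print(new_bow.shape)  # (2, len(new_vocab)) -- counts of merged word forms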
Example #17
Source File: test_stem.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International 5 votes
def test_short_strings_bug(self):
        stemmer = SnowballStemmer('english')
        assert stemmer.stem("y's") == 'y' 
Example #18
Source File: parse_out_email_text.py    From machine-learning with GNU General Public License v3.0 5 votes
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words
        in the email (space-separated) 
        
        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
        
        """

    f.seek(0)  # go back to beginning of file (annoying)
    all_text = f.read()
    # split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        # remove punctuation
        # strip punctuation (str.maketrans replaces the Python 2-only string.maketrans call)
        text_string = content[1].translate(str.maketrans("", "", string.punctuation))
        
        # project part 2: comment out the line below
        #words = text_string

        # split the text string into individual words, stem each word,
        stemmer = SnowballStemmer('english')
        text_string = text_string.split() #makes a list of words
        for i in range(len(text_string)):
            text_string[i] = stemmer.stem(text_string[i])
        # and append the stemmed word to words (make sure there's a single
        # space between each stemmed word)
            
        words = " ".join(text_string)# this -> " " ensures space b/w words        

    return words 
Example #19
Source File: test_stem.py    From razzy-spinner with GNU General Public License v3.0 5 votes
def test_russian(self):
        # Russian words both consisting of Cyrillic
        # and Roman letters can be stemmed.
        stemmer_russian = SnowballStemmer("russian")
        assert stemmer_russian.stem("авантненькая") == "авантненьк"
        assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k" 
Example #20
Source File: topic_modeling.py    From Raspberry-Pi-3-Cookbook-for-Python-Programmers-Third-Edition with MIT License 5 votes
def __init__(self):
        # Create a regular expression tokenizer
        self.tokenizer = RegexpTokenizer(r'\w+')

        # get the list of stop words
        self.english_stop_words= stopwords.words('english')

        # Create a Snowball stemmer
        self.snowball_stemmer = SnowballStemmer('english')

    # Tokenizing, stop word removal, and stemming 
Example #21
Source File: Auto_NLP.py    From Auto_ViML with Apache License 2.0 5 votes
def tokenize_and_stem(text):
    stemmer = SnowballStemmer("english")
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
################################################################################ 
Example #22
Source File: topic_modeling.py    From Python-Machine-Learning-Cookbook-Second-Edition with MIT License 5 votes
def __init__(self):
        # Create a regular expression tokenizer
        self.tokenizer = RegexpTokenizer(r'\w+')

        # get the list of stop words 
        self.stop_words_english = stopwords.words('english')

        # Create a Snowball stemmer 
        self.stemmer = SnowballStemmer('english')
        
    # Tokenizing, stop word removal, and stemming 
Example #23
Source File: 2_train_and_eval_model.py    From support-tickets-classification with MIT License 5 votes
def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)]) 
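The surrounding class definition is not shown in this snippet; a minimal sketch (an assumption, not the project's actual code) of how such a stemming vectorizer is typically assembled on top of scikit-learn's CountVectorizer:

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token with Snowball."""

    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        return lambda doc: [stemmer.stem(w) for w in analyzer(doc)]

# Hypothetical usage:
vectorizer = StemmedCountVectorizer()
X = vectorizer.fit_transform(["tickets were escalated", "escalating a ticket"])
print(vectorizer.get_feature_names_out())  # stemmed vocabulary, e.g. 'escal', 'ticket', 'were'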
Example #24
Source File: helpers.py    From chirp with MIT License 5 votes
def cleaned_tokens(tokens):
    """Clean the tokens by removing stop words and stemming."""
    # stemmer = SnowballStemmer("english")
    # stemmed = [stemmer.stem(token) for token in tokens]
    s = set(stopwords.words('english'))
    tokens = [x.lower() for x in tokens if not x.isdigit()]
    return filter(lambda w: not w.lower() in s, tokens) 
Example #25
Source File: preprocessing.py    From Projects with MIT License 5 votes
def __init__(self,bigrams=True,min_df=3,stemming=True,tfidf=True):
        self.regex = re.compile('[^a-zA-Z ]')
        self.stop = set(stopwords.words('english'))
        self.stemmer = SnowballStemmer("english")
        self.bigrams = bigrams
        self.min_df = min_df
        self.stemming = stemming
        self.tfidf = tfidf 
Example #26
Source File: test.py    From Quora-Question-Pairs with MIT License 5 votes
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = True, remove_stopwords = True):
    
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used; there is no benefit in using both together.
    
    Output : Returns the processed text corpus
    
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    ''' All stopwords except the 'wh-' words are removed '''
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    
    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language = 'english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else :
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    
        
    return corpus 
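A hedged usage sketch of preprocess (calling it with cleaning=False so the text_clean helper, which is not shown here, is not needed; assumes the snippet's imports of stopwords, SnowballStemmer and PorterStemmer and that the NLTK stopwords corpus is available):

# Hypothetical usage of preprocess() defined above.
corpus = ["why are the questions on quora so often duplicated",
          "what makes two quora questions duplicates of each other"]

processed = preprocess(corpus, keep_list=[], cleaning=False,
                       stemming=True, stem_type='snowball',
                       lemmatization=False, remove_stopwords=True)
print(processed[0])  # e.g. ['why', 'question', 'quora', 'often', 'duplic'] -- wh-words kept, other stop words dropped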
Example #27
Source File: MaLSTM_train.py    From Quora-Question-Pairs with MIT License 5 votes
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = True, remove_stopwords = True):
    
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used; there is no benefit in using both together.
    
    Output : Returns the processed text corpus
    
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    ''' All stopwords except the 'wh-' words are removed '''
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    
    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language = 'english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else :
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    
    
    return corpus 
Example #28
Source File: test_stem.py    From razzy-spinner with GNU General Public License v3.0 5 votes
def test_short_strings_bug(self):
        stemmer = SnowballStemmer('english')
        assert stemmer.stem("y's") == 'y' 
Example #29
Source File: test_stem.py    From razzy-spinner with GNU General Public License v3.0 5 votes
def test_spanish(self):
        stemmer = SnowballStemmer('spanish')

        assert stemmer.stem("Visionado") == 'vision'

        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu' 
Example #30
Source File: test_stem.py    From razzy-spinner with GNU General Public License v3.0 5 votes
def test_german(self):
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

        assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
        assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'

        assert stemmer_german.stem("keinen") == 'kein'
        assert stemmer_german2.stem("keinen") == 'keinen'