Python stop_words.get_stop_words() Examples

The following are 17 code examples of stop_words.get_stop_words(). You can vote up the examples you find useful or vote down those you don't, and go to the original project or source file by following the link above each example. You may also want to check out all available functions/classes of the module stop_words, or try the search function.
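For orientation before the examples, here is a minimal usage sketch of the module, following the python-stop-words README: get_stop_words accepts both ISO codes and full language names, and safe_get_stop_words is the variant that returns an empty list instead of raising StopWordError for an unsupported language.

from stop_words import get_stop_words, safe_get_stop_words

# ISO code and full language name yield the same list.
en_stop = get_stop_words('en')        # equivalent to get_stop_words('english')
print(len(en_stop))                   # size of the English stop word list

# The non-raising variant: unsupported languages give an empty list.
print(safe_get_stop_words('klingon'))  # []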
Example #1
Source File: lex_sem_ft.py    From DL-text with MIT License    6 votes
# Module-level imports assumed by this snippet; in the source file,
# `tokenizer` is a word tokenizer such as nltk's RegexpTokenizer(r'\w+').
from gensim import corpora, models
from stop_words import get_stop_words

def LDA_train(doc):
    red = []
    en_stop = get_stop_words('en')
    for d in doc:
        try:
            raw = d.lower()
            tokens = tokenizer.tokenize(raw)
            stopped_tokens = [i for i in tokens if i not in en_stop]
            red.append(stopped_tokens)
        except Exception:
            # Skip documents that cannot be lowercased or tokenized.
            continue
    print("Forming Dictionary.....")
    dictionary = corpora.Dictionary(red)
    print("Forming Corpus.....")
    corpus = [dictionary.doc2bow(text) for text in red]
    print("Training Model.....")
    lda = models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=1)
    return lda

# Returns the average probability of a word being present in the LDA model for an input document (returns float):
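As a rough invocation sketch for LDA_train (assuming nltk and gensim are installed, with the imports shown above; docs is a hypothetical list of raw strings):

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')  # the word-level tokenizer assumed by LDA_train

docs = ["The quick brown fox jumps over the lazy dog.",
        "Stop words are removed before topic modelling."]
lda = LDA_train(docs)
print(lda.show_topics(num_topics=2))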
Example #2
Source File: lex_sem_ft.py    From DeepLearn with MIT License    6 votes
# Same module-level dependencies as Example #1 (gensim's corpora/models,
# get_stop_words, and a word-level `tokenizer`).
def LDA_train(doc):
    red = []
    en_stop = get_stop_words('en')
    for d in doc:
        try:
            raw = d.lower()
            tokens = tokenizer.tokenize(raw)
            stopped_tokens = [i for i in tokens if i not in en_stop]
            red.append(stopped_tokens)
        except Exception:
            # Skip documents that cannot be lowercased or tokenized.
            continue
    print("Forming Dictionary.....")
    dictionary = corpora.Dictionary(red)
    print("Forming Corpus.....")
    corpus = [dictionary.doc2bow(text) for text in red]
    print("Training Model.....")
    lda = models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=1)
    return lda

# Returns the average probability of a word being present in the LDA model for an input document (returns float):
Example #3
Source File: scrape2.py    From Web-Scraping with MIT License    6 votes
from stop_words import get_stop_words  # module-level import in scrape2.py

def clean_up_words(words):
    # clean_word is a helper defined elsewhere in scrape2.py.
    new_words = []
    pkg_stop_words = get_stop_words('en')
    my_stop_words = [
        'the',
        'is',
        'and',
        'thisfacebooktwitteremailredditprint',
        '',
        'reply',
        'likelike',
        'likeliked',
        'comments',
        'commenting',
        '/',
        '=',
    ]
    for word in words:
        word = word.lower()
        cleaned_word = clean_word(word)
        if cleaned_word not in my_stop_words and cleaned_word not in pkg_stop_words:
            new_words.append(cleaned_word)
    return new_words
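For a self-contained run of this example, here is a hedged sketch with a hypothetical stand-in for the clean_word helper (the real one lives elsewhere in scrape2.py):

import string

def clean_word(word):
    # Hypothetical helper: strip surrounding punctuation only.
    return word.strip(string.punctuation)

words = "The quick / brown fox, and the lazy dog =".split()
print(clean_up_words(words))  # e.g. ['quick', 'brown', 'fox', 'lazy', 'dog']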
Example #4
Source File: eval_utils.py    From dataset_agnostic_segmentation with MIT License    5 votes
# np is numpy; Timer and query_page_folder_phoc are helpers defined
# elsewhere in the project.
def phoc_spottig(it, folder, ignore_stop_words=False, iou_thresh=0.5, dont_load=False, use_gt_phoc=False, filter_small=False, logger=None, max_word_num=None):
    """
    Evaluate mAP for PHOC-based word spotting using an evaluation folder (containing *.json) produced by the main.py script.
    Assumes the folder contains json files with {'word_%d': {'gt': gt_box, 'text': word annotation, 'cover': IoU of predicted word with GT box,
                                                             'gt_phoc': PHOC for text, 'pre_phoc': predicted PHOC}}

    Performs two tasks:
        (1) Prepare query words
        (2) Call the query function on the query words
    """
    # Create all query words - based on test set ground truth
    if logger is None:
        logger = print
    qwords = []
    for page in it:
        # Some datasets (e.g. IAMDB) have words with bad annotations; those are ignored by the eval protocol
        words = page.get_good_words_and_boxes_idx()[1]
        qwords.extend(words)
    qwords = set(qwords)

    logger('Query Words %d' % len(qwords))
    if ignore_stop_words:
        # Remove English stop words from the query set
        qwords = qwords - set(get_stop_words('en'))
        logger('Without stop words %d' % len(qwords))

    if max_word_num is not None:
        # Subsample the queries for a partial evaluation
        if len(qwords) > max_word_num:
            idx = np.random.choice(range(len(qwords)), max_word_num, replace=False)
            qwords = set(np.array(list(qwords))[idx])
            logger('Sampled %d words' % len(qwords))

    qtimer = Timer()
    qtimer.tic()
    mAP, recall, accuracy = query_page_folder_phoc(qwords, folder, threshold=iou_thresh, dont_load=dont_load, use_gt_phoc=use_gt_phoc, filter_small=filter_small)
    logger('Finished after %d secs mAP %4.2f Recall %4.2f Accuracy %4.2f' % (qtimer.toc(), mAP*100, recall*100, accuracy*100))
    return
Example #5
Source File: lda_model_calculator.py    From moviegeek with MIT License    5 votes
def remove_stopwords(tokenized_data):
    en_stop = get_stop_words('en')

    stopped_tokens = [token for token in tokenized_data if token not in en_stop]
    return stopped_tokens
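A quick usage check with hypothetical tokens, assuming the English list includes the usual articles and pronouns:

print(remove_stopwords(['the', 'plot', 'of', 'this', 'film']))  # ['plot', 'film']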
Example #6
Source File: tests.py    From python-stop-words with BSD 3-Clause "New" or "Revised" License    5 votes
def test_filters(self):
    language = 'en'
    before = get_stop_words(language, False)
    letter = random.choice(random.choice(before))

    def remove_letter(stopwords, language):
        return [word for word in stopwords if letter not in word]
    stop_words.add_filter(remove_letter)
    after = get_stop_words(language, False)
    for stopword in after:
        self.assertFalse(letter in stopword)
    self.assertTrue(stop_words.remove_filter(remove_letter))
Example #7
Source File: tests.py    From python-stop-words with BSD 3-Clause "New" or "Revised" License    5 votes
def test_get_stop_words_install_issue(self):
    original_stop_words_dir = stop_words.STOP_WORDS_DIR
    stop_words.STOP_WORDS_DIR = 'not-existing-directory'
    self.assertRaises(StopWordError, get_stop_words, 'german')
    stop_words.STOP_WORDS_DIR = original_stop_words_dir
Example #8
Source File: tests.py    From python-stop-words with BSD 3-Clause "New" or "Revised" License    5 votes
def test_get_stop_words_unavailable_language(self):
    self.assertRaises(StopWordError, get_stop_words, 'sindarin')
Example #9
Source File: tests.py    From python-stop-words with BSD 3-Clause "New" or "Revised" License    5 votes
def test_get_stop_words_cache(self):
    self.assertFalse('french' in stop_words.STOP_WORDS_CACHE)
    sw = get_stop_words('fr')
    self.assertTrue('french' in stop_words.STOP_WORDS_CACHE)
    original_stop_words_dir = stop_words.STOP_WORDS_DIR
    stop_words.STOP_WORDS_DIR = 'not-existing-directory'
    # The cached list is served even though the directory is now invalid.
    self.assertEqual(sw, get_stop_words('french'))
    stop_words.STOP_WORDS_DIR = original_stop_words_dir
    try:
        get_stop_words('klingon')
    except StopWordError:
        pass
    # Failed lookups must not be cached.
    self.assertFalse('klingon' in stop_words.STOP_WORDS_CACHE)
Example #10
Source File: tests.py    From python-stop-words with BSD 3-Clause "New" or "Revised" License    5 votes
def test_get_stop_words_language_mapping(self):
    sw = get_stop_words('en')
    self.assertEqual(len(sw), self.number_of_english_stop_words)
    self.assertEqual(sw, get_stop_words('english'))
Example #11
Source File: tests.py    From python-stop-words with BSD 3-Clause "New" or "Revised" License    5 votes
def test_get_stop_words(self):
    sw = get_stop_words('english')
    self.assertEqual(len(sw), self.number_of_english_stop_words)
Example #12
Source File: index.py    From acl-anthology with Apache License 2.0    5 votes
def load_stopwords(language):
    # slugify is imported elsewhere in index.py; it normalizes each stop
    # word, and multi-part slugs are split into their components.
    return [t for w in get_stop_words(language) for t in slugify(w).split("-")]
Example #13
Source File: nlp.py    From GraphDash with Apache License 2.0    5 votes
def __init__(self, language):
    self._stop_words = set(stop_words.get_stop_words(language))
Example #14
Source File: scrape1.py    From Web-Scraping with MIT License    5 votes
def clean_up_words(words):
    # clean_word is a helper defined elsewhere in scrape1.py.
    new_words = []
    pkg_stop_words = get_stop_words('en')
    my_stop_words = ['the', 'is', 'and', 'thisfacebooktwitteremailredditprint']
    for word in words:
        word = word.lower()
        cleaned_word = clean_word(word)
        if cleaned_word not in my_stop_words and cleaned_word not in pkg_stop_words:
            new_words.append(cleaned_word)
    return new_words
Example #15
Source File: preprocessing.py    From TBBTCorpus with Apache License 2.0    5 votes
def __init__(self):
    self.episodeInfo = {}
    self.Info = []
    self.allTranscripts = {}
    self.vocabulary = collections.defaultdict(int)
    self.Stopwords = get_stop_words('en')
    self.impactActors = ["Leonard", "Sheldon", "Penny", "Howard", "Raj", "Amy", "Bernadette"]
Example #16
Source File: topic_extractor.py    From TBBTCorpus with Apache License 2.0    5 votes
def __remove_stop_words(self, docs):
    output = []
    en_stop = get_stop_words('en')  # fetch once rather than per document
    for doc in docs:
        stopped_tokens = [i for i in doc if i not in en_stop]
        output.append(stopped_tokens)
    return output
Example #17
Source File: instance_preprocessing.py    From sciwing with MIT License    5 votes
def __init__(self):
    self.stop_words = get_stop_words("en")