Python stop_words.get_stop_words() Examples

The following are 17 code examples of stop_words.get_stop_words(). You can vote up the examples you find useful or vote down those you don't, and go to the original project or source file by following the link above each example. You may also want to check out all available functions/classes of the module stop_words, or try the search function.
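For orientation before the examples, here is a minimal usage sketch of the module, following the python-stop-words README: get_stop_words accepts both ISO codes and full language names, and safe_get_stop_words is the variant that returns an empty list instead of raising StopWordError for an unsupported language.

from stop_words import get_stop_words, safe_get_stop_words

# ISO code and full language name yield the same list.
en_stop = get_stop_words('en')        # equivalent to get_stop_words('english')
print(len(en_stop))                   # size of the English stop word list

# The non-raising variant: unsupported languages give an empty list.
print(safe_get_stop_words('klingon'))  # []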
Example #1
Source File: lex_sem_ft.py    From DL-text with MIT License    6 votes
# Module-level imports assumed by this snippet; in the source file,
# `tokenizer` is a word tokenizer such as nltk's RegexpTokenizer(r'\w+').
from gensim import corpora, models
from stop_words import get_stop_words

def LDA_train(doc):
    red = []
    en_stop = get_stop_words('en')
    for d in doc:
        try:
            raw = d.lower()
            tokens = tokenizer.tokenize(raw)
            stopped_tokens = [i for i in tokens if i not in en_stop]
            red.append(stopped_tokens)
        except Exception:
            # Skip documents that cannot be lowercased or tokenized.
            continue
    print("Forming Dictionary.....")
    dictionary = corpora.Dictionary(red)
    print("Forming Corpus.....")
    corpus = [dictionary.doc2bow(text) for text in red]
    print("Training Model.....")
    lda = models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=1)
    return lda

# Returns the average probability of a word being present in the LDA model for an input document (returns float):
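As a rough invocation sketch for LDA_train (assuming nltk and gensim are installed, with the imports shown above; docs is a hypothetical list of raw strings):

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')  # the word-level tokenizer assumed by LDA_train

docs = ["The quick brown fox jumps over the lazy dog.",
        "Stop words are removed before topic modelling."]
lda = LDA_train(docs)
print(lda.show_topics(num_topics=2))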
Example #2
Source File: lex_sem_ft.py    From DeepLearn with MIT License    6 votes
# Same module-level dependencies as Example #1 (gensim's corpora/models,
# get_stop_words, and a word-level `tokenizer`).
def LDA_train(doc):
    red = []
    en_stop = get_stop_words('en')
    for d in doc:
        try:
            raw = d.lower()
            tokens = tokenizer.tokenize(raw)
            stopped_tokens = [i for i in tokens if i not in en_stop]
            red.append(stopped_tokens)
        except Exception:
            # Skip documents that cannot be lowercased or tokenized.
            continue
    print("Forming Dictionary.....")
    dictionary = corpora.Dictionary(red)
    print("Forming Corpus.....")
    corpus = [dictionary.doc2bow(text) for text in red]
    print("Training Model.....")
    lda = models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=1)
    return lda

# Returns the average probability of a word being present in the LDA model for an input document (returns float):
Example #3
Source File: scrape2.py    From Web-Scraping with MIT License    6 votes
from stop_words import get_stop_words  # module-level import in scrape2.py

def clean_up_words(words):
    # clean_word is a helper defined elsewhere in scrape2.py.
    new_words = []
    pkg_stop_words = get_stop_words('en')
    my_stop_words = [
        'the',
        'is',
        'and',
        'thisfacebooktwitteremailredditprint',
        '',
        'reply',
        'likelike',
        'likeliked',
        'comments',
        'commenting',
        '/',
        '=',
    ]
    for word in words:
        word = word.lower()
        cleaned_word = clean_word(word)
        if cleaned_word not in my_stop_words and cleaned_word not in pkg_stop_words:
            new_words.append(cleaned_word)
    return new_words
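For a self-contained run of this example, here is a hedged sketch with a hypothetical stand-in for the clean_word helper (the real one lives elsewhere in scrape2.py):

import string

def clean_word(word):
    # Hypothetical helper: strip surrounding punctuation only.
    return word.strip(string.punctuation)

words = "The quick / brown fox, and the lazy dog =".split()
print(clean_up_words(words))  # e.g. ['quick', 'brown', 'fox', 'lazy', 'dog']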
Example #4
Source File: eval_utils.py    From dataset_agnostic_segmentation with MIT License    5 votes
# np is numpy; Timer and query_page_folder_phoc are helpers defined
# elsewhere in the project.
def phoc_spottig(it, folder, ignore_stop_words=False, iou_thresh=0.5, dont_load=False, use_gt_phoc=False, filter_small=False, logger=None, max_word_num=None):
    """
    Evaluate mAP for PHOC-based word spotting using an evaluation folder (containing *.json) produced by the main.py script.
    Assumes the folder contains json files with {'word_%d': {'gt': gt_box, 'text': word annotation, 'cover': IoU of predicted word with GT box,
                                                             'gt_phoc': PHOC for text, 'pre_phoc': predicted PHOC}}

    Performs two tasks:
        (1) Prepare query words
        (2) Call the query function on the query words
    """
    # Create all query words - based on test set ground truth
    if logger is None:
        logger = print
    qwords = []
    for page in it:
        # Some datasets (e.g. IAMDB) have words with bad annotations; those are ignored by the eval protocol
        words = page.get_good_words_and_boxes_idx()[1]
        qwords.extend(words)
    qwords = set(qwords)

    logger('Query Words %d' % len(qwords))
    if ignore_stop_words:
        # Remove English stop words from the query set
        qwords = qwords - set(get_stop_words('en'))
        logger('Without stop words %d' % len(qwords))

    if max_word_num is not None:
        # Subsample the queries for a partial evaluation
        if len(qwords) > max_word_num:
            idx = np.random.choice(range(len(qwords)), max_word_num, replace=False)
            qwords = set(np.array(list(qwords))[idx])
            logger('Sampled %d words' % len(qwords))

    qtimer = Timer()
    qtimer.tic()
    mAP, recall, accuracy = query_page_folder_phoc(qwords, folder, threshold=iou_thresh, dont_load=dont_load, use_gt_phoc=use_gt_phoc, filter_small=filter_small)
    logger('Finished after %d secs mAP %4.2f Recall %4.2f Accuracy %4.2f' % (qtimer.toc(), mAP*100, recall*100, accuracy*100))
    return
Example #5
Source File: lda_model_calculator.py    From moviegeek with MIT License    5 votes
def remove_stopwords(tokenized_data):
    en_stop = get_stop_words('en')

    stopped_tokens = [token for token in tokenized_data if token not in en_stop]
    return stopped_tokens
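A quick usage check with hypothetical tokens, assuming the English list includes the usual articles and pronouns:

print(remove_stopwords(['the', 'plot', 'of', 'this', 'film']))  # ['plot', 'film']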
Example #6
Source File: tests.py    From python-stop-words with BSD 3-Clause "New" or "Revised" License    5 votes
def test_filters(self):
    language = 'en'
    before = get_stop_words(language, False)
    letter = random.choice(random.choice(before))

    def remove_letter(stopwords, language):
        return [word for word in stopwords if letter not in word]
    stop_words.add_filter(remove_letter)
    after = get_stop_words(language, False)
    for stopword in after:
        self.assertFalse(letter in stopword)
    self.assertTrue(stop_words.remove_filter(remove_letter))
Example #7
Source File: tests.py    From python-stop-words with BSD 3-Clause "New" or "Revised" License    5 votes
def test_get_stop_words_install_issue(self):
    original_stop_words_dir = stop_words.STOP_WORDS_DIR
    stop_words.STOP_WORDS_DIR = 'not-existing-directory'
    self.assertRaises(StopWordError, get_stop_words, 'german')
    stop_words.STOP_WORDS_DIR = original_stop_words_dir
Example #8
Source File: tests.py    From python-stop-words with BSD 3-Clause "New" or "Revised" License    5 votes
def test_get_stop_words_unavailable_language(self):
    self.assertRaises(StopWordError, get_stop_words, 'sindarin')
Example #9
Source File: tests.py    From python-stop-words with BSD 3-Clause "New" or "Revised" License    5 votes
def test_get_stop_words_cache(self):
    self.assertFalse('french' in stop_words.STOP_WORDS_CACHE)
    sw = get_stop_words('fr')
    self.assertTrue('french' in stop_words.STOP_WORDS_CACHE)
    original_stop_words_dir = stop_words.STOP_WORDS_DIR
    stop_words.STOP_WORDS_DIR = 'not-existing-directory'
    # The cached list is served even though the directory is now invalid.
    self.assertEqual(sw, get_stop_words('french'))
    stop_words.STOP_WORDS_DIR = original_stop_words_dir
    try:
        get_stop_words('klingon')
    except StopWordError:
        pass
    # Failed lookups must not be cached.
    self.assertFalse('klingon' in stop_words.STOP_WORDS_CACHE)
Example #10
Source File: tests.py    From python-stop-words with BSD 3-Clause "New" or "Revised" License    5 votes
def test_get_stop_words_language_mapping(self):
    sw = get_stop_words('en')
    self.assertEqual(len(sw), self.number_of_english_stop_words)
    self.assertEqual(sw, get_stop_words('english'))
Example #11
Source File: tests.py    From python-stop-words with BSD 3-Clause "New" or "Revised" License    5 votes
def test_get_stop_words(self):
    sw = get_stop_words('english')
    self.assertEqual(len(sw), self.number_of_english_stop_words)
Example #12
Source File: index.py    From acl-anthology with Apache License 2.0    5 votes
def load_stopwords(language):
    # slugify is imported elsewhere in index.py; it normalizes each stop
    # word, and multi-part slugs are split into their components.
    return [t for w in get_stop_words(language) for t in slugify(w).split("-")]
Example #13
Source File: nlp.py    From GraphDash with Apache License 2.0    5 votes
def __init__(self, language):
    self._stop_words = set(stop_words.get_stop_words(language))
Example #14
Source File: scrape1.py    From Web-Scraping with MIT License    5 votes
def clean_up_words(words):
    # clean_word is a helper defined elsewhere in scrape1.py.
    new_words = []
    pkg_stop_words = get_stop_words('en')
    my_stop_words = ['the', 'is', 'and', 'thisfacebooktwitteremailredditprint']
    for word in words:
        word = word.lower()
        cleaned_word = clean_word(word)
        if cleaned_word not in my_stop_words and cleaned_word not in pkg_stop_words:
            new_words.append(cleaned_word)
    return new_words
Example #15
Source File: preprocessing.py    From TBBTCorpus with Apache License 2.0    5 votes
def __init__(self):
    self.episodeInfo = {}
    self.Info = []
    self.allTranscripts = {}
    self.vocabulary = collections.defaultdict(int)
    self.Stopwords = get_stop_words('en')
    self.impactActors = ["Leonard", "Sheldon", "Penny", "Howard", "Raj", "Amy", "Bernadette"]
Example #16
Source File: topic_extractor.py    From TBBTCorpus with Apache License 2.0    5 votes
def __remove_stop_words(self, docs):
    output = []
    en_stop = get_stop_words('en')  # fetch once rather than per document
    for doc in docs:
        stopped_tokens = [i for i in doc if i not in en_stop]
        output.append(stopped_tokens)
    return output
Example #17
Source File: instance_preprocessing.py    From sciwing with MIT License    5 votes
def __init__(self):
    self.stop_words = get_stop_words("en")