Python nltk.corpus.reuters.fileids() Examples

The following are 3 code examples of nltk.corpus.reuters.fileids(), taken from open-source projects. You may also want to check out the other functions and classes available in the module nltk.corpus.reuters.
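Before diving into the examples, here is a minimal sketch of what fileids() returns (it assumes the corpus has been fetched with nltk.download('reuters'); the exact ids printed depend on your installation):

from nltk.corpus import reuters

# File identifiers are plain strings prefixed with the corpus split,
# e.g. 'training/9865' or 'test/14826'.
all_ids = reuters.fileids()
print(len(all_ids), all_ids[:3])

# fileids() also accepts a category filter, mirroring reuters.categories().
grain_ids = reuters.fileids(categories='grain')

# Each fileid can then be passed to words() or raw() to retrieve the document.
print(reuters.words(all_ids[0])[:10])

All three examples below build on exactly this pattern: iterate over fileids() and tokenize each document with reuters.words().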
Example #1
Source File: setuptextcorpus.py    From rnn-speech with MIT License
from nltk.corpus import reuters

def get_corpus_text():
    '''
    Return the raw text of the Reuters corpus, one string per document.
    '''
    return [" ".join(reuters.words(fid)) for fid in reuters.fileids()]
Example #2
Source File: preprocessing.py    From Sentiment-Analysis-in-Event-Driven-Stock-Price-Movement-Prediction with MIT License
import json
import operator
import os
import nltk
from nltk.corpus import reuters

def gen_financial_top_words(maxN=40000): # generate a top-word vocabulary from Reuters news
    if not os.path.isfile('./input/topWords.json'):
        stopwords = set(nltk.corpus.stopwords.words('english'))  # build the set once, not per token
        wordCnt = {}
        for fileid in reuters.fileids():
            for word in reuters.words(fileid):
                word = unify_word(word)  # unify_word is a project-specific normalizer
                if word in stopwords:
                    continue
                wordCnt[word] = wordCnt.get(word, 0) + 1

        sorted_wordCnt = sorted(wordCnt.items(), key=operator.itemgetter(1), reverse=True)
        # keep only the maxN most frequent words and write them to disk
        wordCnt = dict(sorted_wordCnt[:maxN])
        with open('./input/topWords.json', 'w') as fout:
            json.dump(wordCnt, fout, indent=4)
    else:
        return  # vocabulary file already exists
Example #3
Source File: word_embedding.py    From Sentiment-Analysis-in-Event-Driven-Stock-Price-Movement-Prediction with MIT License
import operator
from nltk.corpus import reuters

def get_reuters_data(n_vocab):
    # return variables
    sentences = []
    word2idx = {'START': 0, 'END': 1}
    idx2word = ['START', 'END']
    current_idx = 2
    # START/END get infinite counts so they always survive the vocabulary cut below
    word_idx_count = {0: float('inf'), 1: float('inf')}
    tag = 0
    for fileid in reuters.fileids():
        sentence = reuters.words(fileid)
        tokens = [unify_word(t) for t in sentence]  # unify_word is a project-specific normalizer
        for t in tokens:
            if t not in word2idx:
                word2idx[t] = current_idx
                idx2word.append(t)
                current_idx += 1
            idx = word2idx[t]
            word_idx_count[idx] = word_idx_count.get(idx, 0) + 1
        sentence_by_idx = [word2idx[t] for t in tokens]
        sentences.append(sentence_by_idx)
        tag += 1
        print(tag)  # simple progress counter over processed documents

    # restrict vocab size
    sorted_word_idx_count = sorted(word_idx_count.items(), key=operator.itemgetter(1), reverse=True)
    word2idx_small = {}
    new_idx = 0
    idx_new_idx_map = {}
    for idx, count in sorted_word_idx_count[:n_vocab]:
        word = idx2word[idx]
        print(word, count)
        word2idx_small[word] = new_idx
        idx_new_idx_map[idx] = new_idx
        new_idx += 1
    # let 'UNKNOWN' be the last token
    word2idx_small['UNKNOWN'] = new_idx
    unknown = new_idx

    # map old idx to new idx
    sentences_small = []
    for sentence in sentences:
        if len(sentence) > 1:
            new_sentence = [idx_new_idx_map[idx] if idx in idx_new_idx_map else unknown for idx in sentence]
            sentences_small.append(new_sentence)

    return sentences_small, word2idx_small