"""Loaders for the line-oriented text files stored under ``datasets/``.

Every loader resolves its data files relative to the directory that
contains this module (``CURDIR``), not the current working directory.
"""
import codecs
import itertools
import os

import nltk
# Not used by the functions below; kept because other parts of the project
# may import these names from this module.
from toolz.functoolz import compose  # noqa: F401

try:
    import cPickle as pickle  # noqa: F401  (Python 2)
except ImportError:
    import pickle  # noqa: F401  (Python 3 has no cPickle module)

# Absolute directory of this file, with symlinks resolved.
CURDIR = os.path.dirname(os.path.realpath(__file__))


def load_line_corpus(path, tokenize=True):
    """Read a UTF-8 text file, treating each line as one document.

    Args:
        path: Path of the file to read.
        tokenize: When true (the default) each line is stripped,
            lower-cased, split into sentences with ``nltk.sent_tokenize``
            and each sentence into words with ``nltk.word_tokenize``; the
            document is the flat list of all its word tokens.  When false
            the document is just the stripped line, with case preserved.

    Returns:
        A list with one entry per line of the file: a list of tokens when
        ``tokenize`` is true, otherwise a string.
    """
    docs = []
    with codecs.open(path, "r", "utf8") as f:
        for line in f:
            text = line.strip()
            if tokenize:
                sentences = nltk.sent_tokenize(text.lower())
                # Flatten the per-sentence token lists into one document.
                docs.append(list(itertools.chain.from_iterable(
                    nltk.word_tokenize(sentence) for sentence in sentences)))
            else:
                docs.append(text)
    return docs


def load_nips(years=None, raw=False):
    """Load the ``nips-<year>.dat`` files from the ``datasets`` directory.

    Args:
        years: Iterable of years to load.  When omitted or empty, the
            years 2008 through 2014 (inclusive) are loaded.
        raw: When true, documents are returned as untokenized strings;
            otherwise they are tokenized (see ``load_line_corpus``).

    Returns:
        A single list holding the documents of all requested years, in
        the order the years were given.
    """
    if not years:
        # range() instead of xrange() so this also runs on Python 3; for a
        # span this small the difference on Python 2 is irrelevant.
        years = range(2008, 2015)

    docs = []
    for year in years:
        path = os.path.join(CURDIR, 'datasets', 'nips-{}.dat'.format(year))
        docs.extend(load_line_corpus(path, tokenize=(not raw)))
    return docs


def load_lemur_stopwords():
    """Return the entries of ``datasets/lemur-stopwords.txt`` as a list.

    The file is decoded as UTF-8 and each line is stripped of surrounding
    whitespace.

    Returns:
        A list of strings, one per line of the file.
    """
    path = os.path.join(CURDIR, 'datasets', 'lemur-stopwords.txt')
    # The mode and encoding must be separate arguments: written as
    # 'r' 'utf8' the literals concatenate into the mode string 'rutf8' and
    # no encoding is applied at all.
    with codecs.open(path, 'r', 'utf8') as f:
        # Build a real list so the result is the same type on Python 2 and 3.
        return [entry.strip() for entry in f.readlines()]