Python gensim.utils.smart_open() Examples

The following are 17 code examples of gensim.utils.smart_open(), collected from open source projects. The source file and originating project are noted above each example. You may also want to check out all available functions/classes of the module gensim.utils, or try the search function.
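Before reading the examples, it helps to know what smart_open() does: it opens a path for reading or writing and transparently handles compression based on the file extension (.gz, .bz2), returning a file-like object that can be used as a context manager. In older gensim versions this was a small helper inside gensim.utils; newer versions delegate to the standalone smart_open package. A minimal sketch of typical usage, assuming a local file corpus.txt.bz2 exists (the path is hypothetical):

from gensim import utils

# smart_open() picks plain open(), gzip or bz2 based on the extension.
# It yields bytes by default (mode 'rb'), so decode when you need text.
with utils.smart_open('corpus.txt.bz2') as fin:
    for line in fin:
        print(utils.to_unicode(line).strip())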
Example #1
Source File: ldamallet.py    From topical_word_embeddings with MIT License
def load_word_topics(self):
        logger.info("loading assigned topics from %s" % self.fstate())
        wordtopics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)
        with utils.smart_open(self.fstate()) as fin:
            _ = next(fin)  # header
            self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
            assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
            _ = next(fin)  # beta
            for lineno, line in enumerate(fin):
                line = utils.to_unicode(line)
                doc, source, pos, typeindex, token, topic = line.split()
                tokenid = self.id2word.token2id[token] if hasattr(self.id2word, 'token2id') else int(token)
                wordtopics[int(topic), tokenid] += 1
        logger.info("loaded assigned topics for %i tokens" % wordtopics.sum())
        self.wordtopics = wordtopics
        self.print_topics(15) 
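For context, self.fstate() points at the state file MALLET writes with --output-state (gzip-compressed, which is why smart_open is used). The layout this parser expects is roughly: one header line, one alpha line, one beta line, then one topic assignment per token (values below are illustrative):

#doc source pos typeindex type topic
#alpha : 0.5 0.5 0.5
#beta : 0.01
0 NULL 0 118 computer 2
0 NULL 1 207 interface 0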
Example #2
Source File: dictionary.py    From topical_word_embeddings with MIT License
def save_as_text(self, fname, sort_by_word=True):
        """
        Save this Dictionary to a text file, in format:
        `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word,
        or by decreasing word frequency.

        Note: text format should be used for corpus inspection. Use `save`/`load`
        to store in binary format (pickle) for improved performance.
        """
        logger.info("saving dictionary mapping to %s" % fname)
        with utils.smart_open(fname, 'wb') as fout:
            if sort_by_word:
                for token, tokenid in sorted(iteritems(self.token2id)):
                    line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                    fout.write(utils.to_utf8(line))
            else:
                for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                    line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                    fout.write(utils.to_utf8(line)) 
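A quick usage sketch for the method above: it pairs with the Dictionary.load_from_text class method, which reads the same tab-separated layout back in (the path is hypothetical):

from gensim.corpora import Dictionary

docs = [['human', 'interface', 'computer'], ['graph', 'trees']]
dictionary = Dictionary(docs)
dictionary.save_as_text('/tmp/words.dict.txt')
reloaded = Dictionary.load_from_text('/tmp/words.dict.txt')
print(reloaded.token2id == dictionary.token2id)  # expected: True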
Example #3
Source File: hashdictionary.py    From topical_word_embeddings with MIT License
def save_as_text(self, fname):
        """
        Save this HashDictionary to a text file, for easier debugging.

        The format is:
        `id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`.

        Note: use `save`/`load` to store in binary format instead (pickle).
        """
        logger.info("saving HashDictionary mapping to %s" % fname)
        with utils.smart_open(fname, 'wb') as fout:
            for tokenid in self.keys():
                words = sorted(self[tokenid])
                if words:
                    words_df = [(word, self.dfs_debug.get(word, 0)) for word in words]
                    words_df = ["%s(%i)" % item for item in sorted(words_df, key=lambda item: -item[1])]
                    fout.write(utils.to_utf8("%i\t%i\t%s\n" %
                        (tokenid, self.dfs.get(tokenid, 0), '\t'.join(words_df))))
#endclass HashDictionary 
Example #4
Source File: ucicorpus.py    From topical_word_embeddings with MIT License
def __init__(self, input):
        """
        Initialize the reader.

        The `input` parameter refers to a file on the local filesystem,
        which is expected to be in the UCI Bag-of-Words format.
        """

        logger.info('Initializing corpus reader from %s' % input)

        self.input = input

        with utils.smart_open(self.input) as fin:
            self.num_docs = self.num_terms = self.num_nnz = 0
            try:
                self.num_docs = int(next(fin).strip())
                self.num_terms = int(next(fin).strip())
                self.num_nnz = int(next(fin).strip())
            except StopIteration:
                pass

        logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' %
            (self.num_docs, self.num_terms, self.num_nnz)) 
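The three integers read above are the standard UCI Bag-of-Words (docword) header: document count, vocabulary size, and number of non-zero entries; the rest of the file is one "docID wordID count" triple per line. An illustrative file:

2        <- num_docs
5        <- num_terms
3        <- num_nnz (non-zero docID/wordID pairs)
1 1 2
1 3 1
2 2 4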
Example #5
Source File: svmlightcorpus.py    From topical_word_embeddings with MIT License
def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
        """
        Save a corpus in the SVMlight format.

        The SVMlight `<target>` class tag is taken from the `labels` array, or set
        to 0 for all documents if `labels` is not supplied.

        This function is automatically called by `SvmLightCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        logger.info("converting corpus to SVMlight format: %s" % fname)

        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            for docno, doc in enumerate(corpus):
                label = labels[docno] if labels else 0 # target class is 0 by default
                offsets.append(fout.tell())
                fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label)))
        return offsets 
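As the docstring advises, reach this through serialize rather than calling save_corpus directly; serialize also writes the byte-offset index that enables random access later. A minimal sketch with a tiny in-memory corpus and a hypothetical path:

from gensim.corpora import SvmLightCorpus

bow_corpus = [[(0, 1.0), (2, 3.0)], [(1, 2.0)]]  # two sparse documents
SvmLightCorpus.serialize('/tmp/corpus.svmlight', bow_corpus)
corpus = SvmLightCorpus('/tmp/corpus.svmlight')
print(list(corpus))  # round-trips the two documents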
Example #6
Source File: ucicorpus.py    From topical_word_embeddings with MIT License
def __init__(self, fname, fname_vocab=None):
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        if fname_vocab is None:
            fname_vocab = fname + '.vocab'

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [word.strip() for word in fin]
        self.id2word = dict(enumerate(words))

        self.transposed = True 
Example #7
Source File: test_word2vec.py    From topical_word_embeddings with MIT License
def testLineSentenceWorksWithCompressedFile(self):
        """Does LineSentence work with a compressed file object argument?"""
        with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
            sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split()) 
Example #8
Source File: GensimLDA.py    From termite-data-server with BSD 3-Clause "New" or "Revised" License
def get_texts(self):
		total_docs = 0
		if os.path.isdir( self.input ):
			# Read two levels of files
			filenames = glob.glob('{}/*'.format(self.input))
			for filename in filenames:
				if os.path.isdir(filename):
					filenames += glob.glob('{}/*'.format(filename))
			for filename in filenames:
				if not os.path.isdir( filename ):
					with utils.smart_open( filename ) as f:
						docId = filename
						docContent = u' '.join(f.read().decode('utf-8', 'ignore').splitlines())
						tokens = self.tokenRegex.findall(docContent)
						tokens = [token.lower().encode('utf-8') for token in tokens if token not in STOPWORDS]
						yield tokens
						self.docIds.append(docId)
						total_docs += 1
		else:
			with utils.smart_open(self.input) as f:
				for line in f:
					docId, docContent = line.decode('utf-8', 'ignore').rstrip('\n').split('\t')
					tokens = self.tokenRegex.findall(docContent)
					tokens = [token.lower().encode('utf-8') for token in tokens if token not in STOPWORDS]
					yield tokens
					self.docIds.append(docId)
					total_docs += 1
		self.length = total_docs 
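In the flat-file branch above, the reader expects one document per line in UTF-8, with the document id and body separated by a single tab, for example:

d001	a first toy document about topic models
d002	a second toy document about word embeddings

The directory branch instead treats every file (up to two directory levels deep) as one document and uses the filename as the document id.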
Example #9
Source File: test_word2vec.py    From topical_word_embeddings with MIT License
def testLineSentenceWorksWithNormalFile(self):
        """Does LineSentence work with a file object argument, rather than filename?"""
        with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
            with utils.smart_open(datapath('head500.noblanks.cor')) as fin:
                sentences = word2vec.LineSentence(fin)
                for words in sentences:
                    self.assertEqual(words, utils.to_unicode(orig.readline()).split())
#endclass TestWord2VecSentenceIterators 
Example #10
Source File: lowcorpus.py    From topical_word_embeddings with MIT License
def docbyoffset(self, offset):
        """
        Return the document stored at file position `offset`.
        """
        with utils.smart_open(self.fname) as f:
            f.seek(offset)
            return self.line2doc(f.readline())

# endclass LowCorpus 
Example #11
Source File: malletcorpus.py    From topical_word_embeddings with MIT License
def docbyoffset(self, offset):
        """
        Return the document stored at file position `offset`.
        """
        with utils.smart_open(self.fname) as f:
            f.seek(offset)
            return self.line2doc(f.readline())

# endclass MalletCorpus 
Example #12
Source File: malletcorpus.py    From topical_word_embeddings with MIT License
def __iter__(self):
        """
        Iterate over the corpus at the given filename.

        Yields each document as a bag-of-words, i.e. a list of (word id, word count) tuples, based on the given id2word dictionary.
        """
        with utils.smart_open(self.fname) as f:
            for line in f:
                yield self.line2doc(line) 
Example #13
Source File: svmlightcorpus.py    From topical_word_embeddings with MIT License
def docbyoffset(self, offset):
        """
        Return the document stored at file position `offset`.
        """
        with utils.smart_open(self.fname) as f:
            f.seek(offset)
            return self.line2doc(f.readline())[0] 
Example #14
Source File: svmlightcorpus.py    From topical_word_embeddings with MIT License
def __iter__(self):
        """
        Iterate over the corpus, returning one sparse vector at a time.
        """
        lineno = -1
        self.labels = []
        with utils.smart_open(self.fname) as fin:
            for lineno, line in enumerate(fin):
                doc = self.line2doc(line)
                if doc is not None:
                    if self.store_labels:
                        self.labels.append(doc[1])
                    yield doc[0]
        self.length = lineno + 1 
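Each line parsed here follows the SVMlight convention <target> <featureid>:<value> ..., where feature ids are 1-based on disk (gensim's line2doc shifts them back to 0-based). For example, the two documents from the serialize sketch earlier would be stored roughly as (weight formatting approximate):

0 1:1.0 3:3.0
0 2:2.0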
Example #15
Source File: interfaces.py    From topical_word_embeddings with MIT License
def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save an existing `corpus` to disk.

        Some formats also support saving the dictionary (`feature_id->word` mapping),
        which can in this case be provided by the optional `id2word` parameter.

        >>> MmCorpus.save_corpus('file.mm', corpus)

        Some corpora also support an index of where each document begins, so
        that the documents on disk can be accessed in O(1) time (see the
        `corpora.IndexedCorpus` base class). In this case, `save_corpus` is automatically
        called internally by `serialize`, which does `save_corpus` plus saves the index
        at the same time, so you want to store the corpus with::

        >>> MmCorpus.serialize('file.mm', corpus) # stores index as well, allowing random access to individual documents

        Calling `serialize()` is preferred to calling `save_corpus()`.

        """
        raise NotImplementedError('cannot instantiate abstract base class')

        # example code:
        logger.info("converting corpus to ??? format: %s" % fname)
        with utils.smart_open(fname, 'wb') as fout:
            for doc in corpus: # iterate over the document stream
                fmt = str(doc) # format the document appropriately...
                fout.write(utils.to_utf8("%s\n" % fmt)) # serialize the formatted document to disk
#endclass CorpusABC 
Example #16
Source File: test_word2vec.py    From topical_word_embeddings with MIT License
def testLineSentenceWorksWithFilename(self):
        """Does LineSentence work with a filename argument?"""
        with utils.smart_open(datapath('lee_background.cor')) as orig:
            sentences = word2vec.LineSentence(datapath('lee_background.cor'))
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split()) 
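The LineSentence tests in this listing all rely on the same input convention: one sentence per line, tokens separated by whitespace. A typical downstream use is feeding such a file straight into Word2Vec (the path is hypothetical):

from gensim.models import word2vec

# LineSentence streams the file lazily; plain, .gz and .bz2 files all work.
sentences = word2vec.LineSentence('/tmp/sentences.txt')
model = word2vec.Word2Vec(sentences, min_count=1)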
Example #17
Source File: preprocessing.py    From topical_word_embeddings with MIT License
def read_file(path):
    with utils.smart_open(path) as fin:
        return fin.read()