#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
Corpus in the GibbsLda++ List-of-Words format.
"""

from __future__ import with_statement

import logging

from gensim import utils
from gensim.corpora import IndexedCorpus
from six import iteritems, iterkeys


logger = logging.getLogger('gensim.corpora.lowcorpus')


def split_on_space(s):
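    """Split a line on spaces and return the list of its non-empty unicode tokens."""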
    return [word for word in utils.to_unicode(s).strip().split(' ') if word]


class LowCorpus(IndexedCorpus):
    """
    List-of-Words corpus; handles input in GibbsLda++ format.

    Quoting http://gibbslda.sourceforge.net/#3.2_Input_Data_Format::

        Both data for training/estimating the model and new data (i.e., previously
        unseen data) have the same format as follows:

        [M]
        [document1]
        [document2]
        ...
        [documentM]

        in which the first line is the total number for documents [M]. Each line
        after that is one document. [documenti] is the ith document of the dataset
        that consists of a list of Ni words/terms.

        [documenti] = [wordi1] [wordi2] ... [wordiNi]

        in which all [wordij] (i=1..M, j=1..Ni) are text strings and they are separated
        by the blank character.
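
    Example (a minimal sketch; assumes a file ``/tmp/corpus.low`` saved in the
    format above)::

        >>> corpus = LowCorpus('/tmp/corpus.low')
        >>> for doc in corpus:
        ...     print(doc)  # one list of (word_id, count) 2-tuples per document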
    """
    def __init__(self, fname, id2word=None, line2words=split_on_space):
        """
        Initialize the corpus from a file.

        `id2word` and `line2words` are optional parameters.
        If provided, `id2word` is a dictionary mapping between word_ids (integers)
        and words (strings). If not provided, the mapping is constructed from
        the documents.

        `line2words` is a function which converts lines into tokens. Defaults to
        simple splitting on spaces.
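
        For example, to split each line on commas instead of spaces (a sketch;
        assumes a comma-separated input file)::

            >>> def split_on_comma(line):
            ...     return [w for w in utils.to_unicode(line).strip().split(',') if w]
            >>> corpus = LowCorpus('/tmp/corpus.low', line2words=split_on_comma)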
        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s" % fname)

        self.fname = fname # input file, see class doc for format
        self.line2words = line2words # how to translate lines into words (simply split on space by default)
        self.num_docs = self._calculate_num_docs()

        if not id2word:
            # build a list of all word types in the corpus (distinct words)
            logger.info("extracting vocabulary from the corpus")
            all_terms = set()
            self.use_wordids = False # return documents as (word, wordCount) 2-tuples
            for doc in self:
                all_terms.update(word for word, wordCnt in doc)
            all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id
            self.id2word = dict(enumerate(all_terms)) # build a mapping of word id (int) -> word (string)
        else:
            logger.info("using provided word mapping (%i ids)" % len(id2word))
            self.id2word = id2word
        self.word2id = dict((v, k) for k, v in iteritems(self.id2word))
        self.num_terms = len(self.word2id)
        self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples

        logger.info("loaded corpus with %i documents and %i terms from %s" %
                     (self.num_docs, self.num_terms, fname))

    def _calculate_num_docs(self):
        # the first line of the input is the number of documents (an integer);
        # raises ValueError if it cannot be parsed. an empty file means zero documents.
        with utils.smart_open(self.fname) as fin:
            try:
                result = int(next(fin))
            except StopIteration:
                result = 0

        return result

    def __len__(self):
        return self.num_docs

    def line2doc(self, line):
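        """
        Convert a single line (= one document) into a bag-of-words vector.

        With a word id mapping active, return a list of `(word_id, count)`
        2-tuples, silently ignoring words missing from the mapping; otherwise
        return `(word, count)` 2-tuples.
        """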
        words = self.line2words(line)

        if self.use_wordids:
            # get all distinct terms in this document, ignore unknown words
            uniq_words = set(words).intersection(iterkeys(self.word2id))

            # the following creates a unique list of words *in the same order*
            # as they were in the input. when iterating over the documents,
            # the (word, count) pairs will appear in the same order as they
            # were in the input (bar duplicates), which looks better.
            # if this was not needed, we might as well have used use_words = set(words)
            use_words, marker = [], set()
            for word in words:
                if (word in uniq_words) and (word not in marker):
                    use_words.append(word)
                    marker.add(word)
            # construct a list of (wordIndex, wordFrequency) 2-tuples
            doc = list(zip(map(self.word2id.get, use_words),
                           map(words.count, use_words)))
        else:
            uniq_words = set(words)
            # construct a list of (word, wordFrequency) 2-tuples
            doc = list(zip(uniq_words, map(words.count, uniq_words)))

        # return the document, then forget it and move on to the next one
        # note that this way, only one doc is stored in memory at a time, not the whole corpus
        return doc

    def __iter__(self):
        """
        Iterate over the corpus, returning one bag-of-words vector at a time.
        """
        with utils.smart_open(self.fname) as fin:
            for lineno, line in enumerate(fin):
                if lineno > 0: # ignore the first line = number of documents
                    yield self.line2doc(line)

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the List-of-words format.

        This function is automatically called by `LowCorpus.serialize`; don't
        call it directly, call `serialize` instead. The `metadata` argument is
        accepted for interface compatibility and ignored by this format.
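
        Example via `serialize` (a sketch; `/tmp/corpus.low` stands in for any
        output path)::

            >>> bow_corpus = [[(0, 2), (1, 1)], [(0, 1)]]  # two documents
            >>> LowCorpus.serialize('/tmp/corpus.low', bow_corpus)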
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in List-Of-Words format into %s" % fname)
        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8('%i\n' % len(corpus)))
            for doc in corpus:
                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] * int(value))
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

        if truncated:
            logger.warning("List-of-words format can only save vectors with "
                            "integer elements; %i float entries were truncated to integer value" %
                            truncated)
        return offsets

    def docbyoffset(self, offset):
        """
        Return the document stored at file position `offset`.
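
        A sketch of random access (offsets are those returned by `save_corpus`,
        or stored in `self.index` when the corpus was built via `serialize`)::

            >>> corpus = LowCorpus('/tmp/corpus.low')
            >>> corpus.docbyoffset(corpus.index[0])  # re-read the first document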
        """
        with utils.smart_open(self.fname) as f:
            f.seek(offset)
            return self.line2doc(f.readline())

# endclass LowCorpus