Python gensim.models.ldamodel.LdaModel() Examples

The following are 9 code examples of gensim.models.ldamodel.LdaModel(). You can go to the original project or source file by following the link above each example. You may also want to check out all available functions/classes of the module gensim.models.ldamodel, or try the search function.
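Before diving into the project examples, here is a minimal, self-contained sketch of the typical LdaModel workflow (the toy documents are illustrative only): build a Dictionary, convert the texts to bag-of-words vectors, train the model, and inspect the result.

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

# toy corpus: each document is a list of tokens
texts = [['human', 'interface', 'computer'],
         ['graph', 'trees', 'system'],
         ['system', 'human', 'trees']]

dictionary = Dictionary(texts)                          # token -> integer id mapping
corpus = [dictionary.doc2bow(text) for text in texts]   # sparse bag-of-words vectors

model = LdaModel(corpus, id2word=dictionary, num_topics=2, passes=10)

print(model.print_topics())   # the top words of each topic
print(model[corpus[0]])       # topic distribution of the first document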
Example #1
Source File: tm_gensim.py    From tmtoolkit with Apache License 2.0
def fit_model(self, data, params, return_data=False):
        """
        Fit model to `data` using gensim with parameter set `params`.
        """
        from gensim.models.ldamodel import LdaModel

        dictionary = params.pop('dictionary', None)

        if hasattr(data, 'dtype') and hasattr(data, 'shape') and hasattr(data, 'transpose'):
            corpus = dtm_to_gensim_corpus(data)
            dtm = data
        else:
            if isinstance(data, tuple) and len(data) == 2:
                dictionary, corpus = data
            else:
                corpus = data
            dtm = gensim_corpus_to_dtm(corpus)

        model = LdaModel(corpus, id2word=dictionary, **params)

        if return_data:
            return model, (corpus, dtm)
        else:
            return model 
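fit_model accepts three input forms: a document-term matrix (anything exposing dtype, shape and transpose), a (dictionary, corpus) pair, or a bare gensim corpus. A hypothetical call for illustration, where fitter stands in for whatever object tmtoolkit attaches this method to and the parameter values are arbitrary:

params = {'dictionary': dictionary, 'num_topics': 10, 'passes': 5}
model, (corpus, dtm) = fitter.fit_model(doc_term_matrix, params, return_data=True)

Note that params is consumed destructively: 'dictionary' is popped out, and everything left over goes straight into the LdaModel constructor.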
Example #2
Source File: test_models.py    From topical_word_embeddings with MIT License
def testLargeMmap(self):
        model = ldamodel.LdaModel(self.corpus, num_topics=2)

        # simulate storing large arrays separately
        model.save(testfile(), sep_limit=0)

        model2 = ldamodel.LdaModel.load(testfile())
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(numpy.allclose(model.expElogbeta, model2.expElogbeta))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector

        # test loading the large model arrays with mmap
        model2 = ldamodel.LdaModel.load(testfile(), mmap='r')
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(numpy.allclose(model.expElogbeta, model2.expElogbeta))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
#endclass TestLdaModel 
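The same two-step pattern works outside the test harness: save(..., sep_limit=0) forces gensim to store large numpy arrays in separate .npy files next to the main pickle, and load(..., mmap='r') then memory-maps those arrays read-only instead of loading them into RAM. A minimal sketch with an illustrative path:

model.save('/tmp/lda.model', sep_limit=0)                   # large arrays stored as separate .npy files
model = ldamodel.LdaModel.load('/tmp/lda.model', mmap='r')  # memory-map them read-only on load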
Example #3
Source File: test_models.py    From topical_word_embeddings with MIT License
def testTransform(self):
        passed = False
        # sometimes, LDA training gets stuck at a local minimum
        # in that case try re-training the model from scratch, hoping for a
        # better random initialization
        for i in range(5): # restart at most 5 times
            # create the transformation model
            model = ldamodel.LdaModel(id2word=dictionary, num_topics=2, passes=100)
            model.update(corpus)

            # transform one document
            doc = list(corpus)[0]
            transformed = model[doc]

            vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests
            expected = [0.13, 0.87]
            passed = numpy.allclose(sorted(vec), sorted(expected), atol=1e-2) # must contain the same values, up to re-ordering
            if passed:
                break
            logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
                            (i, sorted(vec), sorted(expected)))
        self.assertTrue(passed) 
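model[doc] returns only the topics with non-negligible probability, as sparse (topic_id, probability) pairs; matutils.sparse2full pads the missing topics with zeros to produce a fixed-length dense vector, which is what makes the numpy.allclose comparison above possible. For example:

from gensim import matutils
matutils.sparse2full([(1, 0.87)], 2)   # -> array([0.  , 0.87], dtype=float32)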
Example #4
Source File: similarity.py    From bugbug with Mozilla Public License 2.0
def __init__(
        self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8
    ):
        super().__init__(
            cleanup_urls=cleanup_urls,
            nltk_tokenizer=nltk_tokenizer,
            confidence_threshold=confidence_threshold,
        )
        self.corpus = []
        self.bug_ids = []
        for bug in bugzilla.get_bugs():
            self.corpus.append(self.text_preprocess(self.get_text(bug)))
            self.bug_ids.append(bug["id"])

        indexes = list(range(len(self.corpus)))
        random.shuffle(indexes)
        self.corpus = [self.corpus[idx] for idx in indexes]
        self.bug_ids = [self.bug_ids[idx] for idx in indexes]

        self.dictionary = Dictionary(self.corpus)

        self.model = LdaModel([self.dictionary.doc2bow(text) for text in self.corpus]) 
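Since LdaModel is given only a corpus here, gensim's defaults apply, notably num_topics=100 and a single training pass. A hedged sketch of how such a model could then be queried for a new bug, reusing the class's own text_preprocess/get_text helpers (the query flow itself is an assumption, not code from bugbug):

bow = self.dictionary.doc2bow(self.text_preprocess(self.get_text(new_bug)))
topics = self.model.get_document_topics(bow)   # sparse (topic_id, probability) pairs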
Example #5
Source File: lda_context_utils.py    From yelp with GNU Lesser General Public License v2.1
def update_reviews_with_topics(topic_model, corpus_list, reviews):
    """

    :type minimum_probability: float
    :param minimum_probability:
    :type topic_model: LdaModel
    :param topic_model:
    :type corpus_list: list
    :param reviews:
    """
    # print('reviews length', len(reviews))

    for review, corpus in zip(reviews, corpus_list):
        review[Constants.TOPICS_FIELD] =\
            topic_model.get_document_topics(corpus)

        non_zero_topics = [topic[0] for topic in review[Constants.TOPICS_FIELD]]

        for topic_index in range(Constants.TOPIC_MODEL_NUM_TOPICS):
            if topic_index not in non_zero_topics:
                review[Constants.TOPICS_FIELD].insert(
                    topic_index, [topic_index, 0.0]) 
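The padding loop exists because get_document_topics silently drops topics whose probability falls below the model's minimum_probability. A sketch of an alternative that makes the padding unnecessary, at the cost of returning every topic however small:

review[Constants.TOPICS_FIELD] = topic_model.get_document_topics(
    corpus, minimum_probability=0.0)   # return every topic, including near-zero ones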
Example #6
Source File: lda_worker.py    From topical_word_embeddings with MIT License
def initialize(self, myid, dispatcher, **model_params):
        self.lock_update = threading.Lock()
        self.jobsdone = 0 # how many jobs has this worker completed?
        self.myid = myid # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove?
        self.dispatcher = dispatcher
        self.finished = False
        logger.info("initializing worker #%s" % myid)
        self.model = ldamodel.LdaModel(**model_params) 
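Everything in model_params is forwarded verbatim to the LdaModel constructor, so the dispatcher decides the model configuration while each worker simply hosts a replica. A hypothetical direct call, for illustration only:

worker.initialize(0, dispatcher, id2word=dictionary, num_topics=100)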
Example #7
Source File: test_models.py    From topical_word_embeddings with MIT License
def testPersistence(self):
        model = ldamodel.LdaModel(self.corpus, num_topics=2)
        model.save(testfile())
        model2 = ldamodel.LdaModel.load(testfile())
        self.assertEqual(model.num_topics, model2.num_topics)
        self.assertTrue(numpy.allclose(model.expElogbeta, model2.expElogbeta))
        tstvec = []
        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector 
Example #8
Source File: test_models.py    From topical_word_embeddings with MIT License
def testTopicSeeding(self):
        passed = False
        for topic in range(2):
            # try seeding each of the two topics in turn; we should get the
            # same topics out, with their order determined by which topic
            # was seeded
            for i in range(5): # restart at most 5 times

                eta = numpy.ones((2, len(dictionary))) * 0.5
                system = dictionary.token2id[u'system']
                trees = dictionary.token2id[u'trees']

                # aggressively seed the word 'system', in one of the
                # two topics, 10 times higher than the other words
                eta[topic, system] *= 10

                model = ldamodel.LdaModel(id2word=dictionary, num_topics=2, passes=200, eta=eta)
                model.update(corpus)

                topics = [dict((word, p) for p, word in model.show_topic(j)) for j in range(2)]

                # check that the word 'system' got a high weight in the topic
                # we seeded, and the word 'trees' (the main word in the other
                # topic) a low weight -- and vice versa for the other topic
                # (which we didn't seed with 'system')
                result = [[topics[topic].get(u'system',0), topics[topic].get(u'trees',0)],
                          [topics[1-topic].get(u'system',0), topics[1-topic].get(u'trees',0)]]
                expected = [[0.385, 0.022],
                            [0.025, 0.157]]
                passed = numpy.allclose(result, expected, atol=1e-2)
                if passed:
                    break
                logging.warning("LDA failed to converge on attempt %i (got %s, expected %s)" %
                                (i, result, expected))
            self.assertTrue(passed) 
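Note the unpacking order in the dict comprehension above: in this (older) gensim API, show_topic returns (probability, word) pairs, so the comprehension flips them into a word-to-probability mapping; recent gensim versions return (word, probability) instead. For instance:

topic0 = dict((word, p) for p, word in model.show_topic(0))   # e.g. {'system': 0.385, 'trees': 0.022, ...}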
Example #9
Source File: lda_context_utils.py    From yelp with GNU Lesser General Public License v2.1
def get_topic_distribution(record, lda_model, dictionary, minimum_probability,
                           sampling_method=None, max_words=None):
    """

    :type record: dict
    :type lda_model: LdaModel
    :type minimum_probability: float
    :param sampling_method: a float in the range [0,1] that
    indicates the proportion of text that should be sampled from the review.
    It can also take the string value of 'max', indicating that only the
    word with the highest probability from the topic will be sampled
     text. If None then all the review text is taken
    :param max_words: is the set of words with maximum probability for each
    contextual topic
    """
    # review_bow = [record[Constants.BOW_FIELD]]
    # review_bow =\
    #     sample_bag_of_words(review_bow, sampling_method, max_words)

    # corpus = dictionary.doc2bow(review_bow[0])
    corpus = record[Constants.CORPUS_FIELD]
    lda_corpus = lda_model.get_document_topics(
        corpus, minimum_probability=minimum_probability)

    topic_distribution = numpy.zeros(lda_model.num_topics)
    for pair in lda_corpus:
        topic_distribution[pair[0]] = pair[1]

    return topic_distribution
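A hedged usage sketch (record and the model are assumed to exist; note that dictionary and the sampling parameters are only consumed by the commented-out sampling path, since the active code reads the precomputed corpus straight from the record):

distribution = get_topic_distribution(record, lda_model, dictionary,
                                      minimum_probability=0.0)
# `distribution` is a dense numpy vector of length lda_model.num_topics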