Python sklearn.decomposition.LatentDirichletAllocation() Examples

The following are 30 code examples of sklearn.decomposition.LatentDirichletAllocation(), drawn from open-source projects. Each example notes its source file, originating project, and license. You may also want to check out all available functions and classes of the sklearn.decomposition module.
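Before the project examples, here is a minimal, self-contained sketch of the typical workflow: turn raw text into a bag-of-words count matrix, fit the model, and read the topics off the fitted components_ attribute. This sketch is not taken from any of the projects below; it assumes scikit-learn >= 1.0 for get_feature_names_out(), while several examples below target older releases that expose get_feature_names() instead.

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

docs = [
    "the cat sat on the mat",
    "dogs and cats are friendly pets",
    "stock prices rose sharply today",
    "the market closed higher on strong earnings",
]

# LDA expects non-negative count data, e.g. a bag-of-words matrix.
tf = CountVectorizer(stop_words='english')
X = tf.fit_transform(docs)

lda = LatentDirichletAllocation(n_components=2, max_iter=10, random_state=0)
doc_topic = lda.fit_transform(X)   # shape: (n_docs, n_components)

# List each topic's top words from the fitted components_ matrix.
feature_names = tf.get_feature_names_out()
for k, component in enumerate(lda.components_):
    top = component.argsort()[-3:][::-1]
    print('topic %d:' % k, [feature_names[i] for i in top])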
Example #1
Source File: test_decomposition.py    From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_objectmapper(self):
        df = pdml.ModelFrame([])
        self.assertIs(df.decomposition.PCA, decomposition.PCA)
        self.assertIs(df.decomposition.IncrementalPCA,
                      decomposition.IncrementalPCA)
        self.assertIs(df.decomposition.KernelPCA, decomposition.KernelPCA)
        self.assertIs(df.decomposition.FactorAnalysis,
                      decomposition.FactorAnalysis)
        self.assertIs(df.decomposition.FastICA, decomposition.FastICA)
        self.assertIs(df.decomposition.TruncatedSVD, decomposition.TruncatedSVD)
        self.assertIs(df.decomposition.NMF, decomposition.NMF)
        self.assertIs(df.decomposition.SparsePCA, decomposition.SparsePCA)
        self.assertIs(df.decomposition.MiniBatchSparsePCA,
                      decomposition.MiniBatchSparsePCA)
        self.assertIs(df.decomposition.SparseCoder, decomposition.SparseCoder)
        self.assertIs(df.decomposition.DictionaryLearning,
                      decomposition.DictionaryLearning)
        self.assertIs(df.decomposition.MiniBatchDictionaryLearning,
                      decomposition.MiniBatchDictionaryLearning)

        self.assertIs(df.decomposition.LatentDirichletAllocation,
                      decomposition.LatentDirichletAllocation) 
Example #2
Source File: test_online_lda.py    From twitter-stock-recommendation with MIT License
def check_verbosity(verbose, evaluate_every, expected_lines,
                    expected_perplexities):
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=3,
                                    learning_method='batch',
                                    verbose=verbose,
                                    evaluate_every=evaluate_every,
                                    random_state=0)
    out = StringIO()
    old_out, sys.stdout = sys.stdout, out
    try:
        lda.fit(X)
    finally:
        sys.stdout = old_out

    n_lines = out.getvalue().count('\n')
    n_perplexity = out.getvalue().count('perplexity')
    assert_equal(expected_lines, n_lines)
    assert_equal(expected_perplexities, n_perplexity) 
Example #3
Source File: test_online_lda.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_lda_fit_perplexity():
    # Test that the perplexity computed during fit is consistent with what is
    # returned by the perplexity method
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch', random_state=0,
                                    evaluate_every=1)
    lda.fit(X)

    # Perplexity computed at end of fit method
    perplexity1 = lda.bound_

    # Result of perplexity method on the train set
    perplexity2 = lda.perplexity(X)

    assert_almost_equal(perplexity1, perplexity2) 
Example #4
Source File: test_online_lda.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_lda_score(method):
    # Test LDA score for both learning methods:
    # score should be higher after more iterations
    n_components, X = _build_sparse_mtx()
    lda_1 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=1, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_2 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=10, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_1.fit_transform(X)
    score_1 = lda_1.score(X)

    lda_2.fit_transform(X)
    score_2 = lda_2.score(X)
    assert_greater_equal(score_2, score_1) 
Example #5
Source File: test_online_lda.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_lda_perplexity(method):
    # Test LDA perplexity for both learning methods:
    # perplexity should be lower after more iterations
    n_components, X = _build_sparse_mtx()
    lda_1 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=1, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_2 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=10, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_1.fit(X)
    perp_1 = lda_1.perplexity(X, sub_sampling=False)

    lda_2.fit(X)
    perp_2 = lda_2.perplexity(X, sub_sampling=False)
    assert_greater_equal(perp_1, perp_2)

    perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)
    perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)
    assert_greater_equal(perp_1_subsampling, perp_2_subsampling) 
Example #6
Source File: test_online_lda.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_lda_preplexity_mismatch():
    # test dimension mismatch in `perplexity` method
    rng = np.random.RandomState(0)
    n_components = rng.randint(3, 6)
    n_samples = rng.randint(6, 10)
    X = rng.randint(4, size=(n_samples, 10))  # use the seeded RNG for reproducibility
    lda = LatentDirichletAllocation(n_components=n_components,
                                    learning_offset=5., total_samples=20,
                                    random_state=rng)
    lda.fit(X)
    # invalid samples
    invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components))
    assert_raises_regexp(ValueError, r'Number of samples',
                         lda._perplexity_precomp_distr, X, invalid_n_samples)
    # invalid topic number
    invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1))
    assert_raises_regexp(ValueError, r'Number of topics',
                         lda._perplexity_precomp_distr, X,
                         invalid_n_components) 
Example #7
Source File: test_online_lda.py    From Mastering-Elasticsearch-7.0 with MIT License
def check_verbosity(verbose, evaluate_every, expected_lines,
                    expected_perplexities):
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=3,
                                    learning_method='batch',
                                    verbose=verbose,
                                    evaluate_every=evaluate_every,
                                    random_state=0)
    out = StringIO()
    old_out, sys.stdout = sys.stdout, out
    try:
        lda.fit(X)
    finally:
        sys.stdout = old_out

    n_lines = out.getvalue().count('\n')
    n_perplexity = out.getvalue().count('perplexity')
    assert_equal(expected_lines, n_lines)
    assert_equal(expected_perplexities, n_perplexity) 
Example #8
Source File: topic.py    From Python-DevOps with MIT License
def train_lda(corpus, n_topics=10, max_df=0.95, min_df=2,
              cleaning=clearstring, stop_words='english'):
    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    tf_vectorizer = CountVectorizer(
        max_df=max_df,
        min_df=min_df,
        stop_words=stop_words)
    tf = tf_vectorizer.fit_transform(corpus)
    tf_features = tf_vectorizer.get_feature_names()
    lda = LatentDirichletAllocation(
        n_components=n_topics,  # n_topics was renamed to n_components in scikit-learn 0.19
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=0).fit(tf)
    return TOPIC(tf_features, lda) 
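A hedged usage note for train_lda above: clearstring and TOPIC come from the surrounding Python-DevOps module, so a quick standalone call might pass cleaning=None (skipping the project's cleaning helper) and min_df=1 so that terms appearing in only one document are kept:

corpus = ['deploy the app to production',
          'monitor production logs and alerts',
          'retrain the model on new data']
topic = train_lda(corpus, n_topics=2, min_df=1, cleaning=None)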
Example #9
Source File: topics.py    From atap with Apache License 2.0
def __init__(self, n_topics=50, estimator='LDA'):
        """
        n_topics is the desired number of topics
        To use Latent Semantic Analysis, set estimator to 'LSA',
        To use Non-Negative Matrix Factorization, set estimator to 'NMF',
        otherwise, defaults to Latent Dirichlet Allocation ('LDA').
        """
        self.n_topics = n_topics

        if estimator == 'LSA':
            self.estimator = TruncatedSVD(n_components=self.n_topics)
        elif estimator == 'NMF':
            self.estimator = NMF(n_components=self.n_topics)
        else:
            self.estimator = LatentDirichletAllocation(
                n_components=self.n_topics)  # n_topics was removed in scikit-learn 0.21

        # Note: the step is named 'tfidf' but feeds raw term counts
        # (CountVectorizer) to the model; LDA is defined over counts,
        # not tf-idf weights.
        self.model = Pipeline([
            ('norm', TextNormalizer()),
            ('tfidf', CountVectorizer(tokenizer=identity,
                                      preprocessor=None, lowercase=False)),
            ('model', self.estimator)
        ]) 
Example #10
Source File: LDA_Analysis.py    From Spider with MIT License
def word2vec(word_list, n_features=1000, topics=5):
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    max_features=n_features,
                                    # stop_words='english',
                                    max_df=0.5,
                                    min_df=10)
    tf = tf_vectorizer.fit_transform(word_list)

    lda = LatentDirichletAllocation(n_components=topics,  # number of topics
                                    # for a small corpus used only for learning,
                                    # 'batch' is the safer choice: it leaves far
                                    # fewer parameters to tune
                                    learning_method='batch',
                                    )
    # Train the model with variational Bayes
    lda.fit(tf)

    # Vocabulary used to list each topic's keywords
    tf_feature_names = tf_vectorizer.get_feature_names()

    return lda, tf, tf_feature_names, tf_vectorizer

# Present the topics as a visualization
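The returned pair lda, tf_feature_names is exactly what a keyword listing needs. A short sketch of such a helper (hypothetical; not part of the original Spider project):

def print_top_words(lda, feature_names, n_top_words=10):
    # For each topic, print the n_top_words terms with the largest weights.
    for topic_idx, topic in enumerate(lda.components_):
        top = topic.argsort()[-n_top_words:][::-1]
        print('Topic #%d: %s' % (topic_idx,
                                 ' '.join(feature_names[i] for i in top)))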
Example #11
Source File: test_online_lda.py    From twitter-stock-recommendation with MIT License
def test_lda_preplexity_mismatch():
    # test dimension mismatch in `perplexity` method
    rng = np.random.RandomState(0)
    n_components = rng.randint(3, 6)
    n_samples = rng.randint(6, 10)
    X = rng.randint(4, size=(n_samples, 10))  # use the seeded RNG for reproducibility
    lda = LatentDirichletAllocation(n_components=n_components,
                                    learning_offset=5., total_samples=20,
                                    random_state=rng)
    lda.fit(X)
    # invalid samples
    invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components))
    assert_raises_regexp(ValueError, r'Number of samples',
                         lda._perplexity_precomp_distr, X, invalid_n_samples)
    # invalid topic number
    invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1))
    assert_raises_regexp(ValueError, r'Number of topics',
                         lda._perplexity_precomp_distr, X,
                         invalid_n_components) 
Example #12
Source File: test_online_lda.py    From twitter-stock-recommendation with MIT License
def test_lda_perplexity():
    # Test LDA perplexity for both 'online' and 'batch' learning methods:
    # perplexity should be lower after more iterations
    n_components, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=1, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=10, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_1.fit(X)
        perp_1 = lda_1.perplexity(X, sub_sampling=False)

        lda_2.fit(X)
        perp_2 = lda_2.perplexity(X, sub_sampling=False)
        assert_greater_equal(perp_1, perp_2)

        perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)
        perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)
        assert_greater_equal(perp_1_subsampling, perp_2_subsampling) 
Example #13
Source File: test_online_lda.py    From twitter-stock-recommendation with MIT License
def test_lda_score():
    # Test LDA score for both 'online' and 'batch' learning methods:
    # score should be higher after more iterations
    n_components, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=1, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=10, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_1.fit_transform(X)
        score_1 = lda_1.score(X)

        lda_2.fit_transform(X)
        score_2 = lda_2.score(X)
        assert_greater_equal(score_2, score_1) 
Example #14
Source File: test_online_lda.py    From twitter-stock-recommendation with MIT License
def test_lda_fit_perplexity():
    # Test that the perplexity computed during fit is consistent with what is
    # returned by the perplexity method
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch', random_state=0,
                                    evaluate_every=1)
    lda.fit(X)

    # Perplexity computed at end of fit method
    perplexity1 = lda.bound_

    # Result of perplexity method on the train set
    perplexity2 = lda.perplexity(X)

    assert_almost_equal(perplexity1, perplexity2) 
Example #15
Source File: test_online_lda.py    From twitter-stock-recommendation with MIT License
def test_lda_empty_docs():
    """Test LDA on empty document (all-zero rows)."""
    Z = np.zeros((5, 4))
    for X in [Z, csr_matrix(Z)]:
        lda = LatentDirichletAllocation(max_iter=750).fit(X)
        assert_almost_equal(lda.components_.sum(axis=0),
                            np.ones(lda.components_.shape[1])) 
Example #16
Source File: test_text2mat.py    From hypertools with MIT License
def test_LDA_class_instance():
    user_model = LatentDirichletAllocation(n_components=15)
    assert text2mat(data, semantic=user_model, corpus=data)[0].shape[1] == 15
Example #17
Source File: test_online_lda.py    From twitter-stock-recommendation with MIT License
def test_doc_topic_distr_deprecation():
    # Test that the appropriate warning message is displayed when a user
    # attempts to pass the doc_topic_distr argument to the perplexity method
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch',
                                    total_samples=100, random_state=0)
    distr1 = lda.fit_transform(X)
    distr2 = None
    assert_warns(DeprecationWarning, lda.perplexity, X, distr1)
    assert_warns(DeprecationWarning, lda.perplexity, X, distr2) 
Example #18
Source File: test_online_lda.py    From twitter-stock-recommendation with MIT License
def test_lda_negative_input():
    # fitting a matrix that contains negative values should raise a ValueError
    X = -np.ones((5, 10))
    lda = LatentDirichletAllocation()
    regex = r"^Negative values in data passed"
    assert_raises_regexp(ValueError, regex, lda.fit, X) 
Example #19
Source File: test_online_lda.py    From twitter-stock-recommendation with MIT License
def test_lda_partial_fit_multi_jobs():
    # Test LDA online training with multi CPU
    rng = np.random.RandomState(0)
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2,
                                    learning_offset=5., total_samples=30,
                                    random_state=rng)
    for i in range(2):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps) 
Example #20
Source File: test_online_lda.py    From twitter-stock-recommendation with MIT License
def test_lda_multi_jobs():
    n_components, X = _build_sparse_mtx()
    # Test LDA batch training with multi CPU
    for method in ('online', 'batch'):
        rng = np.random.RandomState(0)
        lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2,
                                        learning_method=method,
                                        evaluate_every=1, random_state=rng)
        lda.fit(X)

        correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
        for c in lda.components_:
            top_idx = set(c.argsort()[-3:][::-1])
            assert_true(tuple(sorted(top_idx)) in correct_idx_grps) 
Example #21
Source File: test_online_lda.py    From twitter-stock-recommendation with MIT License
def test_lda_no_component_error():
    # test `transform` and `perplexity` before `fit`
    rng = np.random.RandomState(0)
    X = rng.randint(4, size=(20, 10))
    lda = LatentDirichletAllocation()
    regex = r"^no 'components_' attribute"
    assert_raises_regexp(NotFittedError, regex, lda.transform, X)
    assert_raises_regexp(NotFittedError, regex, lda.perplexity, X) 
Example #22
Source File: test_online_lda.py    From twitter-stock-recommendation with MIT License
def test_lda_score_perplexity():
    # Test the relationship between LDA score and perplexity
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                    random_state=0)
    lda.fit(X)
    perplexity_1 = lda.perplexity(X, sub_sampling=False)

    score = lda.score(X)
    perplexity_2 = np.exp(-1. * (score / np.sum(X.data)))
    assert_almost_equal(perplexity_1, perplexity_2) 
Example #23
Source File: test_text2mat.py    From hypertools with MIT License
def test_tfidf_LDA():
    # without the assert, this test could never fail
    assert isinstance(text2mat(data, vectorizer='TfidfVectorizer',
                               semantic='LatentDirichletAllocation',
                               corpus=data)[0], np.ndarray)
Example #24
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def transformer_factory(self):
        return LatentDirichletAllocation(n_components=self.width, learning_method='online', random_state=71) 
Example #25
Source File: test_text2mat.py    From hypertools with MIT License
def test_text_model_params():
    assert isinstance(text2mat(data,
                               semantic={'model': 'LatentDirichletAllocation',
                                         'params': {'learning_method': 'batch'}},
                               corpus=data)[0], np.ndarray)
Example #26
Source File: test_online_lda.py    From twitter-stock-recommendation with MIT License
def test_lda_default_prior_params():
    # default prior parameter should be `1 / topics`
    # and verbose params should not affect result
    n_components, X = _build_sparse_mtx()
    prior = 1. / n_components
    lda_1 = LatentDirichletAllocation(n_components=n_components,
                                      doc_topic_prior=prior,
                                      topic_word_prior=prior, random_state=0)
    lda_2 = LatentDirichletAllocation(n_components=n_components,
                                      random_state=0)
    topic_distr_1 = lda_1.fit_transform(X)
    topic_distr_2 = lda_2.fit_transform(X)
    assert_almost_equal(topic_distr_1, topic_distr_2) 
Example #27
Source File: test_text2mat.py    From hypertools with MIT License
def test_LDA_class():
    assert text2mat(data, semantic=LatentDirichletAllocation, corpus=data)[0].shape[1] == 10
Example #28
Source File: topic.py    From Python-DevOps with MIT License
def train_lda(corpus, n_topics=10, max_df=0.95, min_df=2,
              cleaning=clearstring, stop_words='english'):
    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, stop_words=stop_words)
    tf = tf_vectorizer.fit_transform(corpus)
    tf_features = tf_vectorizer.get_feature_names()
    lda = LatentDirichletAllocation(n_components=n_topics,  # renamed from n_topics in 0.19
                                    max_iter=5, learning_method='online',
                                    learning_offset=50., random_state=0).fit(tf)
    return TOPIC(tf_features, lda)
Example #29
Source File: LatentDirichletAllocation.py    From mltk-algo-contrib with Apache License 2.0
def __init__(self, options):
        self.handle_options(options)
        out_params = convert_params(
            options.get('params', {}),
            floats=['doc_topic_prior', 'learning_decay', 'learning_offset',
                    'perp_tol', 'mean_change_tol'],
            strs=['learning_method'],
            ints=['k', 'max_iter', 'batch_size', 'evaluate_every', 'total_samples',
                  'max_doc_update_iter', 'n_jobs', 'verbose', 'random_state'],
            # On scikit-learn >= 0.21, 'k' should map to 'n_components' instead:
            # the n_topics parameter was removed.
            aliases={'k': 'n_topics'}
        )

        self.estimator = _LatentDirichletAllocation(**out_params) 
Example #30
Source File: build_lda_model.py    From altair with Apache License 2.0
def build_lda_model(code_scripts_list, topics, vocab, use_binary=False, n_jobs=1):

    # Vectorize the python scripts with bag of words
    bow_model = CountVectorizer(analyzer="word", vocabulary=vocab, binary=use_binary)
    bow_vector_values = bow_model.transform(code_scripts_list).toarray()

    # Train/Fit LDA (n_components replaces the removed n_topics parameter;
    # pass the caller's n_jobs instead of hard-coding 1)
    lda_model = LatentDirichletAllocation(n_components=topics,
                                          learning_method="online",
                                          random_state=0, n_jobs=n_jobs)
    lda_model.fit(bow_vector_values)

    return lda_model
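A hedged usage sketch for build_lda_model; the vocabulary and scripts below are made up for illustration:

vocab = ['import', 'def', 'class', 'return', 'print']
scripts = ["import os\ndef main(): print('hi')",
           'class Foo(object):\n    def bar(self): return 1']
lda_model = build_lda_model(scripts, topics=2, vocab=vocab)
print(lda_model.components_.shape)   # (2, 5): one row of term weights per topic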