python source code of test_termDocMatrixFromScikit

scattertext-master
- demo_sklearn.py
- demo_general_inquirer.py
- demo_names.py
- demo_tsne_style_for_publication.py
- demo_embeddings_pca.py
- demo_insignificant_greyed_out.py
- demo_custom_coordinates.py
- simple.py
- demo_moral_foundations.py
- demo_dense_rank_difference.py
- demo_pair_plot_movies_doc2vec.py
- demo_two_axis.py
- scattertext
  - frequencyreaders
    - __init__.py
    - DefaultBackgroundFrequencies.py
  - characteristic
    - __init__.py
    - DenseRankCharacteristicness.py
  - diachronic
    - BubbleDiachronicVisualization.py
    - DiachronicTermMiner.py
    - DiachronicPairPlot.py
    - GanttChart.py
    - DiachronicVisualizer.py
    - __init__.py
    - TimeStructure.py
  - distancemeasures
    - DistanceMeasureBase.py
    - EuclideanDistance.py
    - __init__.py
  - TermDocMatrixFactory.py
  - DocsAndLabelsFromCorpus.py
  - graphs
    - ComponentDiGraph.py
    - GraphStructure.py
    - __init__.py
    - SimpleDiGraph.py
    - ComponentDiGraphHTMLRenderer.py
  - termscoring
    - ZScores.py
    - RankDifference.py
    - LogOddsUniformativePriorScore.py
    - CornerScore.py
    - CredTFIDF.py
    - ScaledFScore.py
    - CorpusBasedTermScorer.py
    - BM25Difference.py
    - RelativeEntropy.py
    - CohensDCalculator.py
    - MannWhitneyU.py
    - OLSUngnarStyle.py
    - __init__.py
    - test_credTFIDF.py
    - BetaPosterior.py
    - CohensD.py
  - categoryprojector
    - pairplot.py
    - CategoryProjectorEvaluator.py
    - OptimalProjection.py
    - CategoryProjection.py
    - __init__.py
    - CategoryProjector.py
  - PValGetter.py
  - CorpusFromParsedDocuments.py
  - representations
    - EmbeddingsResolver.py
    - GensimPhraseAugmenter.py
    - Doc2VecBuilder.py
    - CorpusSentenceIterator.py
    - __init__.py
    - Word2VecFromParsedCorpus.py
    - CategoryEmbeddings.py
  - Scalers.py
  - CorpusFromScikit.py
  - viz
    - VizDataAdapter.py
    - BasicHTMLFromScatterplotStructure.py
    - PairPlotFromScattertextStructure.py
    - __init__.py
    - HTMLSemioticSquareViz.py
    - ScatterplotStructure.py
  - Common.py
  - ScatterChartData.py
  - CorpusFromFeatureDict.py
  - TermDocMatrixFilter.py
  - topicmodel
    - interface
      - __init__.py
    - SentencesForTopicModeling.py
    - __init__.py
  - TermDocMatrixFromScikit.py
  - CategoryColorAssigner.py
  - test
    - test_CorpusFromParsedDocuments.py
    - test_featsFromScoredLexicon.py
    - test_autoTermSelector.py
    - test_indexStoreFromDict.py
    - test_CategoryColorAssigner.py
    - test_Scalers.py
    - test_termDocMatrixFromPandas.py
    - test_betaPosterior.py
    - test_termDocMatrixFactory.py
    - test_gensimPhraseAdder.py
    - test_scatterChartExplorer.py
    - test_diachronicTermMiner.py
    - test_WhitespaceNLP.py
    - test_useFullDocAsFeature.py
    - test_semioticSquareFromAxes.py
    - test_featureLister.py
    - test_semioticSquare.py
    - test_logOddsUninformativePriorScore.py
    - test_phraseSelector.py
    - test_HTMLVisualizationAssembly.py
    - test_fourSquareAxes.py
    - test_featsFromSpacyDocAndEmpath.py
    - test_termCategoryFrequencies.py
    - test_percentile_lexicographic.py
    - test_CSRMatrixTools.py
    - test_classPercentageCompactor.py
    - test_PriorFactory.py
    - test_embeddingsResolver.py
    - test_domainCompactor.py
    - test_PMIFiltering.py
    - test_extract_emoji.py
    - test_denseRankCharacteristicness.py
    - test_scaledFScore.py
    - test_vizDataAdapter.py
    - test_unigramsFromSpacyDoc.py
    - test_ZScores.py
    - test_cohensD.py
    - test_termRanker.py
    - test_corpusFromPandas.py
    - test_word2VecFromParsedCorpus.py
    - test_ParsedCorpus.py
    - test_combineDocsIntoDomains.py
    - test_corpusFromScikit.py
    - test_compactTerms.py
    - __init__.py
    - test_HTMLSemioticSquareViz.py
    - test_scatterChart.py
    - test_FeatsFromSpacyDoc.py
    - test_BM25Difference.py
    - test_corpusFromPandasWithoutCategories.py
    - test_credTFIDF.py
    - test_termDocMatrixFromScikit.py
    - test_logOddsRatioUninformativeDirichletPrior.py
    - test_large_int_format.py
    - test_useFullDocAsMetadata.py
    - test_associationCompator.py
    - test_CornerScore.py
    - test_asiannlp.py
    - test_oneClassScatterChart.py
    - test_TermDocMat.py
    - test_relativeEntropy.py
    - test_corpusFromFeatureDict.py
    - test_termDocMatrixFromFrequencies.py
    - test_docsAndLabelsFromCorpus.py
    - test_indexStore.py
    - test_indexStoreFromList.py
  - CorpusDF.py
  - termsignificance
    - LogOddsRatioUninformativeDirichletPrior.py
    - ScaledFScoreSignificance.py
    - __init__.py
    - LogOddsRatioSmoothed.py
    - LogOddsRatioInformativeDirichletPiror.py
    - TermSignificance.py
  - SampleCorpora.py
  - WhitespaceNLP.py
  - Formatter.py
  - CLI.py
  - external
    - phrasemachine
      - phrasemachine.py
      - __init__.py
    - __init__.py
  - termranking
    - DocLengthNormalizedFrequencyRanker.py
    - OncePerDocFrequencyRanker.py
    - DocLengthDividedFrequencyRanker.py
    - __init__.py
    - AbsoluteFrequencyRanker.py
    - TermRanker.py
  - PriorFactory.py
  - Corpus.py
  - ScatterChart.py
  - TermDocMatrixFromFrequencies.py
  - CorpusFromPandas.py
  - TermDocMatrixWithoutCategories.py
  - indexstore
    - IndexStoreFromDict.py
    - IndexStoreFromList.py
    - __init__.py
    - IndexStore.py
  - TermDocMatrixFromPandas.py
  - TermDocMatrix.py
  - data
    - mfd2.0.csv
    - viz
      - time_plot.html
      - semiotic_new.html
      - pairplot.html
      - scattertext.html
      - autocomplete.css
      - pairplot_without_halo.html
      - scripts
        - range-tree.js
        - rectangle-holder.js
        - main.js
        - d3-scale-chromatic.v1.min.js
        - autocomplete_call.js
        - autocomplete_definition.js
      - semiotic.html
      - search_form.html
      - graph_plot.html
    - hamlet.txt
    - presidential_debates_2016.csv.gz
  - CSRMatrixTools.py
  - OneClassScatterChart.py
  - __init__.py
  - SampleLexicons.py
  - FeatureOuput.py
  - features
    - PhraseMachinePhrases.py
    - UseFullDocAsMetadata.py
    - FeatsFromOnlyEmpath.py
    - SpacyEntities.py
    - FeatsFromMoralFoundationsDictionary.py
    - PyTextRankPhrases.py
    - FeatsFromScoredLexicon.py
    - FeatsFromSpacyDoc.py
    - FeatsFromSpacyDocOnlyEmoji.py
    - FeatsFromSpacyDocAndEmpath.py
    - UnigramsFromSpacyDoc.py
    - FeatsFromGeneralInquirer.py
    - __init__.py
    - UseFullDocAsFeature.py
    - FeastFromSentencePiece.py
    - FeatsFromSpacyDocOnlyNounChunks.py
    - FeatsFromTopicModel.py
  - domain
    - __init__.py
    - CombineDocsIntoDomains.py
  - ParsedCorpus.py
  - ScatterChartExplorer.py
  - AutoTermSelector.py
  - termcompaction
    - CompactTerms.py
    - ClassPercentageCompactor.py
    - DomainCompactor.py
    - PhraseSelector.py
    - ScikitCompactor.py
    - __init__.py
    - AssociationCompactor.py
  - semioticsquare
    - FourSquare.py
    - FourSquareAxis.py
    - SemioticSquareFromAxes.py
    - __init__.py
    - SemioticSquare.py
  - emojis
    - EmojiExtractor.py
    - __init__.py
    - ProcessedEmojiStructure.py
  - TermCategoryFrequencies.py
  - AsianNLP.py
  - DeployedClassifier.py
- demo_four_square.py
- demo_flashtext.py
- demo_gensim_similarity.py
- demo_focused_pair_plot_movies.py
- demo_unified_context.py
- demo_pair_plot_convention_geninq.py
- demo_pair_plot_convention_empath.py
- demo_log_odds_ratio_prior.py
- demo_semiotic.py
- demo_category_frequencies.py
- demo_without_spacy.py
- ISSUE_TEMPLATE
- demo_emoji.py
- demo_chinese.py
- LICENSE
- demo_obama.py
- demo_pair_plot_movies.py
- demo_relative_entropy.py
- demo.py
- demo_beta_posterior.py
- demo_general_inquirer_frequency_plot.py
- demo_pair_plot_movies_umap.py
- demo_tsne_style.py
- demo_tfidf.py
- demo_bm25.py
- demo_japanese.py
- demo_scaled_f_score.py
- PhraseMachineLicense.txt
- .gitattributes
- demo_mann_whitney.py
- demo_pair_plot_20_newsgroups.py
- demo_alt_tokenization.py
- setup.py
- distribution.sh
- demo_multi_category_pca.py
- demo_pca_documents.py
- demo_compact.py
- demo_pair_plot_convention.py
- regendocs.sh
- demo_bow_pca.py
- demo_pytextrank.py
- demo_dense_rank.py
- demo_axis_crossbars_and_labels.py
- demo_cohens_d.py
- demo_pair_plot_movies_mirror.py
- demo_feature_importance.py
- .travis.yml
- README.md
- demo_expected_vs_actual.py
- demo_nmf_topic_model.py
- demo_include_all_contexts.py
- demo_similarity.py
- demo_compact_suppress_documents.py
- demo_characteristic_chart.py
- demo_phrase_machine.py
- demo_sentence_piece.py
- demo_empath.py
- demo_z_scores.py
- demo_word_list_topic_model.py
- demo_umap_documents.py
- demo_pair_plot_movies_phate.py
- demo_custom_topic_model.py
- demo_sparse.py
- demo_pair_plot_category_focused.py
- demo_cred_tfidf.py
- demo_hedges_r.py

from unittest import TestCase

import numpy as np

from scattertext import TermDocMatrixFromScikit
from scattertext.indexstore import IndexStore
from scattertext.test.test_semioticSquare import get_docs_categories_semiotic


class TestTermDocMatrixFromScikit(TestCase):
	"""Unit test for building a term-document matrix via TermDocMatrixFromScikit."""

	def test_build(self):
		"""Build from CountVectorizer output, then check categories and top terms.

		The documents and category names come from get_docs_categories_semiotic().
		The test asserts the first two category names of the built matrix and the
		five terms with the highest scaled F-score for the 'hamlet' category.
		"""
		# Imported locally, as in the original test, rather than at module level.
		from sklearn.feature_extraction.text import CountVectorizer

		categories, docs = get_docs_categories_semiotic()

		# getidx() supplies the integer label for each category name; the same
		# store's values() is handed to the builder as the category names.
		category_index = IndexStore()
		labels = np.array(list(map(category_index.getidx, categories)))

		vectorizer = CountVectorizer()
		doc_term_counts = vectorizer.fit_transform(docs)

		builder = TermDocMatrixFromScikit(
			X=doc_term_counts,
			y=labels,
			feature_vocabulary=vectorizer.vocabulary_,
			category_names=category_index.values())
		term_doc_mat = builder.build()

		leading_categories = term_doc_mat.get_categories()[:2]
		self.assertEqual(leading_categories, ['hamlet', 'jay-z/r. kelly'])

		# Attach the 'hamlet' scores as a column and rank terms by it, best first.
		term_freq_df = term_doc_mat.get_term_freq_df()
		hamlet_scores = term_doc_mat.get_scaled_f_scores('hamlet')
		ranked_df = term_freq_df.assign(score=hamlet_scores).sort_values(
			by='score', ascending=False)
		top_terms = ranked_df.index.tolist()[:5]
		self.assertEqual(top_terms,
			['that', 'march', 'did', 'majesty', 'sometimes'])