python source code of SampleCorpora

scattertext-master
- demo_sklearn.py
- demo_general_inquirer.py
- demo_names.py
- demo_tsne_style_for_publication.py
- demo_embeddings_pca.py
- demo_insignificant_greyed_out.py
- demo_custom_coordinates.py
- simple.py
- demo_moral_foundations.py
- demo_dense_rank_difference.py
- demo_pair_plot_movies_doc2vec.py
- demo_two_axis.py
- scattertext
  - frequencyreaders
    - __init__.py
    - DefaultBackgroundFrequencies.py
  - characteristic
    - __init__.py
    - DenseRankCharacteristicness.py
  - diachronic
    - BubbleDiachronicVisualization.py
    - DiachronicTermMiner.py
    - DiachronicPairPlot.py
    - GanttChart.py
    - DiachronicVisualizer.py
    - __init__.py
    - TimeStructure.py
  - distancemeasures
    - DistanceMeasureBase.py
    - EuclideanDistance.py
    - __init__.py
  - TermDocMatrixFactory.py
  - DocsAndLabelsFromCorpus.py
  - graphs
    - ComponentDiGraph.py
    - GraphStructure.py
    - __init__.py
    - SimpleDiGraph.py
    - ComponentDiGraphHTMLRenderer.py
  - termscoring
    - ZScores.py
    - RankDifference.py
    - LogOddsUniformativePriorScore.py
    - CornerScore.py
    - CredTFIDF.py
    - ScaledFScore.py
    - CorpusBasedTermScorer.py
    - BM25Difference.py
    - RelativeEntropy.py
    - CohensDCalculator.py
    - MannWhitneyU.py
    - OLSUngnarStyle.py
    - __init__.py
    - test_credTFIDF.py
    - BetaPosterior.py
    - CohensD.py
  - categoryprojector
    - pairplot.py
    - CategoryProjectorEvaluator.py
    - OptimalProjection.py
    - CategoryProjection.py
    - __init__.py
    - CategoryProjector.py
  - PValGetter.py
  - CorpusFromParsedDocuments.py
  - representations
    - EmbeddingsResolver.py
    - GensimPhraseAugmenter.py
    - Doc2VecBuilder.py
    - CorpusSentenceIterator.py
    - __init__.py
    - Word2VecFromParsedCorpus.py
    - CategoryEmbeddings.py
  - Scalers.py
  - CorpusFromScikit.py
  - viz
    - VizDataAdapter.py
    - BasicHTMLFromScatterplotStructure.py
    - PairPlotFromScattertextStructure.py
    - __init__.py
    - HTMLSemioticSquareViz.py
    - ScatterplotStructure.py
  - Common.py
  - ScatterChartData.py
  - CorpusFromFeatureDict.py
  - TermDocMatrixFilter.py
  - topicmodel
    - interface
      - __init__.py
    - SentencesForTopicModeling.py
    - __init__.py
  - TermDocMatrixFromScikit.py
  - CategoryColorAssigner.py
  - test
    - test_CorpusFromParsedDocuments.py
    - test_featsFromScoredLexicon.py
    - test_autoTermSelector.py
    - test_indexStoreFromDict.py
    - test_CategoryColorAssigner.py
    - test_Scalers.py
    - test_termDocMatrixFromPandas.py
    - test_betaPosterior.py
    - test_termDocMatrixFactory.py
    - test_gensimPhraseAdder.py
    - test_scatterChartExplorer.py
    - test_diachronicTermMiner.py
    - test_WhitespaceNLP.py
    - test_useFullDocAsFeature.py
    - test_semioticSquareFromAxes.py
    - test_featureLister.py
    - test_semioticSquare.py
    - test_logOddsUninformativePriorScore.py
    - test_phraseSelector.py
    - test_HTMLVisualizationAssembly.py
    - test_fourSquareAxes.py
    - test_featsFromSpacyDocAndEmpath.py
    - test_termCategoryFrequencies.py
    - test_percentile_lexicographic.py
    - test_CSRMatrixTools.py
    - test_classPercentageCompactor.py
    - test_PriorFactory.py
    - test_embeddingsResolver.py
    - test_domainCompactor.py
    - test_PMIFiltering.py
    - test_extract_emoji.py
    - test_denseRankCharacteristicness.py
    - test_scaledFScore.py
    - test_vizDataAdapter.py
    - test_unigramsFromSpacyDoc.py
    - test_ZScores.py
    - test_cohensD.py
    - test_termRanker.py
    - test_corpusFromPandas.py
    - test_word2VecFromParsedCorpus.py
    - test_ParsedCorpus.py
    - test_combineDocsIntoDomains.py
    - test_corpusFromScikit.py
    - test_compactTerms.py
    - __init__.py
    - test_HTMLSemioticSquareViz.py
    - test_scatterChart.py
    - test_FeatsFromSpacyDoc.py
    - test_BM25Difference.py
    - test_corpusFromPandasWithoutCategories.py
    - test_credTFIDF.py
    - test_termDocMatrixFromScikit.py
    - test_logOddsRatioUninformativeDirichletPrior.py
    - test_large_int_format.py
    - test_useFullDocAsMetadata.py
    - test_associationCompator.py
    - test_CornerScore.py
    - test_asiannlp.py
    - test_oneClassScatterChart.py
    - test_TermDocMat.py
    - test_relativeEntropy.py
    - test_corpusFromFeatureDict.py
    - test_termDocMatrixFromFrequencies.py
    - test_docsAndLabelsFromCorpus.py
    - test_indexStore.py
    - test_indexStoreFromList.py
  - CorpusDF.py
  - termsignificance
    - LogOddsRatioUninformativeDirichletPrior.py
    - ScaledFScoreSignificance.py
    - __init__.py
    - LogOddsRatioSmoothed.py
    - LogOddsRatioInformativeDirichletPiror.py
    - TermSignificance.py
  - SampleCorpora.py
  - WhitespaceNLP.py
  - Formatter.py
  - CLI.py
  - external
    - phrasemachine
      - phrasemachine.py
      - __init__.py
    - __init__.py
  - termranking
    - DocLengthNormalizedFrequencyRanker.py
    - OncePerDocFrequencyRanker.py
    - DocLengthDividedFrequencyRanker.py
    - __init__.py
    - AbsoluteFrequencyRanker.py
    - TermRanker.py
  - PriorFactory.py
  - Corpus.py
  - ScatterChart.py
  - TermDocMatrixFromFrequencies.py
  - CorpusFromPandas.py
  - TermDocMatrixWithoutCategories.py
  - indexstore
    - IndexStoreFromDict.py
    - IndexStoreFromList.py
    - __init__.py
    - IndexStore.py
  - TermDocMatrixFromPandas.py
  - TermDocMatrix.py
  - data
    - mfd2.0.csv
    - viz
      - time_plot.html
      - semiotic_new.html
      - pairplot.html
      - scattertext.html
      - autocomplete.css
      - pairplot_without_halo.html
      - scripts
        - range-tree.js
        - rectangle-holder.js
        - main.js
        - d3-scale-chromatic.v1.min.js
        - autocomplete_call.js
        - autocomplete_definition.js
      - semiotic.html
      - search_form.html
      - graph_plot.html
    - hamlet.txt
    - presidential_debates_2016.csv.gz
  - CSRMatrixTools.py
  - OneClassScatterChart.py
  - __init__.py
  - SampleLexicons.py
  - FeatureOuput.py
  - features
    - PhraseMachinePhrases.py
    - UseFullDocAsMetadata.py
    - FeatsFromOnlyEmpath.py
    - SpacyEntities.py
    - FeatsFromMoralFoundationsDictionary.py
    - PyTextRankPhrases.py
    - FeatsFromScoredLexicon.py
    - FeatsFromSpacyDoc.py
    - FeatsFromSpacyDocOnlyEmoji.py
    - FeatsFromSpacyDocAndEmpath.py
    - UnigramsFromSpacyDoc.py
    - FeatsFromGeneralInquirer.py
    - __init__.py
    - UseFullDocAsFeature.py
    - FeastFromSentencePiece.py
    - FeatsFromSpacyDocOnlyNounChunks.py
    - FeatsFromTopicModel.py
  - domain
    - __init__.py
    - CombineDocsIntoDomains.py
  - ParsedCorpus.py
  - ScatterChartExplorer.py
  - AutoTermSelector.py
  - termcompaction
    - CompactTerms.py
    - ClassPercentageCompactor.py
    - DomainCompactor.py
    - PhraseSelector.py
    - ScikitCompactor.py
    - __init__.py
    - AssociationCompactor.py
  - semioticsquare
    - FourSquare.py
    - FourSquareAxis.py
    - SemioticSquareFromAxes.py
    - __init__.py
    - SemioticSquare.py
  - emojis
    - EmojiExtractor.py
    - __init__.py
    - ProcessedEmojiStructure.py
  - TermCategoryFrequencies.py
  - AsianNLP.py
  - DeployedClassifier.py
- demo_four_square.py
- demo_flashtext.py
- demo_gensim_similarity.py
- demo_focused_pair_plot_movies.py
- demo_unified_context.py
- demo_pair_plot_convention_geninq.py
- demo_pair_plot_convention_empath.py
- demo_log_odds_ratio_prior.py
- demo_semiotic.py
- demo_category_frequencies.py
- demo_without_spacy.py
- ISSUE_TEMPLATE
- demo_emoji.py
- demo_chinese.py
- LICENSE
- demo_obama.py
- demo_pair_plot_movies.py
- demo_relative_entropy.py
- demo.py
- demo_beta_posterior.py
- demo_general_inquirer_frequency_plot.py
- demo_pair_plot_movies_umap.py
- demo_tsne_style.py
- demo_tfidf.py
- demo_bm25.py
- demo_japanese.py
- demo_scaled_f_score.py
- PhraseMachineLicense.txt
- .gitattributes
- demo_mann_whitney.py
- demo_pair_plot_20_newsgroups.py
- demo_alt_tokenization.py
- setup.py
- distribution.sh
- demo_multi_category_pca.py
- demo_pca_documents.py
- demo_compact.py
- demo_pair_plot_convention.py
- regendocs.sh
- demo_bow_pca.py
- demo_pytextrank.py
- demo_dense_rank.py
- demo_axis_crossbars_and_labels.py
- demo_cohens_d.py
- demo_pair_plot_movies_mirror.py
- demo_feature_importance.py
- .travis.yml
- README.md
- demo_expected_vs_actual.py
- demo_nmf_topic_model.py
- demo_include_all_contexts.py
- demo_similarity.py
- demo_compact_suppress_documents.py
- demo_characteristic_chart.py
- demo_phrase_machine.py
- demo_sentence_piece.py
- demo_empath.py
- demo_z_scores.py
- demo_word_list_topic_model.py
- demo_umap_documents.py
- demo_pair_plot_movies_phate.py
- demo_custom_topic_model.py
- demo_sparse.py
- demo_pair_plot_category_focused.py
- demo_cred_tfidf.py
- demo_hedges_r.py

# Helper classes for loading the sample corpora: the 2012 political convention speeches and the Rotten Tomatoes data sets
import bz2
import io
import json
import pkgutil
import re
import sys

from scattertext.Common import POLITICAL_DATA_URL, ROTTEN_TOMATOES_DATA_URL

if sys.version_info[0] >= 3:
	from urllib.request import urlopen
else:
	from urllib2 import urlopen

import pandas as pd


class ConventionData2012(object):
	'''
	Loader for the political convention speech sample corpus.

	The raw data is read from the ``data/political_data.json`` resource of the
	``scattertext`` package; if that cannot be read it is downloaded from
	``POLITICAL_DATA_URL`` instead.
	'''

	@staticmethod
	def _speaker_name_factory():
		'''
		Returns
		-------
		function
			Takes the raw text of a speech and returns the first speaker label
			produced by the label regex that is not an announcer/audience label.
			Returns None when no such label is found.
		'''
		# A label is a run of capitals, digits, spaces, periods or apostrophes
		# at the start of a line, followed by a colon.
		name_re = re.compile(r'.*(\n|^)(?P<name>[A-Z0-9 \.\']+):\w*.+', re.M)
		non_speakers = ('ANNOUNCER', 'AUDIENCE MEMBER', 'AUDIENCE MEMBERS')

		def speaker_name(text):
			# findall yields (line-start group, name) tuples; only the name is used.
			for _, name in name_re.findall(text):
				if name not in non_speakers:
					return name
			return None

		return speaker_name

	@staticmethod
	def _clean_function_factory():
		'''
		Returns
		-------
		function
			Takes the raw text of a speech and returns it with announcer/audience
			lines, leading "SPEAKER NAME: " labels and parenthesised text removed.
		'''
		only_speaker_text_re = re.compile(
			r'((^|\n)((ANNOUNCER|AUDIENCE MEMBERS?): .+)($|\n)|(\n|^)((([A-Z\.()\-\' ]+): ))|\(.+\) *)',
			re.M)

		# Sanity checks for the pattern above.  They are evaluated every time the
		# factory is called (and are skipped when Python runs with -O).
		exact_cases = (
			('AUDIENCE MEMBERS: (Chanting.) USA! USA! USA! USA!', ''),
			('AUDIENCE MEMBER: (Chanting.) USA! USA! USA! USA!', ''),
			('ANNOUNCER: (Chanting.) USA! USA! USA! USA!', ''),
			('TOM SMITH: (Chanting.) USA! USA! USA! USA!', 'USA! USA! USA! USA!'),
			('DONALD TRUMP: blah blah blah!', 'blah blah blah!'),
			('HILLARY CLINTON: (something parenthetical) blah blah blah!', 'blah blah blah!'),
		)
		for raw, expected in exact_cases:
			assert only_speaker_text_re.sub('', raw) == expected
		multi_speaker = 'ANNOUNCER: (Chanting.) USA! USA! USA! USA!\nTOM SMITH: (Chanting.) ONLY INCLUDE THIS! ONLY KEEP THIS! \nAUDIENCE MEMBER: (Chanting.) USA! USA! USA! USA!'
		assert only_speaker_text_re.sub('', multi_speaker).strip() == 'ONLY INCLUDE THIS! ONLY KEEP THIS!'

		def clean_document(text):
			return only_speaker_text_re.sub('', text)

		return clean_document

	@staticmethod
	def _convention_speech_iter():
		'''
		Returns
		-------
		object
			The parsed JSON document.  It is iterated by
			``_iter_party_speech_pairs``, which reads the 'name' and 'speeches'
			keys of each element.
		'''
		try:
			data_stream = pkgutil.get_data('scattertext', 'data/political_data.json').decode('utf-8')
		except Exception:
			# Covers a missing resource as well as get_data() returning None
			# (AttributeError on .decode).  Deliberately not a bare ``except`` so
			# that KeyboardInterrupt/SystemExit are not turned into a download.
			data_stream = urlopen(POLITICAL_DATA_URL).read().decode('utf-8')
		return json.loads(data_stream)

	@staticmethod
	def _iter_party_speech_pairs():
		'''
		Yields
		------
		tuple
			(party, speech) for every entry in every element's 'speeches' list;
			the party is taken from the element's 'name' field.
		'''
		for speaker_obj in ConventionData2012._convention_speech_iter():
			political_party = speaker_obj['name']
			for speech in speaker_obj['speeches']:
				yield political_party, speech

	@staticmethod
	def get_data():
		'''
		Returns
		-------
		pd.DataFrame
			One row per speech whose cleaned text is non-empty, with the columns
			'party', 'text' (cleaned speech) and 'speaker'.  'speaker' is None
			for speeches in which no speaker label was found.
		'''
		clean = ConventionData2012._clean_function_factory()
		get_speaker_name = ConventionData2012._speaker_name_factory()
		data = []
		for party, speech in ConventionData2012._iter_party_speech_pairs():
			cleaned_speech = clean(speech)
			speaker_name = get_speaker_name(speech)
			# Truthiness already rules out the empty string for the text.
			if cleaned_speech and speaker_name != '':
				data.append({'party': party,
				             'text': cleaned_speech,
				             'speaker': speaker_name})
		return pd.DataFrame(data)


class RottenTomatoes(object):
	'''
	Derived from the sentiment polarity/subjectivity datasets from
	http://www.cs.cornell.edu/people/pabo/movie-review-data/

	Bo Pang and Lillian Lee. ``A Sentimental Education: Sentiment Analysis Using Subjectivity
	 Summarization Based on Minimum Cuts'', Proceedings of the ACL, 2004.
	'''

	@staticmethod
	def _load_bz2_csv(resource_name, fallback_url):
		'''
		Read a bz2-compressed CSV into a data frame.

		Parameters
		----------
		resource_name : str
			Path of the resource inside the ``scattertext`` package.
		fallback_url : str
			URL the compressed bytes are downloaded from when the packaged
			resource cannot be read.

		Returns
		-------
		pd.DataFrame
		'''
		try:
			data_stream = pkgutil.get_data('scattertext', resource_name)
		except Exception:
			# Not a bare ``except``: KeyboardInterrupt/SystemExit must propagate
			# instead of triggering a download.
			data_stream = None
		# pkgutil.get_data may also signal failure by returning None rather than
		# raising; treat that the same way as an unreadable resource.
		if data_stream is None:
			data_stream = urlopen(fallback_url).read()
		return pd.read_csv(io.BytesIO(bz2.decompress(data_stream)))

	@staticmethod
	def get_data():
		'''
		Returns
		-------
		pd.DataFrame

		I.e.,
		>>> RottenTomatoes.get_data().iloc[0]
		category                                                    plot
		filename                 subjectivity_html/obj/2002/Abandon.html
		text           A senior at an elite college (Katie Holmes), a...
		movie_name                                               abandon
		'''
		return RottenTomatoes._load_bz2_csv('data/rotten_tomatoes_corpus.csv.bz2',
		                                    ROTTEN_TOMATOES_DATA_URL)

	@staticmethod
	def get_full_data():
		'''
		Returns all plots and reviews, not just the ones that appear in movies with both plot descriptions and reviews.

		Returns
		-------
		pd.DataFrame

		I.e.,
		>>> RottenTomatoes.get_full_data().iloc[0]
		category                                                             plot
		text                    Vijay Singh Rajput (Amitabh Bachchan) is a qui...
		movie_name                                                        aankhen
		has_plot_and_reviews                                                False
		Name: 0, dtype: object
		'''
		# NOTE(review): the download fallback uses the same URL as get_data();
		# confirm that it serves the full corpus rather than the reduced one.
		return RottenTomatoes._load_bz2_csv('data/rotten_tomatoes_corpus_full.csv.bz2',
		                                    ROTTEN_TOMATOES_DATA_URL)