# Helper functions for loading the political convention and Rotten Tomatoes
# data sets.
import bz2
import io
import json
import pkgutil
import re
import sys
from contextlib import closing

import pandas as pd

from scattertext.Common import POLITICAL_DATA_URL, ROTTEN_TOMATOES_DATA_URL

if sys.version_info[0] >= 3:
    from urllib.request import urlopen
else:
    from urllib2 import urlopen


def _read_package_data_or_url(resource_path, fallback_url):
    '''
    Read a resource bundled with the ``scattertext`` package, downloading it
    from ``fallback_url`` when the bundled copy cannot be read.

    Parameters
    ----------
    resource_path : str
        Path of the resource relative to the ``scattertext`` package.
    fallback_url : str
        URL fetched when the packaged resource is unavailable.

    Returns
    -------
    bytes
        Raw, undecoded contents of the resource.
    '''
    try:
        payload = pkgutil.get_data('scattertext', resource_path)
    except Exception:
        # Any failure to read the packaged copy means "go to the network";
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        payload = None
    if payload is None:
        # pkgutil.get_data signals "not available" by returning None as well
        # as by raising, so both cases end up here.
        # closing() releases the connection and works with Python 2's urllib2.
        with closing(urlopen(fallback_url)) as response:
            payload = response.read()
    return payload


class ConventionData2012(object):
    '''
    Loader for the political convention speech corpus.
    '''

    @staticmethod
    def _speaker_name_factory():
        '''
        Build a function that extracts the speaker's name from a transcript.

        Returns
        -------
        function
            ``speaker_name(text)`` returns the first ``NAME:`` prefix (upper
            case letters, digits, spaces, periods and apostrophes) that is not
            the announcer or the audience, or None when there is none.
        '''
        name_re = re.compile(r'.*(\n|^)(?P<name>[A-Z0-9 \.\']+):\w*.+', re.M)
        non_speakers = ('ANNOUNCER', 'AUDIENCE MEMBER', 'AUDIENCE MEMBERS')

        def speaker_name(text):
            # findall yields (line-start group, name) tuples.
            for _, name in name_re.findall(text):
                if name not in non_speakers:
                    return name
            return None

        return speaker_name

    @staticmethod
    def _clean_function_factory():
        '''
        Build a function that strips everything but the speaker's own words.

        The pattern removes whole lines attributed to the announcer or the
        audience, ``NAME: `` prefixes at the start of a line, and
        parenthetical remarks such as ``(Chanting.)``.  The assertions below
        are sanity checks of the pattern and run each time the factory is
        called.

        Returns
        -------
        function
            ``clean_document(text)`` returns ``text`` with those parts removed.
        '''
        only_speaker_text_re = re.compile(
            r'((^|\n)((ANNOUNCER|AUDIENCE MEMBERS?): .+)($|\n)|(\n|^)((([A-Z\.()\-\' ]+): ))|\(.+\) *)',
            re.M)
        assert only_speaker_text_re.sub('', 'AUDIENCE MEMBERS: (Chanting.) USA! USA! USA! USA!') == ''
        assert only_speaker_text_re.sub('', 'AUDIENCE MEMBER: (Chanting.) USA! USA! USA! USA!') == ''
        assert only_speaker_text_re.sub('', 'ANNOUNCER: (Chanting.) USA! USA! USA! USA!') == ''
        assert only_speaker_text_re.sub('', 'TOM SMITH: (Chanting.) USA! USA! USA! USA!') == 'USA! USA! USA! USA!'
        assert only_speaker_text_re.sub('', 'DONALD TRUMP: blah blah blah!') == 'blah blah blah!'
        assert only_speaker_text_re.sub('', 'HILLARY CLINTON: (something parenthetical) blah blah blah!') == 'blah blah blah!'
        assert only_speaker_text_re.sub \
                   ('',
                    'ANNOUNCER: (Chanting.) USA! USA! USA! USA!\nTOM SMITH: (Chanting.) ONLY INCLUDE THIS! ONLY KEEP THIS! \nAUDIENCE MEMBER: (Chanting.) USA! USA! USA! USA!').strip() \
               == 'ONLY INCLUDE THIS! ONLY KEEP THIS!'

        def clean_document(text):
            return only_speaker_text_re.sub('', text)

        return clean_document

    @staticmethod
    def _convention_speech_iter():
        '''
        Load the parsed convention JSON, preferring the packaged copy and
        falling back to ``POLITICAL_DATA_URL``.

        Returns
        -------
        object
            Whatever ``json.loads`` produces for the file; callers iterate it
            and read the ``'name'`` and ``'speeches'`` keys of each element.
        '''
        raw_bytes = _read_package_data_or_url('data/political_data.json',
                                              POLITICAL_DATA_URL)
        return json.loads(raw_bytes.decode('utf-8'))

    @staticmethod
    def _iter_party_speech_pairs():
        '''
        Yield one ``(political_party, speech)`` pair per speech.

        The ``'name'`` field of each top-level JSON object is used as the
        political party of all speeches listed under it.
        '''
        for speaker_obj in ConventionData2012._convention_speech_iter():
            political_party = speaker_obj['name']
            for speech in speaker_obj['speeches']:
                yield political_party, speech

    @staticmethod
    def get_data():
        '''
        Returns
        -------
        pd.DataFrame
            One row per speech with non-empty cleaned text, with columns
            ``party``, ``text`` (cleaned speech) and ``speaker``.
        '''
        clean = ConventionData2012._clean_function_factory()
        get_speaker_name = ConventionData2012._speaker_name_factory()
        data = []
        for party, speech in ConventionData2012._iter_party_speech_pairs():
            cleaned_speech = clean(speech)
            speaker_name = get_speaker_name(speech)
            # NOTE(review): the speaker lookup returns None (never '') when no
            # speaker is found, so the second test does not reject such rows
            # and they are kept with speaker=None.  Behaviour left as-is;
            # confirm whether those rows were meant to be dropped.
            if cleaned_speech and speaker_name != '':
                data.append({'party': party,
                             'text': cleaned_speech,
                             'speaker': speaker_name})
        return pd.DataFrame(data)


class RottenTomatoes(object):
    '''
    Derived from the sentiment polarity/subjectivity datasets from
    http://www.cs.cornell.edu/people/pabo/movie-review-data/

    Bo Pang and Lillian Lee. ``A Sentimental Education: Sentiment Analysis
    Using Subjectivity Summarization Based on Minimum Cuts'', Proceedings of
    the ACL, 2004.
    '''

    @staticmethod
    def get_data():
        '''
        Load the packaged corpus, falling back to ``ROTTEN_TOMATOES_DATA_URL``.

        Returns
        -------
        pd.DataFrame

        I.e.,
        >>> rotten_df = RottenTomatoes.get_data()
        >>> rotten_df.iloc[0]
        category                                                   plot
        filename               subjectivity_html/obj/2002/Abandon.html
        text          A senior at an elite college (Katie Holmes), a...
        movie_name                                              abandon
        '''
        compressed = _read_package_data_or_url(
            'data/rotten_tomatoes_corpus.csv.bz2',
            ROTTEN_TOMATOES_DATA_URL)
        return pd.read_csv(io.BytesIO(bz2.decompress(compressed)))

    @staticmethod
    def get_full_data():
        '''
        Returns all plots and reviews, not just the ones that appear in movies
        with both plot descriptions and reviews.

        Returns
        -------
        pd.DataFrame

        I.e.,
        >>> rotten_full_df = RottenTomatoes.get_full_data()
        >>> rotten_full_df.iloc[0]
        category                                                             plot
        text                    Vijay Singh Rajput (Amitabh Bachchan) is a qui...
        movie_name                                                        aankhen
        has_plot_and_reviews                                                False
        Name: 0, dtype: object
        '''
        # NOTE(review): the fallback URL is the same one get_data() uses, so a
        # download may not return the *full* corpus -- confirm whether a
        # dedicated URL for the full data set should be used here.
        compressed = _read_package_data_or_url(
            'data/rotten_tomatoes_corpus_full.csv.bz2',
            ROTTEN_TOMATOES_DATA_URL)
        return pd.read_csv(io.BytesIO(bz2.decompress(compressed)))