import logging import matplotlib import matplotlib.pyplot as plt from PIL import Image import re import requests import StringIO import app_config from plugins.base import CarebotPlugin from util.analytics import GoogleAnalytics from util.chart import ChartTools from util.models import Story from util.s3 import Uploader s3 = Uploader() logging.basicConfig() logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) class JqueryScrolldepth(CarebotPlugin): """ Get scroll depth stats on Jquery Scroll Depth Stories TODO: refactor to get working with their stats """ SLUG_SEARCH_REGEX = re.compile(ur'slug ((\w*-*)+)') def get_listeners(self): """ Associate regular expression matches to the appropriate handler """ return [ ['depth', self.SLUG_SEARCH_REGEX, self.handle_slug_inquiry], # ['linger-url', self.GRUBER_URLINTEXT_PAT, self.handle_url_inquiry], ] """ TODO @staticmethod def sort_scroll_depth(data): """ A helper function for sorting the results generated by the Scroll Depth library. Source: http://stackoverflow.com/a/12814719 """ source = ('Baseline', '25%', '50%', '75%', '100%',) return sorted(data, key=lambda x: source.index(x[0])) def process_scroll_depth_data(self, data): rows = [] for row in data: row[0] = row[0] # Percent depth on page row[1] = int(row[1]) # Total users row[2] = int(row[2]) # Total number of users that reached depth rows.append(row) # Sorts the data returned by Scroll Depth rows = GoogleAnalyticsScraper.sort_scroll_depth(rows) # Calculate the percentage of users total_engaged = rows[0][1] # 100% of the users see bucket 1. for row in rows: pct = round((row[1] / total_engaged) * 100) row.append(int(pct)) truncated = rows[:10] return truncated """ def get_slug_query_params(self, team, slug=None): """ Given a slug, get parameters needed to query google analytics for the scroll depth based on http://scrolldepth.parsnip.io/ event data """ filters = [ 'ga:pagePath=={}'.format(slug), 'ga:eventCategory==Scroll Depth', 'ga:eventAction==Percentage', ] filters = ';'.join(filters) params = { 'ids': 'ga:{0}'.format(app_config.GA_ORGANIZATION_ID), 'start-date': '90daysAgo', # start_date.strftime('%Y-%m-%d'), 'end-date': 'today', 'metrics': 'ga:users,ga:eventValue', 'dimensions': 'ga:eventLabel', 'filters': filters, 'max-results': app_config.GA_RESULT_SIZE, 'samplingLevel': app_config.GA_SAMPLING_LEVEL, 'start-index': 1, } return params def fill_in_max(self, data): """ Sometime people start at 20, 30, 40% of the article read because their screens are lare or the article is short. fill_in_max finds the starting bucket with the largest number of people and fills in all previous buckets with that count.that That way we get an accurate count of how many people read the top of the article. """ max_people = max(data, key=lambda item:item[1])[1] for row in data: if row[1] == max_people: break row[1] = max_people # Calculate the percentage of users for row in data: pct = round((row[1] / float(max_people)) * 100) row.append(int(pct)) return data def clean_data(self, data): """ Fix data types, truncate the data, and otherwise make it fit for consumption. """ rows = [] for row in data: row[0] = int(row[0]) # Percent depth on page row[1] = int(row[1]) # Total users row[2] = int(row[2]) # Seconds on page rows.append(row) # Sort the row data from 10% => 100% rows.sort(key=lambda tup: tup[0]) rows = self.fill_in_max(rows) # Only take the first 10 rows. truncated = rows[:10] return truncated def get_median(self, data): """ Take the scroll depth data we have (number of people per percent) Then calculate how many people only got to THAT bucket (aka didn't get to the next percent bucket) """ length = len(data) for i, row in enumerate(data): if not i == length - 1: row[1] = row[1] - data[i + 1][1] lst = [] # Flatten the [percent, count] tuples # This is a really inefficient way to do this! for bucket in data: for _ in range(bucket[1]): lst.append(bucket[0]) median = GoogleAnalytics.median(lst) return int(median) def get_total_people(self, data): """ Find the tuple with the max number of people. """ return max(data, key=lambda item:item[1])[1] def get_chart(self, rows, median=None, labels=['100%', '90%', '80%', '70%', '60%', '50%', '40%', '30%', '20%', '10%']): """ Create a scroll depth histogram """ r = range(1, len(rows) + 1) data = [] # Rows are drawn "upside down" so we need to reverse them: rows.reverse() for row in rows: data.append(row[3]) # Set the chart size plt.figure(figsize=(2,4), dpi=100) # Remove the plot frame lines. They are unnecessary chartjunk. ax = plt.subplot(1, 1, 1) ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) ax.spines["bottom"].set_visible(False) ax.spines["left"].set_visible(False) # Ensure that the axis ticks only show up on the bottom and left of the plot. # Ticks on the right and top of the plot are generally unnecessary chartjunk. ax.get_xaxis().tick_bottom() ax.get_yaxis().tick_left() # Configure x-axis ticks plt.xlim(0, 100) ax.tick_params(axis='x', colors='#b8b8b8', labelsize=8, labelbottom='off') plt.axes().xaxis.set_ticks_position('none') # Configure y-axis ticks plt.axes().yaxis.set_ticks_position('none') ax.tick_params(axis='y', colors='#b8b8b8', labelsize=7) ax.yaxis.label.set_fontsize(10) plt.yticks(r, labels) chart = plt.barh(r, data, align="center") # TODO: Set colors in one sweep # Broken out for now in case we want to play with hihlighting segments for index, value in enumerate(data): chart[index].set_color('#4b7ef0') # TODO: Median line # for bar in chart: # width = bar.get_width() # print width # print bar.get_y() # if bar.get_y() == 1.6: # print # ax.text( # bar.get_y() + bar.get_height()/2., # 1.05 * width, # "MED", # ha='center', # va='bottom', # color='#b8b8b8', # fontsize=8 # ) plt.savefig('tmp.png', bbox_inches='tight') f = open('tmp.png', 'rb') url = s3.upload(f) return url def get_slug_message(self, slug, story=None): # Try to match the story to a slug to accurately get a team # The Google Analytics property ID comes from the team config # We use the default team if none is found stories = Story.select().where(Story.slug.contains(slug)) team = self.config.get_team_for_stories(stories) params = self.get_slug_query_params(team=team, slug=slug) data = GoogleAnalytics.query_ga(params) if not data.get('rows'): logger.info('No rows found for slug %s' % slug) return # Clean up the data clean_data = self.clean_data(data.get('rows')) total_people = self.get_total_people(clean_data) friendly_people = "{:,}".format(total_people) # Comma-separated #s median = self.get_median(clean_data) # Set up the chart scroll_histogram_url = self.get_chart(clean_data) if story: scroll_histogram_url = ChartTools.add_screenshot_to_chart(story, scroll_histogram_url) # TODO: Not confident in median calculations so far # text = "*%s people* got a median of *%s percent* down the page." % (friendly_people, median) text = '' attachments = [{ "fallback": slug + " update", "color": "#eeeeee", "title": "How far down did people scroll?", "image_url": scroll_histogram_url }] return { 'text': text, 'attachments': attachments } def handle_slug_inquiry(self, message): """ Respond to an inquiry about the slug with stats and charts """ match = re.search(self.SLUG_SEARCH_REGEX, message.body['text']) slug = match.group(1) if slug: return self.get_slug_message(slug) def get_update_message(self, story): , data, time_bucket channel = story.channel() message = ("%s hours in and here's what I know about the graphics on _%s_:") % ( self.hours_since(story.date), story.name, ) # remove `Baseline` from data data = data[1:] histogram_url = ChartTools.scroll_histogram_link(data, labels=( '100%', '75%', '50%', '25%',)) if story.screenshot: histogram_url = ChartTools.add_screenshot_to_chart( story.screenshot, histogram_url) fields = [] for row in data: fields.append({ "title": row[0], "value": row[2], "short": True, }) attachments = [ { "fallback": story.name + " update", "color": "#eeeeee", "title": story.name, "title_link": story.url, "fields": fields, "image_url": histogram_url, } ] self.slack.chat.post_message( channel, message, as_user=True, parse='full', attachments=attachments)