import praw
import inspect, os, sys # set the BASE_DIR
import simplejson as json
import datetime, yaml, time, csv, pytz
import reddit.connection
import reddit.praw_utils as praw_utils
import reddit.queries
import sqlalchemy
from collections import defaultdict
from dateutil import parser
from utils.common import *
from app.models import Base, SubredditPage, Subreddit, Post, ModAction, PrawKey, Comment
from app.models import Experiment, ExperimentThing, ExperimentAction, ExperimentThingSnapshot
from app.models import EventHook
from sqlalchemy import and_, or_, desc, asc
from app.controllers.subreddit_controller import SubredditPageController

### LOAD ENVIRONMENT VARIABLES
BASE_DIR = os.path.join(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))), "..", "..")
ENV = os.environ['CS_ENV']


class StylesheetExperimentController:
    """Runs a stylesheet experiment on one subreddit.

    The controller loads the experiment settings for the current ``CS_ENV``
    from ``config/experiments/<experiment_name>.yml``, persists them in an
    ``Experiment`` row, applies one randomized CSS line ("arm") to the
    subreddit stylesheet per intervention, and records score/report
    snapshots of comments on posts made since an intervention.
    """

    ## Keys that must be present in the experiment settings unless the
    ## caller supplies its own list.
    DEFAULT_REQUIRED_KEYS = (
        'subreddit', 'subreddit_id', 'username',
        'start_time', 'end_time', 'conditions',
        'intervention_interval_seconds',
        'intervention_window_seconds',
        'first_n_comments', 'comment_snapshot_period_seconds',
    )

    ## If more than this many seconds (36 hours) have passed since the
    ## previous action, an intervention is allowed regardless of the window.
    MAX_SECONDS_BETWEEN_INTERVENTIONS = 129600

    def __init__(self, experiment_name, db_session, r, log, required_keys=None):
        """Store the collaborators and load (or create) the experiment.

        Args:
            experiment_name: basename (without ``.yml``) of the config file
                and the value of ``Experiment.name``.
            db_session: SQLAlchemy session used for all queries and commits.
            r: reddit client; this class calls ``get_stylesheet``,
                ``set_stylesheet`` and ``get_info`` on it (presumably a
                ``praw.Reddit`` instance -- confirm with the caller).
            log: logger exposing ``info`` and ``error``.
            required_keys: settings keys that must exist in the config;
                defaults to ``DEFAULT_REQUIRED_KEYS``.
        """
        if required_keys is None:
            required_keys = list(self.DEFAULT_REQUIRED_KEYS)
        self.db_session = db_session
        self.log = log
        self.r = r
        self.load_experiment_config(required_keys, experiment_name)

    def get_experiment_config(self, required_keys, experiment_name):
        """Read and validate the settings for the current environment.

        Logs an error and exits the process with status 1 when the YAML
        cannot be parsed, has no section for ``ENV``, or lacks one of
        ``required_keys``.

        Returns:
            The settings dict found under the ``ENV`` key of the YAML file.
        """
        experiment_file_path = os.path.join(BASE_DIR, "config", "experiments", experiment_name) + ".yml"
        with open(experiment_file_path, 'r') as f:
            try:
                experiment_config_all = yaml.full_load(f)
            except yaml.YAMLError as exc:
                ## The exception text goes inside the formatted message: the
                ## message has no placeholder for an extra positional argument.
                self.log.error("{0}: Failure loading experiment yaml {1}: {2}".format(
                    self.__class__.__name__, experiment_file_path, str(exc)))
                sys.exit(1)

        ## An empty YAML document loads as None, so check that first.
        if not experiment_config_all or ENV not in experiment_config_all:
            self.log.error("{0}: Cannot find experiment settings for {1} in {2}".format(
                self.__class__.__name__, ENV, experiment_file_path))
            sys.exit(1)

        experiment_config = experiment_config_all[ENV]
        for key in required_keys:
            if key not in experiment_config:
                self.log.error("{0}: Value missing from {1}: {2}".format(
                    self.__class__.__name__, experiment_file_path, key))
                sys.exit(1)
        return experiment_config

    def load_experiment_config(self, required_keys, experiment_name):
        """Load the experiment row, creating it on first use.

        When no ``Experiment`` named ``experiment_name`` exists, each
        condition's ``randomizations`` value (a CSV path relative to
        ``config/experiments``) is replaced by the list of CSV rows, and the
        resulting settings are stored as JSON in a new ``Experiment`` row.
        Afterwards the instance attributes used by the other methods are set.
        """
        experiment_config = self.get_experiment_config(required_keys, experiment_name)
        experiment = self.db_session.query(Experiment).filter(
            Experiment.name == experiment_name).first()

        if experiment is None:
            ## LOAD RANDOMIZED CONDITIONS (see CivilServant-Analysis)
            for condition in experiment_config['conditions'].values():
                randomizations_path = os.path.join(
                    BASE_DIR, "config", "experiments", condition['randomizations'])
                with open(randomizations_path, "r") as f:
                    ## Assigned after reading, so an empty CSV gives an empty
                    ## list rather than leaving the file name in place.
                    condition['randomizations'] = list(csv.DictReader(f))

            experiment = Experiment(
                name = experiment_name,
                controller = self.__class__.__name__,
                start_time = parser.parse(experiment_config['start_time']),
                end_time = parser.parse(experiment_config['end_time']),
                settings_json = json.dumps(experiment_config)
            )
            self.db_session.add(experiment)
            self.db_session.commit()

        ### SET UP INSTANCE PROPERTIES
        self.experiment = experiment
        ## The stored JSON (not the YAML) is the source of truth for settings.
        self.experiment_settings = json.loads(self.experiment.settings_json)
        self.experiment_name = experiment_name
        self.subreddit = experiment_config['subreddit']
        self.subreddit_id = experiment_config['subreddit_id']
        self.username = experiment_config['username']

    def update_experiment(self):
        """Apply the next intervention if the experiment is currently eligible."""
        ## DETERMINE ELIGIBILITY
        if not self.determine_intervention_eligible():
            return
        condname = self.select_condition()
        if condname is None:
            ## No condition lists today's weekday; there is nothing to look up.
            self.log.error("{0}: Experiment {1}: No condition is configured for the current weekday. No intervention applied.".format(
                self.__class__.__name__, self.experiment.name))
            return
        self.run_intervention(condname)

    def determine_intervention_eligible(self):
        """Return True when an intervention may be applied now.

        The current UTC time must lie strictly between the configured start
        and end times. If a previous ``ExperimentAction`` exists for this
        experiment, the time since the most recent one must additionally be
        within ``intervention_interval_seconds`` +/-
        ``intervention_window_seconds``, or exceed
        ``MAX_SECONDS_BETWEEN_INTERVENTIONS``.
        """
        settings = self.experiment_settings
        current_time = datetime.datetime.utcnow().replace(tzinfo=pytz.utc)
        start_time = parser.parse(settings['start_time'])
        end_time = parser.parse(settings['end_time'])

        eligible = start_time < current_time < end_time
        last_experiment_created = None

        if eligible:
            last_experiment_action = self.db_session.query(ExperimentAction).filter(
                ExperimentAction.experiment_id == self.experiment.id).order_by(
                    desc(ExperimentAction.created_at)).first()

            ## keep eligible True if there's no previous action
            ## if there is a previous action, check that it falls within
            ## the interval
            if last_experiment_action:
                last_experiment_created = last_experiment_action.created_at.replace(tzinfo=pytz.utc)
                interval_since_last_action = (current_time - last_experiment_created).total_seconds()
                interval = settings['intervention_interval_seconds']
                window = settings['intervention_window_seconds']
                eligible = (interval - window) <= interval_since_last_action < (interval + window)

                ## if more than 36 hours have elapsed, then we're eligible
                if interval_since_last_action > self.MAX_SECONDS_BETWEEN_INTERVENTIONS:
                    eligible = True

        if not eligible:
            self.log.info("{0}: Experiment {1} code run. Ineligible to Continue. Begin Time: {2}. End Time: {3}. Last Intervention: {4} ".format(
                self.__class__.__name__,
                self.experiment.name,
                start_time,
                end_time,
                last_experiment_created
            ))
        return eligible

    ### TWO CONDITIONS: NORMAL AND SPECIAL
    ### MONDAYS FRIDAYS SATURDAYS
    def select_condition(self, current_time=None):
        """Return the name of the condition whose ``days`` include today.

        Args:
            current_time: datetime whose weekday is used; defaults to the
                UTC time at the moment of the call.

        Returns:
            The first condition name whose comma-separated ``days`` setting
            contains the English weekday name, or None if none matches.
        """
        ## Resolve the default per call; a default expression in the
        ## signature would be evaluated only once, when the class is defined.
        if current_time is None:
            current_time = datetime.datetime.utcnow()

        isoweekdays = {7: "Sunday", 1: "Monday", 2: "Tuesday", 3: "Wednesday",
                       4: "Thursday", 5: "Friday", 6: "Saturday"}
        isoweekday = isoweekdays[current_time.date().isoweekday()]

        ## weekdays in the configuration file conditions should be exclusive
        for condition_name, values in self.experiment_settings['conditions'].items():
            configured_days = [day.strip() for day in values['days'].split(",")]
            if isoweekday in configured_days:
                return condition_name
        return None

    def run_intervention(self, condname):
        """Consume the next randomization of ``condname`` and apply its arm.

        The condition's ``next_randomization`` counter is advanced and
        committed before the arm-specific ``intervene_<condname>_<arm>``
        method is dispatched.

        Returns:
            False when no further randomization can be read for the
            condition; otherwise whatever the dispatched method returns.
        """
        cond = self.experiment_settings['conditions'][condname]
        try:
            randomization = cond['randomizations'][cond['next_randomization']]
            arm = "arm_" + randomization['treatment']
        except (IndexError, KeyError, TypeError):
            ## The arm is unknown here, so it cannot be part of the message.
            self.log.error("{0}: Experiment {1} condition {2} has used its full stock of {3} randomizations. Cannot assign any further.".format(
                self.__class__.__name__,
                self.experiment.name,
                condname,
                len(cond.get('randomizations', []))
            ))
            return False

        cond['next_randomization'] += 1
        self.experiment.settings_json = json.dumps(self.experiment_settings)
        self.db_session.commit()
        self.log.info("{0}: Experiment {1}: assigned condition {2} arm {3}".format(
            self.__class__.__name__, self.experiment.name, condname, arm))

        intervene = getattr(self, "intervene_" + condname + "_" + arm)
        return intervene(condname)

    def _rollback_randomization(self, condition):
        """Give back the randomization consumed for a failed intervention."""
        self.experiment_settings['conditions'][condition]['next_randomization'] -= 1
        self.experiment.settings_json = json.dumps(self.experiment_settings)
        self.db_session.commit()

    def set_stylesheet(self, condition, arm):
        """Write the CSS line of ``arm`` into the subreddit stylesheet.

        Any stylesheet line equal to one of the condition's arm lines is
        replaced by the chosen arm's line; if none is present the line is
        appended under a marker comment. On success an "Intervention"
        ``ExperimentAction`` is stored. On failure the error is logged and
        the consumed randomization is rolled back.

        Returns:
            The stylesheet text that was submitted (returned on failure too,
            to help with testing), or None when the current stylesheet could
            not be read.
        """
        arms = self.experiment_settings['conditions'][condition]['arms']
        intervention_line = arms[arm]

        stylesheet_data = self.r.get_stylesheet(self.subreddit)
        if "stylesheet" not in stylesheet_data:
            ## Without the current stylesheet there is nothing to rewrite.
            self.log.error("{0}: Experiment {1}: Failed to apply Arm {2} of Condition {3} in {4}. No stylesheet found in reddit response.".format(
                self.__class__.__name__, self.experiment.id, arm, condition, self.subreddit))
            self._rollback_randomization(condition)
            return None

        study_lines = list(arms.values())
        found_code = False
        line_list = []
        for line in stylesheet_data['stylesheet'].split("\n"):
            ## IF A LINE FROM THE STUDY IS FOUND,
            ## REPLACE IT WITH THE INTERVENTION
            if line in study_lines:
                line = intervention_line
                found_code = True
            line_list.append(line)

        ## IF THE CODE IS NOT FOUND, ADD IT TO THE END
        if not found_code:
            line_list.extend([
                "/* CivilServantBot Experiment CSS */",
                intervention_line,
                "",
            ])

        new_stylesheet = "\n".join(line_list)
        result = self.r.set_stylesheet(self.subreddit, new_stylesheet)

        ## Success requires an 'errors' entry that is present and empty.
        if 'errors' in result and len(result['errors']) == 0:
            self.log.info("{0}: Experiment {1}: Applied Arm {3} of Condition {4} in {2}".format(
                self.__class__.__name__, self.experiment.id, self.subreddit, arm, condition))
            experiment_action = ExperimentAction(
                experiment_id = self.experiment.id,
                praw_key_id = PrawKey.get_praw_id(ENV, self.experiment_name),
                action = "Intervention",
                action_object_type = ThingType.STYLESHEET.value,
                action_object_id = None,
                metadata_json = json.dumps({"arm": arm, "condition": condition})
            )
            self.db_session.add(experiment_action)
            self.db_session.commit()
        else:
            if 'errors' in result:
                ## Errors may not be plain strings, so stringify each one.
                error_text = ", ".join(str(error) for error in result['errors'])
            else:
                error_text = "no 'errors' field in response"
            self.log.error("{0}: Experiment {1}: Failed to apply Arm {2} of Condition {3} in {4}. Reddit errors: {5}".format(
                self.__class__.__name__, self.experiment.id, arm, condition,
                self.subreddit, error_text))
            ## No ExperimentAction is recorded for a failed attempt.
            ## IF WE FAILED TO APPLY THE INTERVENTION, ROLL BACK THAT RANDOMIZATION
            self._rollback_randomization(condition)

        ## TO HELP WITH TESTING, RETURN THE FULL TEXT OF THE STYLESHEET
        return new_stylesheet

    ## The four methods below are looked up by name in run_intervention
    ## ("intervene_" + condition name + "_" + arm).
    def intervene_normal_arm_0(self, condname):
        """Apply arm_0 of the given condition."""
        return self.set_stylesheet(condname, "arm_0")

    def intervene_normal_arm_1(self, condname):
        """Apply arm_1 of the given condition."""
        return self.set_stylesheet(condname, "arm_1")

    def intervene_special_arm_0(self, condname):
        """Apply arm_0 of the given condition."""
        return self.set_stylesheet(condname, "arm_0")

    def intervene_special_arm_1(self, condname):
        """Apply arm_1 of the given condition."""
        return self.set_stylesheet(condname, "arm_1")

    ######################
    ## COMMENT SNAPSHOTS
    ######################

    ## THIS HIGH LEVEL METHOD TAKES A SNAPSHOT OF COMMENTS THAT NEED SAMPLING
    def archive_experiment_submission_metadata(self):
        """Find eligible posts, sample their comments and snapshot them."""
        posts = self.identify_posts_that_need_snapshotting()
        comments = self.sample_comments(posts)
        self.observe_comment_snapshots(comments)

    ## IDENTIFY POSTS AND ALSO CREATE AN EXPERIMENT_THING
    ## FOR POSTS THAT DON'T YET HAVE ONE
    def identify_posts_that_need_snapshotting(self):
        """Return posts to monitor, creating missing ExperimentThing rows.

        Posts qualify when they belong to the experiment's subreddit and
        were created at or after the reference intervention. Each qualifying
        post without an ``ExperimentThing`` gets one carrying the reference
        intervention's ``metadata_json``.

        Returns:
            The list of qualifying ``Post`` rows; empty when no
            "Intervention" action exists yet.
        """
        ## NOTE(review): the ordering is ascending, so this selects the
        ## OLDEST intervention even though the name says "last". Confirm
        ## whether desc(ExperimentAction.created_at) was intended; posts
        ## created under later interventions currently inherit the first
        ## intervention's metadata.
        last_action = self.db_session.query(ExperimentAction).filter(
            ExperimentAction.experiment_id == self.experiment.id,
            ExperimentAction.action == "Intervention").order_by(
                ExperimentAction.created_at
            ).first()

        if last_action is None:
            return []

        eligible_posts = self.db_session.query(Post).filter(
            Post.created_at >= last_action.created_at,
            Post.subreddit_id == self.experiment_settings['subreddit_id']).all()

        # find posts that are unpaired
        added_experiment_things = 0

        ## in the future,
        # use the post prefix for ExperimentThing indices because
        # the ExperimentThing table includes other object types
        # and there might be collisions
        #post_prefix = "t3_"
        if eligible_posts:  # skip the query rather than issue an empty IN ()
            unpaired_posts = self.db_session.query(Post).outerjoin(
                ExperimentThing, Post.id == ExperimentThing.id).filter(
                    ExperimentThing.id == None,  # SQL "IS NULL"
                    Post.id.in_([x.id for x in eligible_posts])).all()
            for post in unpaired_posts:
                et = ExperimentThing(
                    id = post.id,
                    object_type = ThingType.SUBMISSION.value,
                    experiment_id = self.experiment.id,
                    object_created = post.created,
                    metadata_json = last_action.metadata_json
                )
                self.db_session.add(et)
                added_experiment_things += 1

        self.db_session.commit()
        self.log.info("{0}: Experiment {1}: Added {2} posts for comment monitoring in r/{3}.".format(
            self.__class__.__name__, self.experiment.id, added_experiment_things, self.subreddit))
        return eligible_posts

    ## THIS METHOD CHOOSES COMMENTS TO SAMPLE
    ## AND ADDS THEM AS EXPERIMENT_THINGS FOR LATER SNAPSHOTTING
    def sample_comments(self, posts):
        """Select up to ``first_n_comments`` comments per post to observe.

        For every post with fewer than ``first_n_comments`` tracked
        comments, the earliest untracked comments (by ``created_utc``) are
        registered as ``ExperimentThing`` rows until the quota is reached.

        NOTE(review): the original comments say only top-level comments
        should be sampled, but the code never checked that (its condition
        compared ``comment.post_id`` with itself). Every comment on the post
        is currently a candidate.

        Returns:
            Newly selected ``Comment`` rows followed by the ``Comment`` rows
            of the comments that were already tracked for these posts.
        """
        first_n_comments = self.experiment_settings['first_n_comments']
        comment_things_to_observe = []
        comments_to_observe = []

        # STEP ONE: FOR EACH POST,
        # FIND OUT HOW MANY EXPERIMENT_THINGS ARE ASSOCIATED WITH THAT POST
        posts_needing_comments = defaultdict(list)
        for post in posts:
            comment_things = self.db_session.query(ExperimentThing).filter(
                ExperimentThing.query_index == post.id,
                ExperimentThing.experiment_id == self.experiment.id).all()
            comment_things_to_observe.extend(comment_things)
            if len(comment_things) < first_n_comments:
                posts_needing_comments[post.id] = comment_things

        # this shouldn't be more than tens of thousands of comments
        # for an experiment that randomizes on a day basis.
        # In other experiments, it might be important to query on a per-post basis
        post_comments = defaultdict(list)
        if posts_needing_comments:  # skip the query rather than issue an empty IN ()
            candidate_comments = self.db_session.query(Comment).filter(
                Comment.post_id.in_(list(posts_needing_comments.keys()))).order_by(
                    asc(Comment.created_utc))
            for comment in candidate_comments:
                post_comments[comment.post_id].append(comment)

        added_n_comments_for_monitoring = 0
        for post_id, comments in post_comments.items():
            already_observing = {x.id for x in posts_needing_comments[post_id]}
            post_comment_count = len(already_observing)
            for comment in comments:
                # if the comment hasn't been seen before and we're under
                # our quota, then add an experiment_thing.
                # snapshots will be taken in a separate method
                if post_comment_count >= first_n_comments:
                    break
                if comment.id in already_observing:
                    continue
                comments_to_observe.append(comment)
                et = ExperimentThing(
                    id = comment.id,
                    object_type = ThingType.COMMENT.value,
                    experiment_id = self.experiment.id,
                    object_created = comment.created_utc,
                    query_index = post_id
                )
                self.db_session.add(et)
                post_comment_count += 1
                added_n_comments_for_monitoring += 1

        self.log.info("{0}: Experiment {1}: Added {2} comments for monitoring in r/{3}".format(
            self.__class__.__name__, self.experiment.id, added_n_comments_for_monitoring, self.subreddit))
        self.db_session.commit()

        ## now fetch the remaining comments
        if comment_things_to_observe:
            comments_to_observe = comments_to_observe + self.db_session.query(Comment).filter(
                Comment.id.in_([x.id for x in comment_things_to_observe])).all()
        return comments_to_observe

    ## THIS METHOD TAKES SNAPSHOTS OF THE SAMPLED COMMENTS
    def observe_comment_snapshots(self, comments_to_observe):
        """Store a snapshot for each sampled comment that is recent enough.

        Comments whose ``created_utc`` is less than
        ``comment_snapshot_period_seconds`` before now are fetched through
        ``self.r.get_info`` and their score, vote and report fields are
        saved as an ``ExperimentThingSnapshot``.

        Args:
            comments_to_observe: ``Comment`` rows as returned by
                ``sample_comments``.
        """
        current_time = datetime.datetime.utcnow()
        snapshot_period = self.experiment_settings['comment_snapshot_period_seconds']
        eligible_comment_ids = [
            x.id for x in comments_to_observe
            if (current_time - x.created_utc).total_seconds() < snapshot_period
        ]
        ## reddit "fullnames" of comments carry the t1_ prefix
        reddit_comment_ids = ["t1_" + x for x in eligible_comment_ids]

        if not reddit_comment_ids:
            self.log.info("{0}: Experiment {1}: Collected Snapshots from 0 comments in r/{3}.".format(
                self.__class__.__name__, self.experiment.id, len(reddit_comment_ids), self.subreddit))
            return

        for comment in self.r.get_info(thing_id = reddit_comment_ids):
            snapshot = {
                "score": comment.score,
                "num_reports": comment.num_reports,
                "user_reports": len(comment.user_reports),
                "ups": comment.ups,
                "downs": comment.downs,
                "mod_reports": len(comment.mod_reports)
            }
            experiment_thing_snapshot = ExperimentThingSnapshot(
                experiment_thing_id = comment.id,
                object_type = ThingType.COMMENT.value,
                experiment_id = self.experiment.id,
                metadata_json = json.dumps(snapshot)
            )
            self.db_session.add(experiment_thing_snapshot)

        self.db_session.commit()
        self.log.info("{0}: Experiment {1}: Collected Snapshots from {2} comments in r/{3}.".format(
            self.__class__.__name__, self.experiment.id, len(reddit_comment_ids), self.subreddit))