# Python source code of adblock

import logging
import datetime
import os
import urllib
import shutil
import time

# base class we inherit from and extend
import browser_unit

# imports to use selenium
import selenium
from selenium import webdriver
from xvfbwrapper import Xvfb    # artificial display for headless experiments

# imports to parse easylist
from adblockparser import AdblockRules
from adblockparser import AdblockRule

# imports to parse url
from urlparse import urlparse, parse_qs

# imports to log ad data
import json
from collections import namedtuple

class AdBlockUnit(browser_unit.BrowserUnit):
    '''
    Browser unit that identifies ads on visited pages by checking the urls of
    page elements against easylist filter rules through adblockparser.
    '''

    # file name under which the local copy of easylist is kept
    EASYLIST = 'easylist.txt'
    # location the latest easylist is downloaded from
    EASYLIST_URL = "https://easylist-downloads.adblockplus.org/easylist.txt"

    def _easylist_version(self,path=EASYLIST):
        '''
        Reads the version from the current easylist, or a file that is passed in
        '''
        if os.path.isfile(path):
            with open(path) as f:
                lines = f.read().splitlines()
                return lines[2].split(':')[1].strip()
        else:
            return -1

    def _fetch_easylist(self):
        '''
        Downloads the latest version of easylist, and if newer replaces any
        existing one.

        Result: the download is stored as "tmp_"+EASYLIST and is then either
        moved over EASYLIST (no local copy yet, or the download is newer) or
        deleted (the local copy is already current). The outcome is printed.
        '''
        tmp_easylist = "tmp_"+self.EASYLIST
        # inspect the same file that is replaced below
        cur_version = self._easylist_version(path=self.EASYLIST)

        # download latest easylist from the Internet
        urllib.urlretrieve(self.EASYLIST_URL,tmp_easylist)
        tmp_version = self._easylist_version(path=tmp_easylist)

        # Handle the "no local copy" sentinel before ordering versions: -1 is
        # an int while real versions are strings, and comparing the two is
        # arbitrary on Python 2 and a TypeError on Python 3.
        if cur_version == -1:
            shutil.move(tmp_easylist,self.EASYLIST)
            print("New easylist {}".format(tmp_version))
        elif tmp_version != -1 and tmp_version > cur_version:
            # remove first so the move never lands on an existing file
            os.remove(self.EASYLIST)
            shutil.move(tmp_easylist,self.EASYLIST)
            print("Updated easylist from {} to {}".format(cur_version,tmp_version))
        else:
            os.remove(tmp_easylist)
            print("Easylist already up to date at: {}".format(tmp_version))

    def _load_easylist(self):
        '''
        Reads in easylist from a file and parses it into lines to be passed to
        abblockparser.
        '''
        with open(self.EASYLIST) as f:
            lines = f.read().splitlines()
        print("Loaded easylist version: {} with : {} items".format(self._easylist_version(),len(lines)))
        return lines


    def __init__(self, browser="firefox", log_file="log.txt", unit_id=0, treatment_id=0, headless=False, proxy=None,rules=None):
        '''
        Builds either a bare rule-loading unit or a full ad-collecting unit.

        Input: browser, log_file, unit_id, treatment_id, headless and proxy are
        handed to the BrowserUnit constructor; rules is the object whose
        should_block method is used to classify urls.
        Result: when rules is omitted, easylist is fetched and parsed and the
        resulting AdblockRules is stored in self.rules; the parent constructor
        is not called and no logger or ad storage is set up here. When rules
        is given, logging goes to "adb_"+log_file, the parent constructor is
        run and the ad collection state is initialised.
        '''

        # if easylist is not passed in, then consider this is a bare unit
        # that should only be used to fetch easylist and then parse into
        # adblockplus rules for use with adblockparser.
        if rules is None:
            self._fetch_easylist()
            self.filterlist = self._load_easylist()
            self.rules = AdblockRules(self.filterlist)
        else:
            logging.basicConfig(filename="adb_"+log_file,level=logging.INFO)
            self.logger = logging.getLogger(__name__)

            # call parent constructor
            browser_unit.BrowserUnit.__init__(self, browser, log_file, unit_id, treatment_id, headless, proxy=proxy)

            self.session = self.driver.session_id
            print("Running adblock unit session: {}".format(self.session))

            # set rules to those that were passed in
            self.rules = rules
            # enable every binary option adblockparser knows about
            self.all_options = {opt:True for opt in AdblockRule.BINARY_OPTIONS}

            # internal ad data structure
            self.data = []

            # record type for a single identified ad
            self.Ad = namedtuple('Ad',['url','outerhtml','tag','link_text','link_location','on_site', 'reloads'])

            # dictionary to memoize url checks
            self.memo = {}

            # store current context where we are collecting ads
            self.site = ""
            self.reloads= 0

    def save_data(self):
        json_file = os.path.splitext(self.log_file)[0]+"."+self.session+".json"
        with open(json_file, 'w') as outfile:
            json.dump(self.data, outfile)

        # This is the log line adblock_analysis will parse to identify data files
        self.logger.info("save_data:{}:{}:{}".format(self.unit_id,self.treatment_id,self.session))

    def log_element(self,element,source):
        '''
        Input: an element that has been identified as an ad and the attribute
        name (source) through which it was identified
        Result: appends an Ad record for the element to self.data and writes
        the record to the debug log
        '''
        # keyword arguments are evaluated left to right, so the element is
        # queried in the order: source attribute, outerHTML, tag, text, location
        record = self.Ad(
            url=element.get_attribute(source),
            outerhtml=element.get_attribute('outerHTML').encode('utf-8'),
            tag=element.tag_name,
            link_text=element.text,
            link_location=element.location,
            on_site=self.site,
            reloads=self.reloads)

        # keep it in the internal data structure
        self.data.append(record)

        # and mirror it to the plaintext log
        self.logger.debug("Ad:Data:{}".format(record))

    def check_elements(self, elements, source, options=None):
        '''
        Input: Given an element in the currently active page and an attribute to query on
        Result: Queries the given attribute (source) and checks the url against the 
        filterlist. Logs any identified elements and returns the count.
        '''
        count = 0
        for e in elements:
            try:
                url = e.get_attribute(source)
                if url != None:
                    self.logger.debug("Checking:{}:{}".format(source, url))
                    # check if we have evaluated this ad before
                    if url not in self.memo:
                        # actually check the url against the filter list
                        self.memo[url] = self.rules.should_block(url, options)

                    if self.memo[url]:
                        self.log_element(e,source)
                        count+=1

            # occurs with stale elements that no longer exist in the DOM
            except selenium.common.exceptions.StaleElementReferenceException as e:
                self.logger.error(e)
        return count


    def check_href(self):
        '''
        Captures "text" ads: every element carrying an href attribute
        (commonly <a> and <link>) is checked against the filter rules.
        '''
        # an xpath query could be less performant than other find_* methods
        linked = self.driver.find_elements_by_xpath("//*[@href]")
        found = self.check_elements(linked, "href", self.all_options)
        self.logger.debug("href search found: {}".format(found))
    

    def check_src(self):
        '''
        Captures "media" ads: every element carrying a src attribute
        (commonly <img>, <iframe>, <frame>, <embed> and <script>) is checked
        against the filter rules.
        '''
        # an xpath query could be less performant than other find_* methods
        sourced = self.driver.find_elements_by_xpath("//*[@src]")
        found = self.check_elements(sourced, "src", self.all_options)
        self.logger.debug("src search found: {}".format(found))


    def check_iframe(self,parents=()):
        '''
        Functionality to check within nested iframes for ad related resources.
        Invariants: expects webdriver to enter at the level defined by parents
        resets webdriver to top level contents prior to leaving
        Input: a tuple of the iframe elements that lead from the top level
        document down to the level being checked
        '''

        driver = self.driver
        children = driver.find_elements_by_tag_name('iframe')

        for child in children:

            try:
                driver.switch_to.frame(child)

                # check in the iframe for ads
                self.check_href()
                self.check_src()

                # set parent for children we check
                nesting = parents + (child,)
                self.check_iframe(parents=nesting)

            except selenium.common.exceptions.StaleElementReferenceException as e:
                self.logger.error(e)

            # return to correct level of nesting: the recursive call leaves the
            # driver at the top level, so walk back down through the parents
            driver.switch_to.default_content()

            for p in parents:
                try:
                    driver.switch_to.frame(p)
                except selenium.common.exceptions.NoSuchElementException as e:
                    # this should not occur but just in case, preserve invariant
                    # of function leaving at top level
                    self.logger.error("resetting level in iframe recursion")
                    driver.switch_to.default_content()
                    # the remaining parents live below the frame that could
                    # not be entered, so do not try them from the top level
                    break


        # always reset to top level content prior to exiting
        driver.switch_to.default_content()

    def find_ads(self):
        '''
        Convenience entry point that runs every ad identification mechanism
        on the current page: the href check, the src check, then the iframe
        walk.
        '''
        for detector in (self.check_href, self.check_src, self.check_iframe):
            detector()

    def visit_url(self,url):
        '''
        Loads url in the browser and records it as the current site.

        Input: the url to visit
        Result: True when the page was loaded, in which case self.site is set
        to url; False when loading timed out, in which case the timeout is
        printed and self.site is left unchanged
        '''
        driver = self.driver
        try:
            driver.get(url)
        except selenium.common.exceptions.TimeoutException as e:
            print("Timeout Visiting: {} : {}".format(url,self.session))
            # function-call form, matching every other print in this file
            print(e)
            return False

        self.logger.debug("Visited: {}".format(url))
        self.site = url
        return True


    def collect_ads(self,url, reloads=1, delay=0, file_name=None):
        '''
        Visits a specified url and runs ad collection functions
        Input: the url to visit, how many times to load it (reloads), the
        number of seconds to sleep before each load (delay) and an optional
        file_name that falls back to self.log_file
        Result: after every successful load self.reloads holds the index of
        that load and find_ads is run on the page
        '''
        print("collecting ads on: {}".format(url))
        # NOTE(review): file_name is assigned here but not read again in the
        # visible part of this function - confirm whether it is still needed
        if file_name == None:
            file_name = self.log_file

        # number of reloads on site to capture all ads
        for r in range(reloads):
            time.sleep(delay)

            # if a successful visit
            if self.visit_url(url):
                # collect ads; remember which reload they were seen on
                self.reloads=r
                self.find_ads()