import time
import urllib2
import base64
import json
import zlib

import tweepy

from config import Config
from utils import Utils

NEWLINE = '\r\n'
SLEEP_TIME = 10

f = open("./config")
config = Config(f)
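# The ./config file must define every setting referenced below. A
# hypothetical example, assuming the key/value syntax of the PyPI `config`
# module (all values are placeholders, not real credentials):
#
#   MODE: 'twitter'                      # 'gnip' or 'twitter'
#   PROJECT_ID: 'my-bq-project'
#   DATASET_ID: 'tweets'
#   TABLE_ID: 'stream'
#   GNIP_STREAM_URL: 'https://stream.gnip.com/...'
#   GNIP_STREAM_USERNAME: 'user@example.com'
#   GNIP_STREAM_PASSWORD: 'secret'
#   CONSUMER_KEY: 'xxxx'
#   CONSUMER_SECRET: 'xxxx'
#   ACCESS_TOKEN: 'xxxx'
#   ACCESS_TOKEN_SECRET: 'xxxx'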
class GnipListener(object):
    """Consumes a Gnip PowerTrack stream and routes each activity to a
    BigQuery table based on the tags of the rules it matched."""

    CHUNK_SIZE = 4 * 1024
    KEEP_ALIVE = 30  # seconds
    HEADERS = {
        'Accept': 'application/json',
        'Connection': 'Keep-Alive',
        'Accept-Encoding': 'gzip',
        # b64encode (unlike encodestring) adds no trailing newline, so the
        # header value is safe to send as-is
        'Authorization': 'Basic %s' % base64.b64encode(
            '%s:%s' % (config.GNIP_STREAM_USERNAME, config.GNIP_STREAM_PASSWORD)),
    }

    def __init__(self, schema, table_mapping, logger=None):
        self.schema = schema
        self.table_mapping = table_mapping
        self.default_table = table_mapping.values()[0]
        self.count = 0
        self.logger = logger

    def on_data(self, data):
        # Gnip delivers records in bulk; process them individually so each
        # one can be routed by its rule tags
        records_str = data.strip().split(NEWLINE)
        for r in records_str:
            record = json.loads(r)
            if record.get('delete', None):
                continue
            tags = self.get_table_tags(record)
            if not tags:
                tags = [None]  # untagged records go to the default table
            # a record may match several rules, hence several tables
            for tag in tags:
                if not tag:
                    table = self.default_table
                else:
                    table = self.table_mapping.get(tag, None)
                    if not table:
                        table = tag.split(".")
                        created = Utils.insert_table(table[0], table[1])  # , self.schema)
                        # a truthy result other than True is taken to mean
                        # the table was brand new
                        if created and created != True:
                            self.logger.info('Created BQ table: %s' % tag)
                        self.table_mapping[tag] = table
                record_scrubbed = Utils.scrub(record)
                Utils.insert_records(table[0], table[1], [record_scrubbed])
                if self.logger:
                    self.logger.info('@%s: %s (%s)' % (
                        record['actor']['preferredUsername'],
                        record['body'].encode('ascii', 'ignore'),
                        tags))
                self.count += 1
        return True

    def get_table_tags(self, record):
        gnip = record.get('gnip', None)
        if gnip:
            matching_rules = gnip.get('matching_rules', None)
            if matching_rules:
                return [rule.get("tag", None) for rule in matching_rules]
        return None

    @staticmethod
    def start(schema, logger):
        # initialize table mapping for default table
        # BUGBUG: initialize based on query to prod
        table_mapping = {
            config.DATASET_ID + "." + config.TABLE_ID:
                [config.DATASET_ID, config.TABLE_ID]
        }
        datasets = Utils.get_bq().datasets().list(
            projectId=config.PROJECT_ID).execute()
        datasets = datasets.get("datasets", None)
        for d in datasets:
            ref = d.get("datasetReference", None)
            bq_tables = Utils.get_bq().tables().list(
                projectId=ref.get("projectId"),
                datasetId=ref.get("datasetId")).execute()
            if bq_tables['totalItems'] > 0:
                for t in bq_tables.get("tables", None):
                    ref = t.get("tableReference", None)
                    dataset_id = ref.get("datasetId", None)
                    table_id = ref.get("tableId", None)
                    key = Utils.make_tag(dataset_id, table_id)
                    table_mapping[key] = [dataset_id, table_id]
        print("Initialized tables: %s" % table_mapping)

        listener = GnipListener(schema, table_mapping, logger=logger)
        while True:
            response = None
            try:
                req = urllib2.Request(config.GNIP_STREAM_URL,
                                      headers=GnipListener.HEADERS)
                response = urllib2.urlopen(req, timeout=(1 + GnipListener.KEEP_ALIVE))
                # the stream is gzip-compressed; 16+MAX_WBITS tells zlib to
                # expect a gzip header
                decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
                remainder = ''
                while True:
                    tmp = decompressor.decompress(response.read(GnipListener.CHUNK_SIZE))
                    if tmp == '':
                        break  # stream closed; fall through and reconnect
                    # hold back any trailing partial record for the next chunk
                    [records, remainder] = ''.join([remainder, tmp]).rsplit(NEWLINE, 1)
                    listener.on_data(records)
            except:
                logger.exception("Unexpected error:")
            if response:
                response.close()
            time.sleep(SLEEP_TIME)
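# Both listeners lean on utils.Utils for the BigQuery plumbing. The interface
# below is a sketch inferred from the call sites in this file, not the actual
# utils.py:
#
#   Utils.get_bq()                        # authorized BigQuery service client
#   Utils.make_tag(dataset_id, table_id)  # -> "dataset_id.table_id" routing key
#   Utils.insert_table(dataset_id, table_id, schema=None)   # create if missing
#   Utils.scrub(record)                   # drop/clean fields to fit the schema
#   Utils.insert_records(dataset_id, table_id, records)     # streaming insert
#   Utils.read_file(path)
#   Utils.enable_logging()                # -> configured logger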
# Write records to BigQuery
class TwitterListener(tweepy.StreamListener):

    # items to track if you're doing a public track call
    _TRACK_ITEMS = [
        '@JetBlue', '@southwestair', '@AirAsia', '@AmericanAir', '@flyPAL',
        '@TAMAirlines', '@Delta', '@virginamerica', '@klm', '@turkishairlines',
        '@BritishAirways', '@usairways', '@British_Airways', '@westjet',
        '@MAS', '@United', '@baltiausa', '@Lufthansa_DE', '@virginatlantic',
        '@virginaustralia', '@AlaskaAir', '@DeltaAssist', '@aircanada',
        '@easyJet', '@vueling',
    ]

    TRACK_ITEMS = [
        '#data15', '#SelfServeAnalytics', '#SelfServeAtTableau',
        '#ServerEmailAlert', '#ServerPermission', '#ServerToolbox',
        '#ShipTableau', '#SlalomDrive', '#SmartDriveData', '#SolarWindsData',
        '#SquareData', '#StatControl', '#StateStreetData', '#StJosephData',
        '#StoryPointsData', '#StubbornCalcs', '#SwedishData', '#SyscoData',
        '#SyscoSupplyChain', '#TabCmD', '#Tabjolt',
        '#TableauAcrossDepartments', '#TableauAuth', '#TableauAWSEC2',
        '#TableauConsultant', '#TableauDataGov', '#TableauForecast',
        '#TableauGA', '#TableauITService', '#TableauJedi', '#TableauJediCalcs',
        '#TableauLogs', '#TableauOLAP', '#TableauOnlineAdmin', '#TableauSAP',
        '#TableauSFDC', '#TableauShoestring', '#TableauSith',
        '#TableauSocialData', '#TacomaCCData', '#TargetData', '#TCFData',
        '#TDBankData', '#TDEFast', '#TDEROI', '#TimeInTableau', '#ToughData',
        '#TripAdvisorData', '#TruecarData', '#UnderstandingLOD',
        '#UndertoneData', '#UnifundData', '#UnlockData', '#UpliftData',
        '#UpPerformance', '#USDeptLaborData', '#UtahStateData',
        '#UTAustinData', '#UTElPasoData', '#VisualPipeline', '#VitamixData',
        '#VizBestPractice', '#VizTips', '#VSPData', '#WalmartData',
        '#WantToAdmin', '#WayfairData', '#WebDataConnect', '#WildData',
        '#YouDidWhat', '#ZenAppDesign', '#ZenJourney', '#ZilliantData',
        '#ZotecData', '#TCRumors', '#TCCRumors', '#Rundata15', '#Watchmeviz',
        '#tableauontableau', '#data15rumors', '#tableaugroups',
        '#datanightout', '#data15keynote', '#datapluswomen',
        '#tableauzenmaster', '#ironviz', '#salesdashboards', '#3tierdata',
        '#50shadesdata', '#AddValueJSAPI', '#ADPdata', '#advancedcalcs',
        '#advancedlods', '#advancedmaps', '#advancedrstats', '#advtablecalcs',
        '#aljazeeradata', '#allstatedata', '#amazondata', '#AnalyticsAtScale',
        '#AnalyticsPane', '#ArbysData', '#AxisData', '#BasicStats',
        '#BCBSData', '#BeyondMarkTypes', '#BeyondSparkler',
        '#BIalyticsStories', '#BJCData', '#BlendingQuestions', '#BoeingData',
        '#BoogalooViz', '#BostonData', '#BostonSciData', '#CachingQueries',
        '#tableau', '#CalcMethods', '#CareerbuilderData',
        '#CarlsonRezidorData', '#CarsData', '#CartographerTips',
        '#CaterpillarData', '#CentsOfData', '#CernerData', '#CiscoBigDataViz',
        '#CiscoData', '#CiscoSupplyChain', '#ClevelandClinicData',
        '#ClimateCorpData', '#Cloud9Tableau', '#ColumbiaData', '#ComcastData',
        '#ComicsStorytelling', '#ConcurData', '#ConEdData', '#CoreQuery',
        '#CreativeCalcs', '#CreditSuisseData', '#CustomAdminViews',
        '#CustomSQL', '#DashboardImpossible', '#DashboardsMyWay',
        '#DashboardTurbo', '#DataToTheCloud', '#DataWrangle', '#DCPSData',
        '#DearDataTwo', '#DeepDiveQueries', '#DeloitteData', '#DemystifyR',
        '#DenseData', '#DePaulData', '#DesMoinesData', '#DiscoveryStats',
        '#DisneyData', '#DoubleDownDataServer', '#DrawWithTableau',
        '#DriveAnalytics', '#DukeData', '#DwollaData', '#easyJetData',
        '#eBayData', '#EbolaData', '#EmbedSFDC', '#EmbedTableau', '#EMCData',
        '#EnvironicsData', '#ExcelWithTableau', '#ExelonData',
        '#ExtractAPIPython', '#ExtremeParameters', '#ExtremeViz', '#EYData',
        '#FacebookData', '#FloridaDJJData', '#FreescaleData',
        '#FresnoStateData', '#GamesInTableau', '#GetMoreREST',
        '#GlidewellData', '#GoogleData', '#GrouponData',
        '#GuaranteedRateData', '#HadoopItRight', '#HandsOnMapping',
        '#HandsOnStats', '#HCAData', '#HireRockstars', '#HomeDepotData',
        '#HootsuiteData', '#HotDirtySets', '#InteractionsData',
        '#InteractiveParameters', '#IntroAPIS', '#IntroCalcs',
        '#IntroMapping', '#IntroToLOD', '#JandJData', '#KaiserData',
        '#KantarData', '#KatyISDData', '#KiewitData', '#KKIntlData',
        '#KoboData', '#LargestDeployment', '#LieWithStats', '#LifelineData',
        '#LinguisticData', '#LinkedInAnalytics', '#LODsOfFun', '#LovesSets',
        '#MacysData', '#MagicWithMarks', '#MapboxFab', '#MaximData',
        '#MerkleData', '#MichiganData', '#MinorityReportUX',
        '#MobileWithTableau', '#MtSinaiData', '#MylanData', '#NetAppData',
        '#NetflixData', '#NeuroscienceStorytell', '#NextelData',
        '#NotInShowMe', '#OIdata', '#OldcastleData', '#OptimizeLiveQuery',
        '#OrderOfOps', '#PaloAltoData', '#PiedmontData', '#PluralsightData',
        '#PracticalDashboards', '#ProgressiveData', '#QuestionDrivenViz',
        '#QuickenData', '#RDeepDive', '#RealtorData', '#RenderTableau',
        '#RetailMeNotData', '#RosettaData', '#RubbermaidData',
        '#RunTheTable',
    ]

    def __init__(self, dataset_id, table_id, logger=None):
        self.dataset_id = dataset_id
        self.table_id = table_id
        self.count = 0
        self.logger = logger
        self.calm_count = 0

    def on_data(self, data):
        self.calm_count = 0
        # Twitter returns data in JSON format - we need to decode it first
        record = json.loads(data)
        if not record.get('delete', None):
            record_scrubbed = Utils.scrub(record)
            Utils.insert_records(self.dataset_id, self.table_id, [record_scrubbed])
            if self.logger:
                self.logger.info('@%s: %s' % (
                    record['user']['screen_name'],
                    record['text'].encode('ascii', 'ignore')))
            self.count += 1
        return True

    # handle errors without closing the stream
    def on_error(self, status_code):
        if status_code == 420:
            # 420 means Twitter is rate-limiting the connection; back off
            # instead of disconnecting
            self.backoff('Status 420')
            return True
        if self.logger:
            self.logger.info('Error with status code: %s' % status_code)
        return False

    # got disconnect notice
    def on_disconnect(self, notice):
        self.backoff('Disconnect')
        return False

    def on_timeout(self):
        self.backoff('Timeout')
        return False

    def on_exception(self, exception):
        if self.logger:
            self.logger.exception('Exception')
        return False
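    # Linear backoff: sleep 60s per consecutive failure, capped at 320s
    # (1st failure -> 60s, 2nd -> 120s, ..., 6th and beyond -> 320s).
    # on_data() resets calm_count, so a healthy stream clears the penalty.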
    def backoff(self, msg):
        self.calm_count += 1
        sleep_time = 60 * self.calm_count
        if sleep_time > 320:
            sleep_time = 320
        if self.logger:
            self.logger.info(msg + ", sleeping for %s" % sleep_time)
        time.sleep(sleep_time)
        return

    @staticmethod
    def start(schema, logger):
        listener = TwitterListener(config.DATASET_ID, config.TABLE_ID,
                                   logger=logger)
        auth = tweepy.OAuthHandler(config.CONSUMER_KEY, config.CONSUMER_SECRET)
        auth.set_access_token(config.ACCESS_TOKEN, config.ACCESS_TOKEN_SECRET)
        while True:
            logger.info("Connecting to Twitter stream")
            stream = None
            try:
                stream = tweepy.Stream(auth, listener,
                                       headers={"Accept-Encoding": "deflate, gzip"})
                # choose stream: filtered or sample
                stream.sample()
                # stream.filter(track=TwitterListener.TRACK_ITEMS)
            except:
                logger.exception("Unexpected error")
                if stream:
                    stream.disconnect()
                time.sleep(60)


def main():
    if config.MODE not in ['gnip', 'twitter']:
        print "Invalid mode: %s" % config.MODE
        exit()
    logger = Utils.enable_logging()
    print "Running in mode: %s" % config.MODE

    if config.MODE == 'gnip':
        schema_file = "./schema/schema_gnip.json"
    else:
        schema_file = "./schema/schema_twitter.json"
    schema = json.loads(Utils.read_file(schema_file))

    # make sure the default table exists before streaming starts
    Utils.insert_table(config.DATASET_ID, config.TABLE_ID, schema)
    print "Default table: %s.%s" % (config.DATASET_ID, config.TABLE_ID)

    if config.MODE == 'gnip':
        GnipListener.start(schema, logger)
    elif config.MODE == 'twitter':
        TwitterListener.start(schema, logger)


if __name__ == "__main__":
    main()
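# Typical invocation (the filename is an assumption; it is not fixed by this
# module):
#
#   $ python stream.py
#
# with MODE in ./config selecting the source, and ./schema/schema_gnip.json or
# ./schema/schema_twitter.json holding the default table's schema as a JSON
# list of BigQuery field definitions, e.g. (illustrative fields only):
#
#   [{"name": "id", "type": "STRING"}, {"name": "text", "type": "STRING"}]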