scala source code of TwitterBatchTimely

Mastering-Spark-for-Data-Science-master
- Chapter08
  - recommender-web
    - pom.xml
    - public
      - javascripts
        bootstrap.min.js
        jquery.min.js
        main.js
      - fonts
        glyphicons-halflings-regular.woff
        glyphicons-halflings-regular.eot
        glyphicons-halflings-regular.woff2
        glyphicons-halflings-regular.ttf
        glyphicons-halflings-regular.svg
      - stylesheets
        main.css
        bootstrap.min.css
    - app
      - views
        main.scala.html
        index.scala.html
        playlist.scala.html
        analyze.scala.html
      - models
        Songs.scala
        Library.scala
        Playlist.scala
      - svc
        SparkSvc.scala
        AnalyzerSvc.scala
        CassandraDao.scala
      - controllers
        Analyze.scala
        Index.scala
        Playlist.scala
    - conf
      - application.conf
      - routes
  - pom.xml
  - spark-job-server
    - local.sh
    - local.conf
    - example.sh
  - recommender-core
    - src
      - main
        resources
        notes
        scala
        io
        gzet
        recommender
        PlaylistBuilder.scala
        TestSparkServer.scala
        Audio.scala
        Config.scala
        Record.scala
        PersonalizedPlaylistBuilder.scala
        IndexBuilder.scala
        AudioLibrary.scala
    - pom.xml
  - .gitignore
- Chapter13
  - src
    - main
      - scala
        io
        gzet
        utils
        DataManipulation.scala
      - java
        org
        apache
        hadoop
        io
        compress
        crypto
        CryptoDecompressor.java
        CryptoCompressor.java
        CryptoCodec.java
        EncryptionUtils.java
    - test
      - resources
        gdeltTestFile.csv
      - scala
        io
        gzet
        CryptoTest.scala
  - pom.xml
- geomesa-utils-1.5
  - src
    - main
      - resources
        gdeltEventsTestFile.csv
        log4j.properties
      - scala
        io
        gzet
        geomesa
        ingest
        GeomesaAccumuloRead.scala
        GeomesaAccumuloWrite.scala
      - java
        com
        example
        geomesa
        gdelt
        GDELTIngest.java
        README.md
        GDELTIngestMapper.java
        LICENSE.txt
  - pom.xml
  - README.md
- LICENSE
- Chapter12
  - src
    - main
      - resources
        Brent-oil-prices.csv
      - scala
        io
        gzet
        mainClass.scala
        HighLowCalc.scala
  - pom.xml
- Chapter10
  - story-clustering-core
    - src
      - main
        resources
        application.conf
        articles.json
        gdelt.sh
        stories.json
        scala
        io
        gzet
        story
        SimhashIndexing.scala
        StoryStreamClustering.scala
        SimpleConfig.scala
        clustering
        SimhashPIClustering.scala
        LDAClustering.scala
        SimhashCanopyClustering.scala
        KMeansClustering.scala
        StoryBatchDedup.scala
        StoryJSONExtractor.scala
        linalg
        XORShiftRandom.scala
        Embedding.scala
        RandomIndexEmbedding.scala
    - pom.xml
  - pom.xml
  - story-clustering-web
    - pom.xml
    - app
      - io
        gzet
        story
        web
        dao
        CassandraDao.scala
        SimpleConfig.scala
        controllers
        SimHash.scala
    - conf
      - application.conf
      - routes
  - story-clustering-utils
    - src
      - main
        resources
        stopwords
        scala
        io
        gzet
        story
        util
        Tokenizer.scala
        HtmlFetcher.scala
        SimhashUtils.scala
        Stopwords.scala
        model
        GKG.scala
        Cluster.scala
        Article.scala
        Duplicate.scala
        Content.scala
    - pom.xml
  - .gitignore
- gzet-parent
  - NOTICE.txt
  - gzet.iml
  - pom.xml
  - contributors.txt
  - gzet-archetype
    - src
      - main
        resources
        archetype-resources
        src
        main
        scala
        io
        gzet
        App.scala
        test
        scala
        io
        gzet
        AppTest.scala
        pom.xml
        gzet-archetype.iml
        META-INF
        maven
        archetype-metadata.xml
      - test
        resources
        projects
        basic
        archetype.properties
        goal.txt
    - pom.xml
    - gzet-archetype.iml
    - gzet-archetype-archetype.iml
    - .project
  - README.md
  - gzet-test
    - src
      - test
        scala
        io
        gzet
        test
        SparkFunSuite.scala
        GzetTest.scala
    - gzet-test.iml
    - pom.xml
    - .project
  - LICENSE.txt
- README.md
- Chapter05
  - src
    - main
      - resources
        Brent-oil-prices-2016.csv
        CAMEO_codes.txt
      - scala
        io
        gzet
        geomesa
        OilPriceFunc.scala
        PredictOilPrice.scala
        geohash
        GeoHash.scala
        LICENSE
        Calate.scala
        Base32.scala
  - pom.xml
  - README.md
- Chapter03
  - src
    - main
      - avro
        gdelt-gpg-21.avdl
      - resources
        20160101020000.gkg.csv
      - scala
        io
        gzet
        oldapi
        createAvroWithIDL.scala
        newapi
        createAvroWithStructs.scala
        CreateAvroWithCase.scala
    - test
      - scala
        io
        gzet
        oldapi
        TestOldApiFunctionality.scala
        newapi
        TestNewApiWithStructs.scala
        TestNewApiWithCaseClass.scala
  - pom.xml
  - GdeltAvroParquet.iml
  - extract_gcam_timeseries.json
- Chapter07
  - src
    - main
      - resources
        application.conf
      - scala
        io
        gzet
        community
        GzetCommunities.scala
        elasticsearch
        ESConfig.scala
        ESReader.scala
        util
        GzetPersons.scala
        clustering
        wcc
        WCCDetection.scala
        VState.scala
        Clustering.scala
        louvain
        LouvainDetection.scala
        VState.scala
        GzetLoader.scala
        accumulo
        AccumuloReader.scala
        AccumuloConfig.scala
        AccumuloLoader.scala
      - java
        io
        gzet
        community
        accumulo
        EdgeWritable.java
        AccumuloGraphxInputFormat.java
    - test
      - resources
        local-nodes.csv
        local-edges.csv
        local.gephi
      - scala
        io
        gzet
        community
        ESReaderIT.scala
        GzetCommunitiesTest.scala
        AccumuloIT.scala
  - pom.xml
  - contributors.txt
  - .gitignore
  - LICENSE.txt
- Chapter11
  - src
    - main
      - resources
        states
        application.conf
        twitter.properties
      - scala
        io
        gzet
        timeseries
        graph
        Godwin.scala
        twitter
        Twitter.scala
        SimpleConfig.scala
        TwitterStream.scala
        timely
        TimelyImplicits.scala
        MetricImplicits.scala
        TwitterBatch.scala
        TwitterWord2Vec.scala
        TwitterBatchTimely.scala
      - java
        io
        gzet
        timeseries
        twitter
        FetchTweets.java
        timely
        AccumuloTimelyInputFormat.java
        TimelyWritable.java
    - test
      - resources
        edges.csv
      - scala
        io
        gzet
        timeseries
        graph
        GodwinTest.scala
        twitter
        TwitterSentiment.scala
  - pom.xml
  - contributors.txt
  - .gitignore
  - LICENSE.txt
- .gitignore
- Chapter04
  - profilers
    - NOTICE.txt
    - src
      - main
        resources
        application.conf
        gdelt-stream.sh
        ascii.csv
        scala
        io
        gzet
        profilers
        GdeltStructuralProfiler.scala
        field
        CardinalityProfiler.scala
        MaskBasedProfiler.scala
        EmptinessProfiler.scala
        Utils.scala
        Report.scala
        CSVProfiler.scala
        raw
        AsciiProfiler.scala
        RowProfiler.scala
        StructuralProfiler.scala
      - test
        resources
        logback-test.xml
        20160803133000.export.CSV
        scala
        io
        gzet
        AsciiProfilerTest.scala
        CardinalityProfilerTest.scala
        StructuralProfilerTest.scala
        EmptinessProfilerTest.scala
        MaskProfilerTest.scala
    - pom.xml
    - contributors.txt
    - README.md
    - LICENSE.txt
  - Chapter 4.d GeoGCAM - Build Pipeline POC.json
- Chapter06
  - scraper
    - NOTICE.txt
    - src
      - main
        resources
        nicknames.dat
        application.conf
        scala
        io
        gzet
        EntityRecognition.scala
        StringDeduplication.scala
        NameDeduplication.scala
        HtmlFetcher.scala
        App.scala
        GeoLookup.scala
        OrganizationDeduplication.scala
      - test
        resources
        logback-test.xml
        corpus
        load-geoname-test
        organizations
        persons
        places
        scala
        io
        gzet
        GeoLookupTest.scala
        OrganizationDeduplicationTest.scala
        NameDeduplicationTest.scala
        EntityRecognitionTest.scala
    - resources
      - output.json
    - pom.xml
    - README.md
    - LICENSE.txt
  - goose
    - src
      - main
        resources
        com
        gravity
        goose
        images
        known-image-css.txt
        statichtml
        espn1.txt
        wsj1.txt
        msn1_result.txt
        engadget1.txt
        cnbc1.txt
        guardian1_result.txt
        issue_24.txt
        issue_25.txt
        wired1.txt
        cnet1.txt
        issue_28.txt
        businessinsider2.txt
        businessinsider1.txt
        msn1.txt
        issue_24_result.txt
        issue_32.txt
        businessweek3.txt
        aol1.txt
        time2.txt
        usatoday1.txt
        politico1.txt
        businessweek2.txt
        cnn1.txt
        businessweek1.txt
        time1.txt
        techcrunch1.txt
        foxnews1.txt
        text
        stopwords-en.txt
        scala
        com
        gravity
        goose
        network
        HttpExceptions.scala
        MaxBytesException.scala
        HtmlFetcher.scala
        AbstractHtmlFetcher.scala
        NotHtmlException.scala
        Goose.scala
        utils
        FileHelper.scala
        URLHelper.scala
        Logging.scala
        TalkToMeGoose.scala
        images
        ImageExtractor.scala
        ImageUtils.scala
        StandardImageExtractor.scala
        UpgradedImageIExtractor.scala
        ImageSaver.scala
        SecretGifException.scala
        Image.scala
        ImageDetails.scala
        cleaners
        StandardDocumentCleaner.scala
        DocumentCleaner.scala
        Configuration.scala
        Article.scala
        extractors
        StandardContentExtractor.scala
        TagsEvaluator.scala
        PublishDateExtractor.scala
        ContentExtractor.scala
        Extractor.scala
        AdditionalDataExtractor.scala
        Crawler.scala
        outputformatters
        OutputFormatter.scala
        StandardOutputFormatter.scala
        text
        string.scala
        StringSplitter.scala
        StopWords.scala
        ReplaceSequence.scala
        WordStats.scala
        HashUtils.scala
        StringReplacement.scala
      - test
        resources
        log4j.properties
        akka.conf
        scala
        com
        gravity
        goose
        utils
        URLHelperTest.scala
        FileHelperTest.scala
        GoldSitesTestIT.scala
        ExtractionsTest.scala
        images
        ImageUtilsIT.scala
        GooseTest.scala
        extractors
        VoicesContentExtractor.scala
        ArticleTest.scala
        text
        HashUtilsTest.scala
        TestUtils.scala
        PerformanceTester.scala
    - pom.xml
    - LICENSE
    - README.md
    - NOTICE
    - .gitignore
- Chapter09
  - src
    - main
      - resources
        create-es-mapping.sh
        application.conf
        create-kafka.sh
        gdelt-stream.sh
      - scala
        io
        gzet
        tagging
        KappaTagging.scala
        html
        HtmlHandler.scala
        twitter
        TwitterHIS.scala
        gdelt
        GdeltTagger.scala
        classifier
        Classifier.scala
        stackoverflow
        StackBootstraping.scala
  - pom.xml
  - .gitignore
  - LICENSE.txt

package io.gzet.timeseries

import java.sql.Timestamp

import com.cloudera.sparkts.{DateTimeIndex, TimeSeriesRDD}
import io.gzet.timeseries.timely.MetricImplicits._
import io.gzet.timeseries.timely.TimelyImplicits._
import io.gzet.timeseries.twitter.Twitter._
import io.gzet.utils.spark.accumulo.AccumuloConfig
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import org.joda.time.{DateTime, Minutes, Period}

/**
  * Batch job that turns a dump of raw tweets into per-minute time series.
  *
  * The job runs in two phases:
  *  1. tweets are parsed from JSON text, and the results of `countByState`
  *     and `sentimentByState` are published to Timely;
  *  2. the `io.gzet.count` metrics are read back from Timely, summed per
  *     (tag value, minute) and handed to spark-ts as a `TimeSeriesRDD`.
  */
object TwitterBatchTimely extends SimpleConfig {

  /**
    * One data point of a series, in the shape expected by
    * `TimeSeriesRDD.timeSeriesRDDFromObservations`.
    *
    * @param hashtag key identifying the series (a tag value of the metric)
    * @param time    start of the one-minute bucket
    * @param count   sum of the metric values that fell into the bucket
    */
  case class Observation(
                          hashtag: String,
                          time: Timestamp,
                          count: Double
                        )

  /**
    * Entry point of the job.
    *
    * @param args optional; `args(0)`, when present, is the location of the
    *             tweet JSON files. Without arguments the historical
    *             hard-coded location is used.
    */
  def main(args: Array[String]): Unit = {

    // The previous hard-coded path stays the default so that launching the
    // job without arguments behaves exactly as before.
    val inputPath = args.headOption.getOrElse("file:///Users/antoine/CHAPTER/twitter-trump")

    val sparkConf = new SparkConf().setAppName("Twitter Extractor")
    val sc = new SparkContext(sparkConf)

    // Everything that needs the context runs inside try/finally so that the
    // context is always released, even when a stage fails.
    try {
      val sqlContext = new SQLContext(sc)
      import sqlContext.implicits._

      val twitterJsonRDD = sc.textFile(inputPath, 500)
      val tweetRDD = twitterJsonRDD.mapPartitions(analyzeJson).cache()

      // Publish metrics to Timely.
      // count() is an action: it materialises the cached RDD once, ahead of
      // the two aggregations that reuse it.
      tweetRDD.count()
      tweetRDD.countByState.publish()
      tweetRDD.sentimentByState.publish()

      // Read metrics from Timely
      val conf = AccumuloConfig("GZET", "alice", "alice", "localhost:2181")
      val metricsRDD = sc.timely(conf, Some("io.gzet.count"))

      // Earliest and latest metric times: the bounds of the uniform index.
      val minDate = metricsRDD.map(_.time).min()
      val maxDate = metricsRDD.map(_.time).max()

      // Frequency expressed as a fixed number of minutes between two points.
      class TwitterFrequency(val minutes: Int) extends com.cloudera.sparkts.PeriodFrequency(Period.minutes(minutes)) {
        // Number of whole `minutes`-wide steps between the two instants.
        def difference(dt1: DateTime, dt2: DateTime): Int = Minutes.minutesBetween(dt1, dt2).getMinutes / minutes
        override def toString: String = s"minutes $minutes"
      }

      val dtIndex = DateTimeIndex.uniform(minDate, maxDate, new TwitterFrequency(1))

      val metricsDF = metricsRDD
        .filter(metric => metric.tags.contains("tag"))
        .flatMap { metric =>
          // The bucket only depends on the metric, not on the tag: compute it once.
          val bucket = roundFloorMinute(metric.time, 1)
          // NOTE(review): one entry is emitted per tag of the metric, whatever
          // its key, although the filter above only checks for the "tag" key.
          // Confirm whether only the value stored under "tag" was intended.
          metric.tags.map { case (_, v) => ((v, bucket), metric.value) }
        }
        .reduceByKey(_ + _)
        .map { case ((hashtag, time), total) => Observation(hashtag, new Timestamp(time), total) }
        .toDF()

      // Drop the series that contain nothing but NaN.
      // NOTE(review): no action is invoked on tsRDD in this method, so as it
      // stands the series are defined but never consumed - confirm whether a
      // sink (save, collect, ...) is missing here.
      val tsRDD = TimeSeriesRDD
        .timeSeriesRDDFromObservations(dtIndex, metricsDF, "time", "hashtag", "count")
        .filter(_._2.toArray.exists(!_.isNaN))

    } finally {
      sc.stop()
    }

  }

  /**
    * Floors a timestamp to the start of its `windowMinutes`-wide bucket
    * within the hour, dropping seconds and milliseconds.
    *
    * The minute-of-hour is read from a Joda `DateTime` built with the
    * single-argument constructor, i.e. in the JVM default time zone. As the
    * minute-of-hour never exceeds 59, any window wider than that floors to
    * the start of the hour.
    *
    * @param time          instant to round, in epoch milliseconds
    * @param windowMinutes width of a bucket in minutes; must not be zero
    * @return the start of the bucket, in epoch milliseconds
    * @throws IllegalArgumentException if `windowMinutes` is zero
    */
  def roundFloorMinute(time: Long, windowMinutes: Int): Long = {
    // Fail with an explicit message instead of a bare division-by-zero error.
    require(windowMinutes != 0, "windowMinutes must not be zero")
    val dt = new DateTime(time)
    val bucketStart = (dt.getMinuteOfHour / windowMinutes) * windowMinutes
    dt.withMinuteOfHour(bucketStart).minuteOfDay().roundFloorCopy().toDate.getTime
  }

}