package trident;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.log4j.Logger;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.LocalDRPC;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.StormTopology;
import backtype.storm.task.IMetricsContext;
import backtype.storm.tuple.Fields;
import storm.trident.TridentState;
import storm.trident.TridentTopology;
import storm.trident.state.State;
import storm.trident.state.StateFactory;
import trident.aggregators.CountAggKeep;
import trident.aggregators.Decider;
import trident.functions.ComputeDistance;
import trident.functions.ExpandList;
import trident.functions.Extractor;
import trident.functions.TextProcessor;
import trident.functions.VectorBuilder;
import trident.state.query.BucketsStateQuery;
import trident.state.query.RecentTweetsStateQuery;
import trident.state.BucketsDB;
import trident.state.RecentTweetsDB;
import trident.utils.FirstNAggregator;
import twitter4j.Status;
import twitter4j.TwitterException;
import twitter4j.json.DataObjectFactory;

/**
 * Main class to run FSD.
 *
 * <p>
 * With no command-line arguments the topology is run in an in-process
 * {@link LocalCluster}: every line of the query file is sent through DRPC and
 * the answers are written to the output file. With at least one argument the
 * topology is submitted through {@link StormSubmitter} under the name given in
 * {@code args[0]}.
 *
 * <p>
 * All settings are read from {@code config.properties} in the working
 * directory.
 */
public class FirstStoryDetection {

    public final static String TOPOLOGY_NAME = "fsd";

    private static final Logger LOG = Logger
            .getLogger(FirstStoryDetection.class);

    /** Properties file read by both {@link #main} and {@link #buildTopology}. */
    private static final String CONFIG_FILE = "config.properties";

    /** Charset used for the query file and the output file. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /**
     * Property keys copied unchanged (as strings, possibly {@code null}) from
     * the properties file into the topology configuration.
     */
    private static final String[] PASS_THROUGH_KEYS = {
            "UNIQUE_WORDS_EXPECTED", "PATH_TO_OOV_FILE", "L",
            "BucketsParallelism", "k", "QUEUE_SIZE", "THRESHOLD",
            "RECENT_TWEETS_TO_COMPARE_WITH" };

    /**
     * State factory that creates one {@link BucketsDB} per partition from the
     * three values captured at construction time.
     */
    public static class BucketsStateFactory implements StateFactory {
        int partialL, k, queueSize;

        public BucketsStateFactory(int partialL, int k, int queueSize) {
            this.partialL = partialL;
            this.k = k;
            this.queueSize = queueSize;
        }

        @Override
        public State makeState(Map conf, IMetricsContext metrics,
                int partitionIndex, int numPartitions) {
            return new BucketsDB(partialL, k, queueSize);
        }
    }

    /**
     * State factory that creates a {@link RecentTweetsDB}, reading its size
     * from the {@code RECENT_TWEETS_TO_COMPARE_WITH} entry of the topology
     * configuration (stored there as a string by
     * {@code createTopologyConfiguration}).
     */
    public static class RecentTweetsStateFactory implements StateFactory {
        @Override
        public State makeState(Map conf, IMetricsContext metrics,
                int partitionIndex, int numPartitions) {
            return new RecentTweetsDB(Integer.valueOf((String) conf
                    .get("RECENT_TWEETS_TO_COMPARE_WITH")), numPartitions);
        }
    }

    /**
     * Builds the Trident topology that answers the {@value #TOPOLOGY_NAME}
     * DRPC function.
     *
     * @param drpc
     *            the local DRPC instance to attach the stream to; {@link #main}
     *            passes {@code null} when submitting in distributed mode
     * @return the built topology
     * @throws IllegalStateException
     *             if {@code config.properties} cannot be read
     * @throws IllegalArgumentException
     *             if a required integer property is missing or malformed
     *             (reported as {@link NumberFormatException}), or if
     *             {@code BucketsParallelism} is not positive
     */
    public static StormTopology buildTopology(LocalDRPC drpc) {
        Properties prop;
        try {
            prop = loadProperties();
        } catch (IOException ex) {
            // Building a topology with every setting left at 0 is useless;
            // fail here with the real cause instead.
            throw new IllegalStateException("Cannot read " + CONFIG_FILE, ex);
        }

        int totalL = requiredInt(prop, "L");
        int bucketsParallelism = requiredInt(prop, "BucketsParallelism");
        if (bucketsParallelism <= 0) {
            throw new IllegalArgumentException(
                    "BucketsParallelism must be positive but was "
                            + bucketsParallelism);
        }
        // partial L will give the number of buckets each thread (given by
        // parallelism hint) will hold. Integer division: any remainder of
        // L / BucketsParallelism is dropped.
        int partialL = totalL / bucketsParallelism;
        int k = requiredInt(prop, "k");
        int queueSize = requiredInt(prop, "QUEUE_SIZE");
        int computeDistanceParallelism = requiredInt(prop, "ComputeDistance");
        int recentTweetsParallelism = requiredInt(prop,
                "RecentTweetsStateQuery");

        TridentTopology topology = new TridentTopology();

        TridentState bucketsDB = topology
                .newStaticState(new BucketsStateFactory(partialL, k, queueSize));
        TridentState recentTweetsDB = topology
                .newStaticState(new RecentTweetsStateFactory());

        // Uncomment the Debug rows below to trace intermediate tuples (the
        // Debug class is not imported by this file and must be imported
        // first).
        topology.newDRPCStream(TOPOLOGY_NAME, drpc)
                .each(new Fields("args"), new TextProcessor(),
                        new Fields("textProcessed"))
                .each(new Fields("textProcessed"), new VectorBuilder(),
                        new Fields("tweet_obj", "uniqWordsIncrease"))
                // .each(new Fields("tweet_obj", "uniqWordsIncrease"),
                //         new Debug())
                .broadcast()
                .stateQuery(bucketsDB,
                        new Fields("tweet_obj", "uniqWordsIncrease"),
                        new BucketsStateQuery(),
                        new Fields("tw_id", "collidingTweetsList"))
                .parallelismHint(bucketsParallelism)
                .each(new Fields("collidingTweetsList"), new ExpandList(),
                        new Fields("coltweet_obj", "coltweetId"))
                .groupBy(new Fields("tw_id", "coltweetId"))
                // how many times each colliding tweet is seen per tweet.
                // CountAggKeep keeps the fields we passed in the config map
                // (tweet_obj and coltweet_obj in our case)
                .aggregate(
                        new Fields("coltweetId", "tweet_obj", "coltweet_obj"),
                        new CountAggKeep(),
                        new Fields("count", "tweet_obj", "coltweet_obj"))
                // .each(new Fields("tw_id", "coltweetId", "tweet_obj",
                //         "count"), new Debug())
                .groupBy(new Fields("tw_id"))
                .aggregate(
                        new Fields("count", "coltweetId", "tweet_obj",
                                "coltweet_obj"),
                        new FirstNAggregator(3 * totalL, "count", true),
                        new Fields("countAfter", "coltweetId", "tweet_obj",
                                "coltweet_obj"))
                .each(new Fields("tw_id", "coltweetId", "tweet_obj",
                        "coltweet_obj"), new ComputeDistance(),
                        new Fields("cosSim"))
                .parallelismHint(computeDistanceParallelism)
                // .each(new Fields("tw_id", "coltweetId", "cosSim"),
                //         new Debug())
                .shuffle()
                .groupBy(new Fields("tw_id"))
                // find closest neighbor: give only the closest neighbor
                .aggregate(
                        new Fields("coltweetId", "tweet_obj", "coltweet_obj",
                                "cosSim"),
                        new FirstNAggregator(1, "cosSim", true),
                        new Fields("coltweetId", "tweet_obj", "coltweet_obj",
                                "cosSimBckts"))
                // .each(new Fields("tw_id", "coltweetId", "cosSimBckts"),
                //         new Debug())
                // tweet should go to all partitions to parallelize the task
                .broadcast()
                .stateQuery(recentTweetsDB,
                        new Fields("tweet_obj", "cosSimBckts"),
                        new RecentTweetsStateQuery(),
                        new Fields("nnRecentTweet"))
                .parallelismHint(recentTweetsParallelism)
                // .each(new Fields("tw_id", "cosSimBckts", "nnRecentTweet"),
                //         new Debug())
                .groupBy(new Fields("tw_id"))
                .aggregate(
                        new Fields("coltweetId", "tweet_obj", "coltweet_obj",
                                "cosSimBckts", "nnRecentTweet"),
                        new Decider(), new Fields("nn"))
                .each(new Fields("nn"), new Extractor(),
                        new Fields("colId", "col_txt", "cos"))
                .project(new Fields("colId", "col_txt", "cos"));
        // .each(new Fields("tw_id", "nn"), new Debug());

        return topology.build();
    }

    /**
     * Creates the Storm configuration for the topology.
     *
     * <p>
     * The values listed in {@code PASS_THROUGH_KEYS} are stored as the raw
     * property strings, so consumers must parse them themselves (as
     * {@link RecentTweetsStateFactory} does).
     *
     * @param prop
     *            settings loaded from the properties file
     * @param localMode
     *            {@code true} when running in a local cluster; when
     *            {@code false} the cluster mode is set to "distributed"
     * @return the populated configuration
     * @throws NumberFormatException
     *             if {@code NUMBER_OF_WORKERS} is missing or not an integer
     */
    private static Config createTopologyConfiguration(Properties prop,
            boolean localMode) {
        Config conf = new Config();

        List<String> drpcServers = new ArrayList<String>();
        drpcServers.add("localhost");
        conf.put(Config.DRPC_SERVERS, drpcServers);
        conf.put(Config.DRPC_PORT, 3772);
        if (!localMode) {
            conf.put(Config.STORM_CLUSTER_MODE, "distributed");
        }

        for (String key : PASS_THROUGH_KEYS) {
            conf.put(key, prop.getProperty(key));
        }

        // Fields that CountAggKeep carries through the aggregation.
        List<String> countAggKeepFields = new ArrayList<String>();
        countAggKeepFields.add("tweet_obj");
        countAggKeepFields.add("coltweet_obj");
        conf.put("countAggKeepFields", countAggKeepFields);

        conf.setDebug(false);
        conf.setNumWorkers(requiredInt(prop, "NUMBER_OF_WORKERS"));
        conf.setMaxSpoutPending(50000000);

        return conf;
    }

    /**
     * Entry point. Runs locally when {@code args} is empty, otherwise submits
     * the topology under the name {@code args[0]}.
     *
     * @param args
     *            optional; {@code args[0]} is the name to submit the topology
     *            under in distributed mode
     * @throws Exception
     *             if the configuration cannot be read, a required setting is
     *             missing, or running/submitting the topology fails
     */
    public static void main(String[] args) throws Exception {
        // A missing or unreadable configuration is fatal in both modes, so
        // let the IOException propagate instead of failing later with a less
        // helpful error.
        Properties prop = loadProperties();

        if (args == null || args.length == 0) {
            runLocally(prop);
        } else {
            // distributed mode
            Config conf = createTopologyConfiguration(prop, false);
            StormSubmitter.submitTopology(args[0], conf, buildTopology(null));
        }
    }

    /**
     * Starts a local cluster, feeds it the query file through DRPC and shuts
     * everything down again, even when processing fails.
     */
    private static void runLocally(Properties prop) throws Exception {
        String queryFile = prop.getProperty("PATH_TO_QUERY_FILE");
        String outputFile = prop.getProperty("PATH_TO_OUTPUT_FILE");
        if (queryFile == null || outputFile == null) {
            throw new IllegalArgumentException(
                    "PATH_TO_QUERY_FILE and PATH_TO_OUTPUT_FILE must be set in "
                            + CONFIG_FILE);
        }

        Config conf = createTopologyConfiguration(prop, true);
        LocalDRPC drpc = new LocalDRPC();
        LocalCluster cluster = new LocalCluster();
        try {
            cluster.submitTopology(TOPOLOGY_NAME, conf, buildTopology(drpc));
            Thread.sleep(2000); // give it some time to setup
            processQueries(drpc, queryFile, outputFile);
        } finally {
            // Same shutdown order as before (DRPC first, then the cluster);
            // the nested finally makes sure the cluster is always stopped.
            try {
                drpc.shutdown();
            } finally {
                cluster.shutdown();
            }
        }
    }

    /**
     * Sends every line of {@code queryFile} to the {@value #TOPOLOGY_NAME}
     * DRPC function and writes one line per query to {@code outputFile},
     * framed by a "Start:" and a "Finish:" timestamp (milliseconds).
     *
     * <p>
     * When a line parses as a tweet, the output line is
     * {@code id TAB text TAB drpcResult}; otherwise only the DRPC result is
     * written and the parse error is logged.
     */
    private static void processQueries(LocalDRPC drpc, String queryFile,
            String outputFile) throws Exception {
        byte[] newLine = "\n".getBytes(UTF_8);

        BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(queryFile), UTF_8));
        try {
            FileOutputStream fos = new FileOutputStream(new File(outputFile));
            try {
                fos.write("Start: ".getBytes(UTF_8));
                fos.write(String.valueOf(System.currentTimeMillis()).getBytes(
                        UTF_8));
                // Terminate the header line so the first result does not get
                // glued onto the timestamp.
                fos.write(newLine);

                // emit tweets into topology
                String tweetJson;
                while ((tweetJson = br.readLine()) != null) {
                    String result = drpc.execute(TOPOLOGY_NAME, tweetJson);
                    try {
                        Status s = DataObjectFactory.createStatus(tweetJson);
                        result = s.getId() + "\t" + s.getText() + "\t" + result;
                    } catch (TwitterException e) {
                        LOG.error("Could not parse tweet JSON: " + e, e);
                    }
                    fos.write(result.getBytes(UTF_8));
                    fos.write(newLine);
                }

                fos.write(newLine);
                fos.write("Finish: ".getBytes(UTF_8));
                fos.write(String.valueOf(System.currentTimeMillis()).getBytes(
                        UTF_8));
                fos.flush();
            } finally {
                fos.close();
            }
        } finally {
            br.close();
        }
    }

    /**
     * Loads {@code config.properties} from the working directory, always
     * closing the underlying stream.
     */
    private static Properties loadProperties() throws IOException {
        Properties prop = new Properties();
        FileInputStream in = new FileInputStream(CONFIG_FILE);
        try {
            prop.load(in);
        } finally {
            in.close();
        }
        return prop;
    }

    /**
     * Reads a mandatory integer property.
     *
     * @throws NumberFormatException
     *             if the property is absent or not a valid integer; the
     *             message names the offending key
     */
    private static int requiredInt(Properties prop, String key) {
        String raw = prop.getProperty(key);
        if (raw == null) {
            throw new NumberFormatException("Missing required property '"
                    + key + "' in " + CONFIG_FILE);
        }
        try {
            return Integer.parseInt(raw.trim());
        } catch (NumberFormatException e) {
            throw new NumberFormatException("Property '" + key + "' in "
                    + CONFIG_FILE + " is not an integer: '" + raw + "'");
        }
    }
}