package tutorial.storm.trident.example;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.StormTopology;
import backtype.storm.spout.SchemeAsMultiScheme;
import backtype.storm.tuple.Fields;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import storm.kafka.BrokerHosts;
import storm.kafka.KafkaConfig;
import storm.kafka.StringScheme;
import storm.kafka.ZkHosts;
import storm.kafka.trident.TransactionalTridentKafkaSpout;
import storm.kafka.trident.TridentKafkaConfig;
import storm.trident.Stream;
import storm.trident.TridentTopology;
import tutorial.storm.trident.operations.*;
import tutorial.storm.trident.testutil.TestUtils;

import java.io.IOException;

/**
 * A really simple example of how to do joins between streams with Trident.
 *
 * @author Davide Palmisano ([email protected])
 */
public class JoinExample {

    public static StormTopology buildTopology(TransactionalTridentKafkaSpout spout) throws IOException {

        TridentTopology topology = new TridentTopology();

        /**
         * First, grab the tweets stream. We're going to use it in two different places
         * and then we'll join the results.
         */
        Stream contents = topology
                .newStream("tweets", spout)
                .each(new Fields("str"), new ParseTweet(), new Fields("text", "content", "user"));

        /**
         * Now, select and project only the hashtags for each tweet.
         * This stream is basically a list of pairs (tweetId, hashtag).
         */
        Stream hashtags = contents
                .each(new Fields("content"), new OnlyHashtags())
                .each(new Fields("content"), new TweetIdExtractor(), new Fields("tweetId"))
                .each(new Fields("content"), new GetContentName(), new Fields("hashtag"))
                .project(new Fields("hashtag", "tweetId"));
                //.each(new Fields("content", "tweetId"), new DebugFilter());

        /**
         * And let's do the same for URLs, obtaining a stream of pairs
         * like (tweetId, url).
         */
        Stream urls = contents
                .each(new Fields("content"), new OnlyUrls())
                .each(new Fields("content"), new TweetIdExtractor(), new Fields("tweetId"))
                .each(new Fields("content"), new GetContentName(), new Fields("url"))
                .project(new Fields("url", "tweetId"));
                //.each(new Fields("content", "tweetId"), new DebugFilter());

        /**
         * Now it is time to join on the tweetId to get a stream of triples (tweetId, hashtag, url).
         */
        topology.join(hashtags, new Fields("tweetId"), urls, new Fields("tweetId"),
                      new Fields("tweetId", "hashtag", "url"))
                .each(new Fields("tweetId", "hashtag", "url"), new Print());

        return topology.build();
    }

    public static void main(String[] args) throws Exception {
        Config conf = new Config();

        if (args.length == 2) {
            // Build and submit the topology to the cluster.
            String name = args[0];
            BrokerHosts hosts = new ZkHosts(args[1]);
            TransactionalTridentKafkaSpout kafkaSpout = TestUtils.testTweetSpout(hosts);
            StormSubmitter.submitTopology(name, conf, buildTopology(kafkaSpout));
        } else {
            System.err.println("Usage: <topologyName> <zookeeperHost>");
        }
    }
}
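
/*
 * A minimal sketch of how the same topology could be run in local mode for quick testing,
 * using the LocalCluster class that is already imported above. The class name
 * LocalJoinExampleRunner, the "localhost:2181" Zookeeper address, the topology name, and the
 * 60-second run time are assumptions for illustration, not part of the original example.
 */
class LocalJoinExampleRunner {

    public static void main(String[] args) throws Exception {
        // Point the test tweet spout at an assumed local Zookeeper instance.
        BrokerHosts hosts = new ZkHosts("localhost:2181");
        TransactionalTridentKafkaSpout kafkaSpout = TestUtils.testTweetSpout(hosts);

        // Run the topology in an in-process LocalCluster instead of submitting it remotely.
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("join-example-local", new Config(), JoinExample.buildTopology(kafkaSpout));

        // Let the topology process tweets for a while, then shut everything down.
        Thread.sleep(60 * 1000);
        cluster.shutdown();
    }
}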