package trident.functions; import java.lang.reflect.InvocationTargetException; import java.util.Map; import org.apache.log4j.Logger; import storm.trident.operation.BaseFunction; import storm.trident.operation.TridentCollector; import storm.trident.operation.TridentOperationContext; import storm.trident.tuple.TridentTuple; import trident.utils.Tools; import trident.utils.TweetBuilder; import twitter4j.Status; import twitter4j.TwitterException; import twitter4j.json.DataObjectFactory; import backtype.storm.tuple.Values; import entities.Tweet; /** * Processes the tweet text to remove whitespaces, links and replies. * */ public class TextProcessor extends BaseFunction{ private TweetBuilder tb; private Tools tools; private static final Logger LOG = Logger.getLogger(TextProcessor.class); @Override public void prepare(Map conf, TridentOperationContext context) { tools = new Tools(); tb = new TweetBuilder((String) conf.get("PATH_TO_OOV_FILE")); } @Override public void execute(TridentTuple tuple, TridentCollector collector) { Status s = null; String tweetText = null; try { s = DataObjectFactory.createStatus((String) tuple.getValue(0)); tweetText = tools.removeLinksAndReplies(tb.removeSpacesInBetween(s.getText())); } catch (Exception e) { LOG.error(e.toString()); } Tweet t = null; if (s!=null) //rarely Twitter4J can't parse the json to convert to Status and Status is null. t = new Tweet(s.getId(), tweetText); else t = new Tweet(-1, " "); collector.emit(new Values(t)); } }