Java Code Examples for org.apache.storm.tuple.Tuple#getBinaryByField()

The following examples show how to use org.apache.storm.tuple.Tuple#getBinaryByField(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. Related API usage is listed in the sidebar.
Example 1
Source File: ProfileSplitterBolt.java, from the metron project (Apache License 2.0), 6 votes
/**
 * Processes one input tuple: reads the raw message bytes, decodes them as
 * UTF-8, parses the text with {@code parser} and hands the resulting
 * {@link JSONObject} to {@code routeMessage}.
 *
 * <p>The tuple is silently skipped (with a debug log entry) when it carries
 * no message bytes or when no profiler configuration with at least one
 * profile is available.
 *
 * @param input the tuple whose {@code VALUE} field holds the raw message bytes
 * @throws ParseException propagated from parsing the decoded message
 * @throws UnsupportedEncodingException kept in the signature for caller
 *         compatibility
 */
private void doExecute(Tuple input) throws ParseException, UnsupportedEncodingException {

    // retrieve the input message
    byte[] data = input.getBinaryByField(VALUE.getFieldName());
    if (data == null) {
        LOG.debug("Received null message. Nothing to do.");
        return;
    }

    // ensure there is a valid profiler configuration; the configuration is
    // fetched exactly once so that the instance that was null-checked is the
    // same one that is inspected here and passed on for routing
    ProfilerConfig config = getProfilerConfig();
    if (config == null || config.getProfiles().size() == 0) {
        LOG.debug("No Profiler configuration found. Nothing to do.");
        return;
    }

    // decode with an explicit charset so the result does not depend on the
    // platform default
    JSONObject message = (JSONObject) parser.parse(new String(data, StandardCharsets.UTF_8));
    routeMessage(input, message, config);
}
 
Example 2
Source File: RedirectionBolt.java, from the storm-crawler project (Apache License 2.0), 6 votes
/**
 * Forwards the incoming tuple's url, content, metadata and text unchanged.
 * Tuples that already carry a non-blank text go to the default stream;
 * all others are emitted on the "tika" stream. The tuple is acked afterwards.
 */
@Override
public void execute(Tuple tuple) {
    final String sourceUrl = tuple.getStringByField("url");
    final byte[] rawContent = tuple.getBinaryByField("content");
    final Metadata md = (Metadata) tuple.getValueByField("metadata");
    final String extractedText = tuple.getStringByField("text");

    final Values outgoing = new Values(sourceUrl, rawContent, md, extractedText);

    // a text is already available, so it does not have to be parsed again
    final boolean hasText = StringUtils.isNotBlank(extractedText);
    if (hasText) {
        collector.emit(tuple, outgoing);
    } else {
        collector.emit("tika", tuple, outgoing);
    }

    collector.ack(tuple);
}
 
Example 3
Source File: FeedDetectorBolt.java, from the news-crawl project (Apache License 2.0), 4 votes
/**
 * Decides whether the fetched document is a feed, using in order: the flag
 * already stored in the metadata under {@code isFeedKey}, the content type
 * compared against {@code mimeTypeClues}, and finally {@code contentDetector}
 * applied to the raw content. Feeds are run through the parse filters and
 * reported on the status stream; everything else is passed on unchanged.
 * The tuple is acked at the end in both cases.
 */
@Override
public void execute(Tuple tuple) {
    final Metadata metadata = (Metadata) tuple.getValueByField("metadata");
    final byte[] content = tuple.getBinaryByField("content");
    final String url = tuple.getStringByField("url");

    // start from whatever the metadata already says
    boolean feedDetected = Boolean.parseBoolean(metadata.getFirstValue(isFeedKey));

    // clue 1: the content type contains one of the known mime type clues
    if (!feedDetected) {
        final String contentType = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
        if (contentType != null) {
            for (String clue : mimeTypeClues) {
                if (!contentType.contains(clue)) {
                    continue;
                }
                feedDetected = true;
                metadata.setValue(isFeedKey, "true");
                LOG.info("Feed detected from content type <{}> for {}",
                        contentType, url);
                break;
            }
        }
    }

    // clue 2: the content itself matches the detector
    if (!feedDetected && contentDetector.matches(content)) {
        feedDetected = true;
        metadata.setValue(isFeedKey, "true");
        LOG.info("Feed detected from content: {}", url);
    }

    if (!feedDetected) {
        // not a feed: pass the tuple's values on as they are
        collector.emit(tuple, tuple.getValues());
    } else {
        // a feed is not parsed here, but the parse filters are still applied
        final ParseResult parseResult = new ParseResult();
        parseResult.get(url).setMetadata(metadata);
        parseFilters.filter(url, content, null, parseResult);
        // emit status
        final Values status = new Values(url, metadata, Status.FETCHED);
        collector.emit(Constants.StatusStreamName, tuple, status);
    }
    collector.ack(tuple);
}
 
Example 4
Source File: NewsSiteMapDetectorBolt.java, from the news-crawl project (Apache License 2.0), 4 votes
/**
 * Decides whether the fetched document is a sitemap (and possibly a news
 * sitemap). The flags already present in the metadata are read first; unless
 * both are set, {@code contentDetector} is applied to the raw content and the
 * metadata flags are updated from the index of the first matching clue.
 * Sitemaps are run through the parse filters and reported on the status
 * stream; everything else is passed on unchanged. The tuple is acked at the
 * end in both cases.
 */
@Override
public void execute(Tuple tuple) {
    final Metadata metadata = (Metadata) tuple.getValueByField("metadata");
    final byte[] content = tuple.getBinaryByField("content");
    final String url = tuple.getStringByField("url");

    boolean sitemap = Boolean.parseBoolean(
            metadata.getFirstValue(SiteMapParserBolt.isSitemapKey));
    boolean newsSitemap = Boolean.parseBoolean(
            metadata.getFirstValue(NewsSiteMapParserBolt.isSitemapNewsKey));

    // inspect the content unless both flags are already set
    if (!(newsSitemap && sitemap)) {
        final int firstMatch = contentDetector.getFirstMatch(content);
        if (firstMatch >= 0) {
            // a sitemap, not necessarily a news sitemap
            sitemap = true;
            metadata.setValue(SiteMapParserBolt.isSitemapKey, "true");

            final boolean matchedNewsClue =
                    firstMatch <= NewsSiteMapParserBolt.contentCluesSitemapNewsMatchUpTo;
            if (matchedNewsClue) {
                newsSitemap = true;
                LOG.info("{} detected as news sitemap based on content",
                        url);
                metadata.setValue(NewsSiteMapParserBolt.isSitemapNewsKey,
                        "true");
            }
        }
    }

    if (!sitemap) {
        // not a sitemap: pass the tuple's values on as they are
        collector.emit(tuple, tuple.getValues());
    } else {
        // a sitemap is not parsed here, but the parse filters are still applied
        final ParseResult parseResult = new ParseResult();
        parseResult.get(url).setMetadata(metadata);
        parseFilters.filter(url, content, null, parseResult);
        // emit status
        final Values status = new Values(url, metadata, Status.FETCHED);
        collector.emit(Constants.StatusStreamName, tuple, status);
    }
    collector.ack(tuple);
}