Java Code Examples for org.apache.storm.tuple.Tuple#getStringByField()

The following examples show how to use org.apache.storm.tuple.Tuple#getStringByField(). You can vote up the examples you like or vote down the ones you don't, and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: RedirectionBolt.java    From storm-crawler with Apache License 2.0 6 votes vote down vote up
/**
 * Re-emits the incoming tuple's url, content, metadata and text, choosing
 * the output stream depending on whether a text is already present.
 *
 * @param tuple input tuple read through its "url", "content", "metadata"
 *              and "text" fields
 */
@Override
public void execute(Tuple tuple) {
    final String url = tuple.getStringByField("url");
    final byte[] content = tuple.getBinaryByField("content");
    final Metadata metadata = (Metadata) tuple.getValueByField("metadata");
    final String text = tuple.getStringByField("text");

    final Values outgoing = new Values(url, content, metadata, text);

    if (!StringUtils.isNotBlank(text)) {
        // No usable text yet: send to the "tika" stream
        // (presumably consumed by a parser - confirm against the topology).
        collector.emit("tika", tuple, outgoing);
    } else {
        // Text already available, so it does not need to be parsed again.
        collector.emit(tuple, outgoing);
    }

    collector.ack(tuple);
}
 
Example 2
Source File: TestBolt.java    From springBoot-study with Apache License 2.0 6 votes vote down vote up
/**
 * Core callback of the bolt: invoked once for every tuple the bolt receives
 * from a stream it subscribes to. Prints the received message together with
 * a running counter.
 *
 * @param tuple input tuple; its "test" field is read as a String
 */
@Override
public void execute(Tuple tuple) {
	// A value can be read by position (tuple.getString(0)) or by field name;
	// the original author recommends the by-name form used here.
	final String message = tuple.getStringByField("test");
	// No real processing is done here, the message is only printed.
	final String line = "Bolt第" + count + "接受的消息:" + message;
	System.out.println(line);
	count++;
	// Every tuple must be acked or failed within the timeout, otherwise the
	// spout replays it. No explicit collector.ack(tuple) is issued here.
	// NOTE(review): the original comment claims BaseRichBolt acks
	// automatically; in Apache Storm automatic acking is the behavior of
	// BaseBasicBolt - confirm which base class this bolt extends.
}
 
Example 3
Source File: InsertBolt.java    From springBoot-study with Apache License 2.0 6 votes vote down vote up
/**
 * Parses the tuple's {@code Constants.FIELD} value as a JSON array of
 * {@code User}, drops every user whose age is below 10 and batch-inserts
 * the remaining ones. Any exception is logged together with the raw message.
 *
 * @param tuple input tuple carrying the JSON payload
 */
@Override
public void execute(Tuple tuple) {
	String payload = tuple.getStringByField(Constants.FIELD);
	try {
		List<User> users = JSON.parseArray(payload, User.class);
		if (users == null || users.isEmpty()) {
			return;
		}
		// Remove the entries whose age is below 10, logging each removal.
		for (Iterator<User> cursor = users.iterator(); cursor.hasNext();) {
			User candidate = cursor.next();
			if (candidate.getAge() < 10) {
				logger.warn("Bolt移除的数据:{}", candidate);
				cursor.remove();
			}
		}
		// Only hit the service when something survived the filter.
		if (!users.isEmpty()) {
			userService.insertBatch(users);
		}
	} catch (Exception e) {
		logger.error("Bolt的数据处理失败!数据:{}", payload, e);
	}
}
 
Example 4
Source File: HeartbeatHandler.java    From DBus with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a {@code HeartbeatPulse} from every message wrapper found in the
 * tuple's emit data and marks it on the reporter, then delegates the tuple
 * to the wrapped handler and acks it.
 *
 * @param tuple input tuple carrying an {@code EmitData} in the DATA field
 */
@Override
public void handle(Tuple tuple) {
    final EmitData data = (EmitData) tuple.getValueByField(Constants.EmitFields.DATA);
    final List<PairWrapper<String, Object>> wrappers = data.get(EmitData.MESSAGE);

    final boolean hasWrappers = wrappers != null && !wrappers.isEmpty();
    if (hasWrappers) {
        for (PairWrapper<String, Object> current : wrappers) {
            final HeartbeatPulse pulse = HeartbeatPulse.build(current.pairs2map());
            if (logger.isDebugEnabled()) {
                // Debug-only details; nothing below is computed unless debug is on.
                final Object offset = data.get(EmitData.OFFSET);
                final HeartBeatPacket packet = HeartBeatPacket.parse(pulse.getPacket());
                final SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
                final String groupId = tuple.getStringByField(Constants.EmitFields.GROUP_FIELD);
                final Object shownOffset = offset == null ? -1 : offset;
                logger.debug("[heartbeat] {} offset:{} ts:{}, time:{}",
                        groupId,
                        shownOffset,
                        packet.getTxtime(),
                        format.format(new Date(packet.getTxtime())));
            }
            reporter.mark(pulse);
        }
    }

    handler.handle(tuple);
    this.listener.getOutputCollector().ack(tuple);
}
 
Example 5
Source File: TestBolt.java    From java-study with Apache License 2.0 6 votes vote down vote up
/**
 * Core callback of the bolt: invoked once for every tuple the bolt receives
 * from a stream it subscribes to. Prints the received message together with
 * a running counter.
 *
 * @param tuple input tuple; its "test" field is read as a String
 */
@Override
public void execute(Tuple tuple) {
	// A value can be read by position (tuple.getString(0)) or by field name;
	// the original author recommends the by-name form used here.
	final String message = tuple.getStringByField("test");
	// No real processing is done here, the message is only printed.
	final String line = "Bolt第" + count + "接受的消息:" + message;
	System.out.println(line);
	count++;
	// Every tuple must be acked or failed within the timeout, otherwise the
	// spout replays it. No explicit collector.ack(tuple) is issued here.
	// NOTE(review): the original comment claims BaseRichBolt acks
	// automatically; in Apache Storm automatic acking is the behavior of
	// BaseBasicBolt - confirm which base class this bolt extends.
}
 
Example 6
Source File: LocalWordCountRedisStormTopology.java    From 163-bigdate-note with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Word-count step: reads one word, increments its running total in
 * {@code map} and emits the word together with the updated total.
 *
 * @param input tuple whose "word" field holds the word to count
 */
@Override
public void execute(Tuple input) {
    final String word = input.getStringByField("word");

    // A word that has not been seen yet starts from zero.
    final Integer previous = map.get(word);
    final int updated = (previous == null ? 0 : previous) + 1;
    map.put(word, updated);

    // Emit the word with the total currently stored for it.
    this.collector.emit(new Values(word, map.get(word)));
}
 
Example 7
Source File: AdvertisingTopology.java    From yahoo-streaming-benchmark with Apache License 2.0 5 votes vote down vote up
/**
 * Looks up the campaign for the tuple's "ad_id" and, when one is found,
 * emits (campaign id, ad id, event time) anchored to the input and acks it.
 * The tuple is failed when the lookup returns {@code null}.
 *
 * @param tuple input tuple with "ad_id" and "event_time" fields
 */
@Override
public void execute(Tuple tuple) {
    final String adId = tuple.getStringByField("ad_id");
    final String campaignId = this.redisAdCampaignCache.execute(adId);

    if (campaignId != null) {
        final Values joined = new Values(
                campaignId,
                tuple.getStringByField("ad_id"),
                tuple.getStringByField("event_time"));
        _collector.emit(tuple, joined);
        _collector.ack(tuple);
    } else {
        // No campaign known for this ad.
        _collector.fail(tuple);
    }
}
 
Example 8
Source File: AdvertisingTopology.java    From yahoo-streaming-benchmark with Apache License 2.0 5 votes vote down vote up
/**
 * Hands the tuple's campaign id and event time to the shared campaign
 * processor, then acks the tuple.
 *
 * @param tuple input tuple with "campaign_id" and "event_time" fields
 */
@Override
public void execute(Tuple tuple) {
    // Arguments are evaluated left to right: campaign id first, then event time.
    this.campaignProcessorCommon.execute(
            tuple.getStringByField("campaign_id"),
            tuple.getStringByField("event_time"));
    _collector.ack(tuple);
}
 
Example 9
Source File: TestBolt.java    From java-study with Apache License 2.0 5 votes vote down vote up
/**
 * Core callback of the bolt: invoked once for every tuple received from a
 * subscribed stream. Lower-cases the "word" field, splits it on single
 * spaces and emits each piece as its own tuple.
 *
 * @param tuple input tuple whose "word" field holds the text to split
 */
@Override
public void execute(Tuple tuple) {
	final String text = tuple.getStringByField("word");
	System.out.println("开始分割单词:" + text);
	final String[] pieces = text.toLowerCase().split(" ");
	for (int i = 0; i < pieces.length; i++) {
		// Forward each piece to the next bolt.
		this.collector.emit(new Values(pieces[i]));
	}
}
 
Example 10
Source File: AdvertisingTopology.java    From streaming-benchmarks with Apache License 2.0 5 votes vote down vote up
/**
 * Hands the tuple's campaign id and event time to the shared campaign
 * processor, then acks the tuple.
 *
 * @param tuple input tuple with "campaign_id" and "event_time" fields
 */
@Override
public void execute(Tuple tuple) {
    // Arguments are evaluated left to right: campaign id first, then event time.
    this.campaignProcessorCommon.execute(
            tuple.getStringByField("campaign_id"),
            tuple.getStringByField("event_time"));
    _collector.ack(tuple);
}
 
Example 11
Source File: SplitSentenceBolt.java    From java-study with Apache License 2.0 5 votes vote down vote up
/**
 * Called once for every tuple this bolt receives from a subscribed stream.
 * Reads the "sentence" field, splits it on single spaces and emits one new
 * tuple per resulting word.
 *
 * @param input tuple whose "sentence" field holds the text to split
 */
public void execute(Tuple input) {
    final String[] tokens = input.getStringByField("sentence").split(" ");
    for (int i = 0; i < tokens.length; i++) {
        // Forward each word to the next bolt.
        this.collector.emit(new Values(tokens[i]));
    }
}
 
Example 12
Source File: ReportBolt.java    From java-study with Apache License 2.0 5 votes vote down vote up
/**
 * Stores the latest count reported for a word and prints the whole
 * {@code counts} collection.
 *
 * @param input tuple with a String "word" field and a Long "count" field
 */
public void execute(Tuple input) {
    final String key = input.getStringByField("word");
    final Long total = input.getLongByField("count");
    this.counts.put(key, total);

    // Print the current state after every update.
    System.out.println("结果:" + this.counts);
}
 
Example 13
Source File: LocalWordCountStormTopology.java    From 163-bigdate-note with GNU General Public License v3.0 5 votes vote down vote up
/**
 * Splits the "line" field on single spaces and emits every resulting word
 * as its own tuple.
 *
 * NOTE(review): the original comment said the line is split on commas, but
 * the delimiter in the code is a space - confirm which one is intended.
 *
 * @param input tuple whose "line" field holds the text to split
 */
@Override
public void execute(Tuple input) {
    final String[] tokens = input.getStringByField("line").split(" ");
    for (int i = 0; i < tokens.length; i++) {
        this.collector.emit(new Values(tokens[i]));
    }
}
 
Example 14
Source File: AdvertisingTopology.java    From streaming-benchmarks with Apache License 2.0 5 votes vote down vote up
/**
 * Looks up the campaign for the tuple's "ad_id" and, when one is found,
 * emits (campaign id, ad id, event time) anchored to the input and acks it.
 * The tuple is failed when the lookup returns {@code null}.
 *
 * @param tuple input tuple with "ad_id" and "event_time" fields
 */
@Override
public void execute(Tuple tuple) {
    final String adId = tuple.getStringByField("ad_id");
    final String campaignId = this.redisAdCampaignCache.execute(adId);

    if (campaignId != null) {
        final Values joined = new Values(
                campaignId,
                tuple.getStringByField("ad_id"),
                tuple.getStringByField("event_time"));
        _collector.emit(tuple, joined);
        _collector.ack(tuple);
    } else {
        // No campaign known for this ad.
        _collector.fail(tuple);
    }
}
 
Example 15
Source File: DummyIndexer.java    From storm-crawler with Apache License 2.0 5 votes vote down vote up
/**
 * Does no indexing: emits (url, metadata, FETCHED) on the status stream,
 * anchored to the input tuple, and acks the tuple.
 *
 * @param tuple input tuple with "url" and "metadata" fields
 */
@Override
public void execute(Tuple tuple) {
    final String url = tuple.getStringByField("url");
    final Metadata metadata = (Metadata) tuple.getValueByField("metadata");

    final Values status = new Values(url, metadata, Status.FETCHED);
    _collector.emit(
            com.digitalpebble.stormcrawler.Constants.StatusStreamName,
            tuple, status);
    _collector.ack(tuple);
}
 
Example 16
Source File: FeedDetectorBolt.java    From news-crawl with Apache License 2.0 4 votes vote down vote up
/**
 * Decides whether the fetched document is a feed. The decision uses, in
 * order: the existing metadata flag, the content type against the
 * configured MIME clues, and finally the content detector. Feeds are run
 * through the parse filters and reported as FETCHED on the status stream;
 * everything else is passed on unchanged. The tuple is always acked.
 *
 * @param tuple input tuple with "metadata", "content" and "url" fields
 */
@Override
public void execute(Tuple tuple) {
    final Metadata metadata = (Metadata) tuple.getValueByField("metadata");
    final byte[] content = tuple.getBinaryByField("content");
    final String url = tuple.getStringByField("url");

    boolean isFeed = Boolean.parseBoolean(metadata.getFirstValue(isFeedKey));

    // First fallback: look for a known clue in the content type.
    if (!isFeed) {
        final String contentType = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
        if (contentType != null) {
            for (String clue : mimeTypeClues) {
                if (!contentType.contains(clue)) {
                    continue;
                }
                isFeed = true;
                metadata.setValue(isFeedKey, "true");
                LOG.info("Feed detected from content type <{}> for {}",
                        contentType, url);
                break;
            }
        }
    }

    // Second fallback: inspect the content itself.
    if (!isFeed && contentDetector.matches(content)) {
        isFeed = true;
        metadata.setValue(isFeedKey, "true");
        LOG.info("Feed detected from content: {}", url);
    }

    if (!isFeed) {
        // Not a feed: pass the tuple's values on untouched.
        collector.emit(tuple, tuple.getValues());
    } else {
        // Feed: skip parsing but still run the parse filters.
        final ParseResult parse = new ParseResult();
        final ParseData parseData = parse.get(url);
        parseData.setMetadata(metadata);
        parseFilters.filter(url, content, null, parse);
        // Report the status.
        collector.emit(Constants.StatusStreamName, tuple,
                new Values(url, metadata, Status.FETCHED));
    }
    collector.ack(tuple);
}
 
Example 17
Source File: NewsSiteMapDetectorBolt.java    From news-crawl with Apache License 2.0 4 votes vote down vote up
/**
 * Decides whether the fetched document is a sitemap, and whether it is a
 * news sitemap. Existing metadata flags are consulted first; unless both
 * are already set, the content detector is used. Sitemaps are run through
 * the parse filters and reported as FETCHED on the status stream;
 * everything else is passed on unchanged. The tuple is always acked.
 *
 * @param tuple input tuple with "metadata", "content" and "url" fields
 */
@Override
public void execute(Tuple tuple) {
    final Metadata metadata = (Metadata) tuple.getValueByField("metadata");
    final byte[] content = tuple.getBinaryByField("content");
    final String url = tuple.getStringByField("url");

    boolean isSitemap = Boolean.parseBoolean(
            metadata.getFirstValue(SiteMapParserBolt.isSitemapKey));
    boolean isNewsSitemap = Boolean.parseBoolean(
            metadata.getFirstValue(NewsSiteMapParserBolt.isSitemapNewsKey));

    // Only consult the content when at least one of the flags is not set.
    if (!(isNewsSitemap && isSitemap)) {
        final int match = contentDetector.getFirstMatch(content);
        if (match >= 0) {
            // A sitemap, though not necessarily a news sitemap.
            isSitemap = true;
            metadata.setValue(SiteMapParserBolt.isSitemapKey, "true");
            final boolean newsClue =
                    match <= NewsSiteMapParserBolt.contentCluesSitemapNewsMatchUpTo;
            if (newsClue) {
                isNewsSitemap = true;
                LOG.info("{} detected as news sitemap based on content",
                        url);
                metadata.setValue(NewsSiteMapParserBolt.isSitemapNewsKey,
                        "true");
            }
        }
    }

    if (!isSitemap) {
        // Not a sitemap: pass the tuple's values on untouched.
        collector.emit(tuple, tuple.getValues());
    } else {
        // Sitemap: skip parsing but still run the parse filters.
        final ParseResult parse = new ParseResult();
        final ParseData parseData = parse.get(url);
        parseData.setMetadata(metadata);
        parseFilters.filter(url, content, null, parse);
        // Report the status.
        collector.emit(Constants.StatusStreamName, tuple,
                new Values(url, metadata, Status.FETCHED));
    }
    collector.ack(tuple);
}
 
Example 18
Source File: IndexerBolt.java    From storm-crawler with Apache License 2.0 4 votes vote down vote up
/**
 * Indexes one document into Solr.
 *
 * Documents rejected by {@code filterDocument} are not indexed but are
 * still reported as FETCHED on the status stream and acked. For the others
 * a {@code SolrInputDocument} is built from the text, the URL and the
 * selected metadata, and added through the Solr client. On success the
 * tuple is reported as FETCHED and acked; on any exception it is failed.
 *
 * @param tuple input tuple with "url", "metadata" and "text" fields
 */
@Override
public void execute(Tuple tuple) {

    String url = tuple.getStringByField("url");

    Metadata metadata = (Metadata) tuple.getValueByField("metadata");
    String text = tuple.getStringByField("text");

    boolean keep = filterDocument(metadata);
    if (!keep) {
        eventCounter.scope("Filtered").incrBy(1);
        // treat it as successfully processed even if
        // we do not index it
        _collector.emit(StatusStreamName, tuple, new Values(url, metadata,
                Status.FETCHED));
        _collector.ack(tuple);
        return;
    }

    try {
        SolrInputDocument doc = new SolrInputDocument();

        // index text content
        if (fieldNameForText() != null) {
            doc.addField(fieldNameForText(), trimText(text));
        }

        // url
        if (fieldNameForURL() != null) {
            // Distinguish the value used for indexing
            // from the one used for the status
            String normalisedurl = valueForURL(tuple);
            doc.addField(fieldNameForURL(), normalisedurl);
        }

        // select which metadata to index; iterating the entries avoids a
        // second map lookup per key
        Map<String, String[]> keyVals = filterMetadata(metadata);
        for (Map.Entry<String, String[]> entry : keyVals.entrySet()) {
            String fieldName = entry.getKey();
            for (String value : entry.getValue()) {
                doc.addField(fieldName, value);
            }
        }

        connection.getClient().add(doc);

        eventCounter.scope("Indexed").incrBy(1);

        _collector.emit(StatusStreamName, tuple, new Values(url, metadata,
                Status.FETCHED));
        _collector.ack(tuple);

    } catch (Exception e) {
        // Pass the message for the placeholder and the exception as the
        // trailing argument; a lone Throwable argument is taken as the
        // exception to log and would leave the {} unfilled.
        LOG.error("Send update request to SOLR failed due to {}",
                e.getMessage(), e);
        _collector.fail(tuple);
    }
}
 
Example 19
Source File: StatusUpdaterBolt.java    From storm-crawler with Apache License 2.0 4 votes vote down vote up
/**
 * Bulk-request callback: walks over every item of the bulk response and
 * acks or fails the tuples waiting under the item's ID in {@code waitAck}.
 * A version conflict is counted but not treated as a failure. Counters and
 * a summary line are logged once all items have been handled.
 *
 * @param executionId identifier of the bulk execution, used in log output
 * @param request     the bulk request that was sent
 * @param response    the bulk response whose items are processed
 */
@Override
public void afterBulk(long executionId, BulkRequest request,
        BulkResponse response) {
    LOG.debug("afterBulk [{}] with {} responses", executionId,
            request.numberOfActions());
    final long tookMillis = response.getTook().getMillis();
    eventCounter.scope("bulks_received").incrBy(1);
    eventCounter.scope("bulk_msec").incrBy(tookMillis);
    final Iterator<BulkItemResponse> items = response.iterator();
    int seen = 0;
    int ackedCount = 0;
    int failedCount = 0;

    synchronized (waitAck) {
        while (items.hasNext()) {
            final BulkItemResponse item = items.next();
            seen++;
            final String id = item.getId();
            final BulkItemResponse.Failure failure = item.getFailure();

            boolean failed = false;
            if (failure != null) {
                final boolean conflict = failure.getStatus().equals(
                        RestStatus.CONFLICT);
                if (conflict) {
                    // already discovered
                    eventCounter.scope("doc_conflicts").incrBy(1);
                    LOG.debug("Doc conflict ID {}", id);
                } else {
                    LOG.error("Update ID {}, failure: {}", id, failure);
                    failed = true;
                }
            }

            final List<Tuple> pending = waitAck.getIfPresent(id);
            if (pending == null) {
                LOG.warn("Could not find unacked tuple for {}", id);
                continue;
            }

            LOG.debug("Acked {} tuple(s) for ID {}", pending.size(), id);
            for (Tuple waiting : pending) {
                if (failed) {
                    failedCount++;
                    _collector.fail(waiting);
                } else {
                    final String url = waiting.getStringByField("url");
                    ackedCount++;
                    // ack and put in cache
                    LOG.debug("Acked {} with ID {}", url, id);
                    super.ack(waiting, url);
                }
            }
            waitAck.invalidate(id);
        }

        LOG.info(
                "Bulk response [{}] : items {}, waitAck {}, acked {}, failed {}",
                executionId, seen, waitAck.size(), ackedCount, failedCount);
        // List whatever is still waiting, but only when debug logging is on.
        if (waitAck.size() > 0 && LOG.isDebugEnabled()) {
            for (String stillWaiting : waitAck.asMap().keySet()) {
                LOG.debug(
                        "Still in wait ack after bulk response [{}] => {}",
                        executionId, stillWaiting);
            }
        }
    }
}
 
Example 20
Source File: IndexerBolt.java    From storm-crawler with Apache License 2.0 4 votes vote down vote up
/**
 * Builds an index request for one document and hands it to the bulk
 * processor.
 *
 * Documents rejected by {@code filterDocument} are not indexed but are
 * still reported as FETCHED on the status stream and acked. For the others
 * a JSON document is built from the text, the normalised URL and the
 * selected metadata; its ID is the SHA-256 hex digest of the normalised
 * URL. The tuple is not acked here: it is stored in {@code waitAck} under
 * the document ID. If building the document throws an {@code IOException}
 * the tuple is failed.
 *
 * @param tuple input tuple with "url", "metadata" and "text" fields
 */
@Override
public void execute(Tuple tuple) {

    String url = tuple.getStringByField("url");

    // Distinguish the value used for indexing
    // from the one used for the status
    String normalisedurl = valueForURL(tuple);

    LOG.info("Indexing {} as {}", url, normalisedurl);

    Metadata metadata = (Metadata) tuple.getValueByField("metadata");
    String text = tuple.getStringByField("text");

    boolean keep = filterDocument(metadata);
    if (!keep) {
        LOG.info("Filtered {}", url);
        eventCounter.scope("Filtered").incrBy(1);
        // treat it as successfully processed even if
        // we do not index it
        _collector.emit(StatusStreamName, tuple, new Values(url, metadata,
                Status.FETCHED));
        _collector.ack(tuple);
        return;
    }

    // Computed once and used both as the request ID and as the waitAck key.
    String docID = org.apache.commons.codec.digest.DigestUtils
            .sha256Hex(normalisedurl);

    try {
        XContentBuilder builder = jsonBuilder().startObject();

        // display text of the document?
        if (fieldNameForText() != null) {
            builder.field(fieldNameForText(), trimText(text));
        }

        // send URL as field?
        if (fieldNameForURL() != null) {
            builder.field(fieldNameForURL(), normalisedurl);
        }

        // which metadata to display? Single values are written as a plain
        // field, multiple values as an array, empty arrays are skipped.
        Map<String, String[]> keyVals = filterMetadata(metadata);
        for (Map.Entry<String, String[]> entry : keyVals.entrySet()) {
            String fieldName = entry.getKey();
            String[] values = entry.getValue();
            if (values.length == 1) {
                builder.field(fieldName, values[0]);
            } else if (values.length > 1) {
                builder.array(fieldName, values);
            }
        }

        builder.endObject();

        IndexRequest indexRequest = new IndexRequest(getIndexName(metadata))
                .source(builder).id(docID);

        DocWriteRequest.OpType optype = create
                ? DocWriteRequest.OpType.CREATE
                : DocWriteRequest.OpType.INDEX;
        indexRequest.opType(optype);

        if (pipeline != null) {
            indexRequest.setPipeline(pipeline);
        }

        connection.getProcessor().add(indexRequest);

        eventCounter.scope("Indexed").incrBy(1);
        perSecMetrics.scope("Indexed").update(1);

        synchronized (waitAck) {
            waitAck.put(docID, tuple);
        }
    } catch (IOException e) {
        LOG.error("Error building document for ES", e);
        // do not send to status stream so that it gets replayed
        _collector.fail(tuple);
        synchronized (waitAck) {
            waitAck.invalidate(docID);
        }
    }
}