/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.storm.crawler.elasticsearch.persistence;

import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHits;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;

import com.digitalpebble.storm.crawler.Metadata;
import com.digitalpebble.storm.crawler.elasticsearch.ElasticSearchConnection;
import com.digitalpebble.storm.crawler.util.ConfUtils;
import com.digitalpebble.storm.crawler.util.URLPartitioner;

/**
 * Overly simplistic spout implementation which pulls URLs from an ES index.
 * Doesn't do anything about data locality or sharding.
 **/
public class ElasticSearchSpout extends BaseRichSpout {

    private static final Logger LOG = LoggerFactory

    private static final String ESBoltType = "status";

    private static final String ESStatusIndexNameParamName = "es.status.index.name";
    private static final String ESStatusDocTypeParamName = "es.status.doc.type";
    private static final String ESStatusMaxInflightParamName = "es.status.max.inflight.urls.per.bucket";

    private String indexName;
    private String docType;

    private SpoutOutputCollector _collector;

    private Client client;

    private final int bufferSize = 100;

    private Queue<Values> buffer = new LinkedList<Values>();

    private int lastStartOffset = 0;

    private URLPartitioner partitioner;

    private int maxInFlightURLsPerBucket = -1;

    /** Keeps a count of the URLs being processed per host/domain/IP **/
    private Map<String, Integer> inFlightTracker = new HashMap<String, Integer>();

    // URL / politeness bucket (hostname / domain etc...)
    private Map<String, String> beingProcessed = new HashMap<String, String>();

    /**
     * Initialises the spout: reads the status index name, the document type
     * and the per-bucket in-flight limit from the configuration, obtains an
     * Elasticsearch client, creates the URL partitioner and keeps a reference
     * to the output collector.
     *
     * NOTE(review): this method looks truncated in the file - see the
     * individual notes below.
     *
     * @param stormConf
     *            configuration map the settings are read from
     * @param context
     *            topology context (not used by the visible code)
     * @param collector
     *            collector stored for later use when emitting tuples
     * @throws RuntimeException
     *             wrapping the original exception when the Elasticsearch
     *             client cannot be obtained
     */
    public void open(Map stormConf, TopologyContext context,
            SpoutOutputCollector collector) {
        // NOTE(review): the next two calls end on a trailing comma - the
        // default-value argument and the closing ");" appear to be missing
        indexName = ConfUtils.getString(stormConf, ESStatusIndexNameParamName,
        docType = ConfUtils.getString(stormConf, ESStatusDocTypeParamName,
        // limit of URLs in flight per partition key, 1 when not configured
        maxInFlightURLsPerBucket = ConfUtils.getInt(stormConf,
                ESStatusMaxInflightParamName, 1);
        try {
            client = ElasticSearchConnection.getClient(stormConf, ESBoltType);
        } catch (Exception e1) {
            // log the failure, then rethrow unchecked with the cause preserved
            LOG.error("Can't connect to ElasticSearch", e1);
            throw new RuntimeException(e1);
        // NOTE(review): the closing brace of the catch block appears to be
        // missing here

        partitioner = new URLPartitioner();

        _collector = collector;
        // NOTE(review): the closing brace of the method appears to be missing

    public void close() {
        if (client != null)

    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("url", "metadata"));

    public void nextTuple() {
        // have anything in the buffer?
        if (!buffer.isEmpty()) {
            Values fields = buffer.remove();
            String url = fields.get(0).toString();
            Metadata metadata = (Metadata) fields.get(1);

            String partitionKey = partitioner.getPartition(url, metadata);

            // check whether we already have too tuples in flight for this
            // partition key

            if (maxInFlightURLsPerBucket != -1) {
                Integer inflightforthiskey = inFlightTracker.get(partitionKey);
                if (inflightforthiskey == null)
                    inflightforthiskey = new Integer(0);
                if (inflightforthiskey.intValue() >= maxInFlightURLsPerBucket) {
                    // do it later! left it out of the queue for now
                int currentCount = inflightforthiskey.intValue();
                inFlightTracker.put(partitionKey, ++currentCount);

            beingProcessed.put(url, partitionKey);

            this._collector.emit(fields, url);
        // re-populate the buffer

    /**
     * Runs a query on ES to populate the internal buffer.
     *
     * Each hit is read as a map; its "url" value and its "metadata" value (a
     * string of "key: value" pairs separated by newlines) are turned into a
     * (url, metadata) entry which is appended to the buffer. lastStartOffset
     * is advanced by the number of hits returned.
     *
     * NOTE(review): this method looks truncated in the file - the search
     * request itself and several control-flow statements appear to be
     * missing, see the individual notes below.
     **/
    private void populateBuffer() {
        // TODO cap the number of results per shard
        // assuming that the sharding of status URLs is done
        // based on the hostname domain or anything else
        // which is useful for politeness

        // TODO cap the results per host or domain

        // NOTE(review): not used by the visible code - presumably meant as a
        // bound in the query, confirm
        Date now = new Date();

        // TODO use scrolls instead?
        // @see
        // http://www.elasticsearch.org/guide/en/elasticsearch/client/java-api/current/search.html#scrolling
        // NOTE(review): the request builder chain which should follow
        // "client" (index, type, query, paging, execution) is missing; only
        // the commented-out post filter is left
        SearchResponse response = client
                // .setPostFilter(
                // FilterBuilders.rangeFilter("age").from(12).to(18))

        SearchHits hits = response.getHits();
        int numhits = hits.getHits().length;

        // no more results?
        // NOTE(review): an "else" appears to be missing between the next two
        // statements
        if (numhits == 0)
            lastStartOffset = 0;
            lastStartOffset += numhits;

        // filter results so that we don't include URLs which are already
        // being processed or skip those for which we already have enough
        for (int i = 0; i < hits.getHits().length; i++) {
            Map<String, Object> keyValues = hits.getHits()[i].sourceAsMap();
            String url = (String) keyValues.get("url");

            // is already being processed - skip it!
            // NOTE(review): the statement doing the skip (presumably
            // "continue;") is missing after this condition
            if (beingProcessed.containsKey(url))

            String mdAsString = (String) keyValues.get("metadata");
            Metadata metadata = new Metadata();
            if (mdAsString != null) {
                // parse the string and generate the MD accordingly
                // url.path: http://www.lemonde.fr/
                // depth: 1
                String[] kvs = mdAsString.split("\n");
                for (String pair : kvs) {
                    String[] kv = pair.split(": ");
                    // anything which does not split into exactly a key and a
                    // value is reported
                    // NOTE(review): the skip of the invalid pair and the
                    // closing brace of this "if" appear to be missing; as
                    // written kv[1] is read even when kv has fewer than two
                    // elements
                    if (kv.length != 2) {
                        LOG.info("Invalid key value pair {}", pair);
                    metadata.addValue(kv[0], kv[1]);
            // NOTE(review): the closing braces of the inner loop and of the
            // null check appear to be missing before this line
            buffer.add(new Values(url, metadata));
        // NOTE(review): the closing braces of the outer loop and of the
        // method appear to be missing

    public void ack(Object msgId) {
        String partitionKey = beingProcessed.remove(msgId);

    public void fail(Object msgId) {
        String partitionKey = beingProcessed.remove(msgId);

    private void decrementPartitionKey(String partitionKey) {
        if (partitionKey == null)
        Integer currentValue = this.inFlightTracker.get(partitionKey);
        if (currentValue == null)
        int currentVal = currentValue.intValue();
        this.inFlightTracker.put(partitionKey, currentVal);