/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.storm.crawler.elasticsearch.persistence;

import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHits;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;

import com.digitalpebble.storm.crawler.Metadata;
import com.digitalpebble.storm.crawler.elasticsearch.ElasticSearchConnection;
import com.digitalpebble.storm.crawler.util.ConfUtils;
import com.digitalpebble.storm.crawler.util.URLPartitioner;

/**
 * Overly simplistic spout implementation which pulls URLs from an ES index.
 * Doesn't do anything about data locality or sharding.
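 *
 * A minimal configuration sketch; the values shown are the defaults read in
 * open():
 *
 * <pre>
 * es.status.index.name: "status"
 * es.status.doc.type: "status"
 * es.status.max.inflight.urls.per.bucket: 1
 * </pre>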
 **/
public class ElasticSearchSpout extends BaseRichSpout {

    private static final Logger LOG = LoggerFactory
            .getLogger(ElasticSearchSpout.class);

    private static final String ESBoltType = "status";

    private static final String ESStatusIndexNameParamName = "es.status.index.name";
    private static final String ESStatusDocTypeParamName = "es.status.doc.type";
    private static final String ESStatusMaxInflightParamName = "es.status.max.inflight.urls.per.bucket";

    private String indexName;
    private String docType;

    private SpoutOutputCollector _collector;

    private Client client;

    private final int bufferSize = 100;

    private Queue<Values> buffer = new LinkedList<Values>();

    private int lastStartOffset = 0;

    private URLPartitioner partitioner;

    private int maxInFlightURLsPerBucket = -1;

    /** Keeps a count of the URLs being processed per host/domain/IP **/
    private Map<String, Integer> inFlightTracker = new HashMap<String, Integer>();

    // maps a URL being processed to its politeness bucket (host / domain /
    // IP)
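    // e.g. "http://www.lemonde.fr/" -> "lemonde.fr" when partitioning by
    // domain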
    private Map<String, String> beingProcessed = new HashMap<String, String>();

    @Override
    public void open(Map stormConf, TopologyContext context,
            SpoutOutputCollector collector) {
        indexName = ConfUtils.getString(stormConf, ESStatusIndexNameParamName,
                "status");
        docType = ConfUtils.getString(stormConf, ESStatusDocTypeParamName,
                "status");
        maxInFlightURLsPerBucket = ConfUtils.getInt(stormConf,
                ESStatusMaxInflightParamName, 1);
        try {
            client = ElasticSearchConnection.getClient(stormConf, ESBoltType);
        } catch (Exception e1) {
            LOG.error("Can't connect to ElasticSearch", e1);
            throw new RuntimeException(e1);
        }

        partitioner = new URLPartitioner();
        partitioner.configure(stormConf);

        _collector = collector;
    }

    @Override
    public void close() {
        if (client != null)
            client.close();
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("url", "metadata"));
    }

    @Override
    public void nextTuple() {
        // is there anything in the buffer?
        if (!buffer.isEmpty()) {
            Values fields = buffer.remove();
            String url = fields.get(0).toString();
            Metadata metadata = (Metadata) fields.get(1);

            String partitionKey = partitioner.getPartition(url, metadata);

            // check whether we already have too many tuples in flight for
            // this partition key

            if (maxInFlightURLsPerBucket != -1) {
                Integer inflightforthiskey = inFlightTracker.get(partitionKey);
                int currentCount = inflightforthiskey == null ? 0
                        : inflightforthiskey.intValue();
                if (currentCount >= maxInFlightURLsPerBucket) {
                    // too many URLs in flight for this bucket: put the tuple
                    // back at the end of the queue and try again later
                    buffer.add(fields);
                    return;
                }
                inFlightTracker.put(partitionKey, currentCount + 1);
            }

            beingProcessed.put(url, partitionKey);

        // the URL doubles as the message ID so that ack/fail can look up
        // the partition key in beingProcessed
        this._collector.emit(fields, url);
            return;
        }
        // re-populate the buffer
        populateBuffer();
    }

    /** run a query on ES to populate the internal buffer **/
    private void populateBuffer() {
        // TODO cap the number of results per shard
        // assuming that the sharding of status URLs is done
        // based on the hostname, domain or anything else
        // which is useful for politeness

        // TODO cap the results per host or domain

        Date now = new Date();

        // TODO use scrolls instead?
        // @see
        // http://www.elasticsearch.org/guide/en/elasticsearch/client/java-api/current/search.html#scrolling
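        // A rough sketch of what a scroll-based alternative might look
        // like, assuming the ES 1.x Java API and an import of
        // org.elasticsearch.common.unit.TimeValue (untested):
        //
        // SearchResponse scrollResp = client.prepareSearch(indexName)
        //         .setTypes(docType)
        //         .setSearchType(SearchType.SCAN)
        //         .setScroll(new TimeValue(60000))
        //         .setQuery(QueryBuilders.rangeQuery("nextFetchDate").lte(now))
        //         .setSize(this.bufferSize).execute().actionGet();
        // scrollResp = client.prepareSearchScroll(scrollResp.getScrollId())
        //         .setScroll(new TimeValue(60000)).execute().actionGet();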
        SearchResponse response = client
                .prepareSearch(indexName)
                .setTypes(docType)
                .setSearchType(SearchType.DFS_QUERY_THEN_FETCH)
                .setQuery(QueryBuilders.rangeQuery("nextFetchDate").lte(now))
                .setFrom(lastStartOffset).setSize(this.bufferSize)
                .setExplain(false).execute().actionGet();

        SearchHits hits = response.getHits();
        int numhits = hits.getHits().length;

        // no more results?
        if (numhits == 0)
            lastStartOffset = 0;
        else
            lastStartOffset += numhits;

        // filter the results so that we don't include URLs which are
        // already being processed
        for (int i = 0; i < hits.getHits().length; i++) {
            Map<String, Object> keyValues = hits.getHits()[i].sourceAsMap();
            String url = (String) keyValues.get("url");

            // is already being processed - skip it!
            if (beingProcessed.containsKey(url))
                continue;

            String mdAsString = (String) keyValues.get("metadata");
            Metadata metadata = new Metadata();
            if (mdAsString != null) {
                // parse the string and generate the MD accordingly
                // url.path: http://www.lemonde.fr/
                // depth: 1
                String[] kvs = mdAsString.split("\n");
                for (String pair : kvs) {
                    // split on the first ": " only
                    String[] kv = pair.split(": ", 2);
                    if (kv.length != 2) {
                        LOG.info("Invalid key value pair {}", pair);
                        continue;
                    }
                    metadata.addValue(kv[0], kv[1]);
                }
            }
            buffer.add(new Values(url, metadata));
        }
    }

    @Override
    public void ack(Object msgId) {
        super.ack(msgId);
        String partitionKey = beingProcessed.remove(msgId);
        decrementPartitionKey(partitionKey);
    }

    @Override
    public void fail(Object msgId) {
        super.fail(msgId);
        String partitionKey = beingProcessed.remove(msgId);
        decrementPartitionKey(partitionKey);
    }

    private void decrementPartitionKey(String partitionKey) {
        if (partitionKey == null)
            return;
        Integer currentValue = this.inFlightTracker.get(partitionKey);
        if (currentValue == null)
            return;
        int currentVal = currentValue.intValue() - 1;
        if (currentVal <= 0) {
            // remove the entry so that the map does not grow indefinitely
            this.inFlightTracker.remove(partitionKey);
        } else {
            this.inFlightTracker.put(partitionKey, currentVal);
        }
    }

}