/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.digitalpebble.stormcrawler.parse;

import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

import com.digitalpebble.stormcrawler.util.ConfUtils;

/**
 * Filters the text extracted from HTML documents, used by JSoupParserBolt.
 * Configured with optional inclusion patterns based on <a
 * href="https://jsoup.org/cookbook/extracting-data/selector-syntax"> JSoup
 * selectors</a>, as well as a list of tags to be excluded.
 * 
 * Replaces {@link ContentFilter}.
 * 
 * The first matching inclusion pattern is used or the whole document if no
 * expressions are configured or no match has been found.
 * 
 * The TextExtractor can be configured as follows:
 * 
 * <pre>
 * {@code 
 *   textextractor.include.pattern:
 *    - DIV[id="maincontent"]
 *    - DIV[itemprop="articleBody"]
 *    - ARTICLE
 * 
 *   textextractor.exclude.tags:
 *    - STYLE
 *    - SCRIPT
 *    }
 * </pre>
 * 
 * @since 1.13
 **/
public class TextExtractor {

    public final static String INCLUDE_PARAM_NAME = "textextractor.include.pattern";
    public final static String EXCLUDE_PARAM_NAME = "textextractor.exclude.tags";

    private List<String> inclusionPatterns;
    private HashSet<String> excludedTags;

    public TextExtractor(Map stormConf) {
        inclusionPatterns = ConfUtils.loadListFromConf(INCLUDE_PARAM_NAME,
                stormConf);
        excludedTags = new HashSet<String>();
        ConfUtils.loadListFromConf(EXCLUDE_PARAM_NAME, stormConf)
                .forEach((s) -> excludedTags.add(s.toLowerCase()));
    }

    public String text(Element element) {
        // no patterns at all - return the text from the whole document
        if (inclusionPatterns.size() == 0 && excludedTags.size() == 0) {
            return _text(element);
        }

        Elements matches = new Elements();

        for (String pattern : inclusionPatterns) {
            matches = element.select(pattern);
            if (!matches.isEmpty())
                break;
        }

        // if nothing matches or no patterns were defined use the whole doc
        if (matches.isEmpty()) {
            matches.add(element);
        }

        final StringBuilder accum = new StringBuilder();

        for (Element node : matches) {
            accum.append(_text(node)).append("\n");
        }

        return accum.toString().trim();
    }

    private String _text(Node node) {
        final StringBuilder accum = new StringBuilder();
        NodeTraversor.traverse(new NodeVisitor() {

            private Node excluded = null;

            public void head(Node node, int depth) {
                if (excluded == null && node instanceof TextNode) {
                    TextNode textNode = (TextNode) node;
                    appendNormalisedText(accum, textNode);
                } else if (node instanceof Element) {
                    Element element = (Element) node;
                    if (excludedTags.contains(element.tagName())) {
                        excluded = element;
                    }
                    if (accum.length() > 0
                            && (element.isBlock() || element.tag().getName()
                                    .equals("br"))
                            && !lastCharIsWhitespace(accum))
                        accum.append(' ');
                }
            }

            public void tail(Node node, int depth) {
                // make sure there is a space between block tags and immediately
                // following text nodes <div>One</div>Two should be "One Two".
                if (node instanceof Element) {
                    Element element = (Element) node;
                    if (element == excluded) {
                        excluded = null;
                    }
                    if (element.isBlock()
                            && (node.nextSibling() instanceof TextNode)
                            && !lastCharIsWhitespace(accum))
                        accum.append(' ');
                }

            }
        }, node);
        return accum.toString().trim();
    }

    private static void appendNormalisedText(StringBuilder accum,
            TextNode textNode) {
        String text = textNode.getWholeText();

        if (preserveWhitespace(textNode.parent())
                || textNode instanceof CDataNode)
            accum.append(text);
        else
            StringUtil.appendNormalisedWhitespace(accum, text,
                    lastCharIsWhitespace(accum));
    }

    static boolean preserveWhitespace(Node node) {
        // looks only at this element and five levels up, to prevent recursion &
        // needless stack searches
        if (node != null && node instanceof Element) {
            Element el = (Element) node;
            int i = 0;
            do {
                if (el.tag().preserveWhitespace())
                    return true;
                el = el.parent();
                i++;
            } while (i < 6 && el != null);
        }
        return false;
    }

    static boolean lastCharIsWhitespace(StringBuilder sb) {
        return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' ';
    }

}