/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universit├Ąt Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package de.tudarmstadt.ukp.dkpro.c4corpus.boilerplate.impl;

import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

import java.util.LinkedList;
import java.util.Set;

/**
 * Data structure representing one block of text in HTML
 *
 * Based on https://github.com/duongphuhiep/justext/ by Duong Phu-Hiep
 *
 * @author Duong Phu-Hiep
 * @author Omnia Zayed
 */
public class Paragraph
        extends LinkedList<Node>
{

    //    private ArrayList<String> textNodes;
    int charsCountInLinks = 0;
    private String classType = "";
    private String contextFreeClass = "";
    private String tagName = "";
    private String rawText = "";

    public Paragraph(Node firstNode)
    {
        add(firstNode);
    }

    public void initRawInfo()
    {
        StringBuilder sb = new StringBuilder();
        for (Node n : this) {
            //            NodeHelper.cleanEmptyElements(n);
            if (n instanceof TextNode) {
                this.setTagName(getPath(n));
                String nodeRawText = ((TextNode) n).text();
                sb.append(Utils.normalizeBreaks(nodeRawText).trim());

                if (NodeHelper.isLink(n)) {
                    charsCountInLinks += nodeRawText.length();
                }
            }
        }

        rawText = sb.toString();
    }

    public int getLinksLength()
    {
        return this.charsCountInLinks;
    }

    public String getClassType()
    {
        return this.classType;
    }

    public void setClassType(String classType)
    {
        this.classType = classType;
    }

    public String getContextFreeClass()
    {
        return this.contextFreeClass;
    }

    public void setContextFreeClass(String contextFreeClass)
    {
        this.contextFreeClass = contextFreeClass;
    }

    public String getTagName()
    {
        return this.tagName;
    }

    public String getPath(Node n)
    {
        String nodePath = "";
        while (n != null) {
            if (n instanceof TextNode) {
                n = n.parent();
            }
            if (NodeHelper.isInnerText(n)) {
                n = n.parent();
            }
            String parentNodeName = n.nodeName();
            nodePath = parentNodeName + "." + nodePath;

            if (!parentNodeName.equalsIgnoreCase("html")) {
                n = n.parent();
            }
            else {
                break;
            }
        }

        return nodePath;
    }

    public void setTagName(String name)
    {
        this.tagName = name;
    }

    public boolean isHeading()
    {
        return this.getTagName().matches(".*\\.h\\d\\.");
    }

    public boolean isBoilerplate()
    {
        return !this.getClassType().equalsIgnoreCase("good");
    }

    public String getRawText()
    {

        return Utils.normalizeBreaks(rawText.trim());
    }

    public void setRawText(String rawText)
    {
        this.rawText = Utils.normalizeBreaks(rawText.trim());

    }

    public int getWordsCount()
    {
        return this.getRawText().split("\\s+").length;
    }

    public int stopwordsCount(Set<String> stopwords)
    {
        int count = 0;

        for (String word : this.getRawText().split("\\s+")) {
            if (stopwords.contains(word.toLowerCase())) {
                count += 1;
            }
        }
        return count;
    }

    public float stopwords_density(Set<String> stopwords)
    {
        int wordsCount = this.getWordsCount();
        if (wordsCount == 0) {
            return 0;
        }

        return this.stopwordsCount(stopwords) / (float) wordsCount;
    }

    /**
     * Links density is the number of characters of the sentence defining the link
     * divide by the length of the whole paragraph.
     * e.g: hi {@code <a ...>omnia</a>} this is an example
     * link density = 5 (length of omnia) / 26 (paragraph length)
     *
     * @return Links density
     */
    public float calcLinksDensity()
    {
        int textLength = this.getRawText().length();
        if (textLength == 0) {
            return 0;
        }

        return this.getLinksLength() / (float) textLength;
    }

}