package clusterer;

import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import net.sf.javaml.core.Dataset;
import net.sf.javaml.core.DefaultDataset;
import net.sf.javaml.core.DenseInstance;
import net.sf.javaml.core.Instance;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.commons.lang3.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Computes per-page word-frequency vectors for every {@code *.html} file found
 * (recursively) under a directory, and exposes them as Java-ML datasets.
 * <p>
 * Two vocabularies are built: BODY (words scraped from {@code <body>}) and THAL
 * (words scraped from titles, headings, anchors and lists). For each page the
 * frequency of every vocabulary word is then computed over the page's full text.
 * <p>
 * NOTE: all state lives in static fields that the constructor re-initialises, so
 * every instance shares (and resets) the same state. Do not use several instances
 * side by side.
 */
@SuppressWarnings("unused")
public class WordFrequency {

	public enum WordsCategoryClasses {
		BODY, THAL
	}

	static List<String> stopWords;
	static List<String> stopKeywords;
	static List<String> textualContentFromTitle;
	static List<String> textualContentFromHeadings;
	static List<String> textualContentFromTables;
	static List<String> textualContentFromLists;
	static List<String> textualContentFromFont;
	static List<String> textualContentFromBody;
	static List<String> textualContentFromAnchors;
	static List<String> textualContentFromThal;
	static Map<String, LinkedHashMap<String, BigDecimal>> wordsBodyFrequenciesMap;
	static Map<String, LinkedHashMap<String, BigDecimal>> wordsThalFrequenciesMap;
	static Dataset dataBody;
	static Dataset dataThal;
	static String directory;

	/**
	 * Resets the shared state and sets the directory that holds the HTML pages.
	 * 
	 * @param dir
	 *            directory that is scanned recursively for {@code *.html} files
	 */
	public WordFrequency(String dir) {
		directory = dir;
		stopWords = new LinkedList<String>();
		stopKeywords = new LinkedList<String>();
		textualContentFromTitle = new LinkedList<String>();
		textualContentFromHeadings = new LinkedList<String>();
		// every vocabulary list must be non-null: the scraper calls contains()/add()
		// on whichever list it is handed
		textualContentFromTables = new LinkedList<String>();
		textualContentFromLists = new LinkedList<String>();
		textualContentFromFont = new LinkedList<String>();
		textualContentFromBody = new LinkedList<String>();
		textualContentFromAnchors = new LinkedList<String>();
		textualContentFromThal = new LinkedList<String>();
		wordsBodyFrequenciesMap = new LinkedHashMap<String, LinkedHashMap<String, BigDecimal>>();
		wordsThalFrequenciesMap = new LinkedHashMap<String, LinkedHashMap<String, BigDecimal>>();
		dataBody = new DefaultDataset();
		dataThal = new DefaultDataset();
	}

	/**
	 * run the words frequencies calculation: loads the stop words, builds the BODY
	 * and THAL vocabularies and computes both frequency maps
	 * 
	 * @throws ParserConfigurationException
	 *             if an XML parser cannot be created
	 * @throws SAXException
	 *             if a page cannot be parsed
	 * @throws IOException
	 *             if a stop-word file or a page cannot be read
	 */
	public void run() throws ParserConfigurationException, SAXException, IOException {

		// PREPROCESSING STEP: gets the stopwords from txt files
		getStopWords();

		// PREPROCESSING STEP: creates tags vector and texts vectors by class
		extractTextualContentFromBody();
		extractTextualContentFromThal();

		// ELABORATION STEP: calculates word frequency maps
		wordsBodyFrequenciesMap = calculateWordsBodyFrequency();
		wordsThalFrequenciesMap = calculateWordsThalFrequency();

		System.out.println("[LOG] words body vector: " + textualContentFromBody.size() + " words");
		System.out.println("[LOG] words thal vector: " + textualContentFromThal.size() + " words");

	}

	/**
	 * get the WordsThal Frequencies map
	 * 
	 * @return the wordsThalFrequenciesMap (page file name -> word -> frequency)
	 */
	public Map<String, LinkedHashMap<String, BigDecimal>> getWordsThalFrequenciesMap() {
		return wordsThalFrequenciesMap;
	}

	/**
	 * get the WordsBody Frequencies map
	 * 
	 * @return the wordsBodyFrequenciesMap (page file name -> word -> frequency)
	 */
	public Map<String, LinkedHashMap<String, BigDecimal>> getWordsBodyFrequenciesMap() {
		return wordsBodyFrequenciesMap;
	}

	/**
	 * print out the Words Frequencies map
	 * 
	 * @param map
	 *            the frequency map to dump on standard output
	 */
	private static void printMap(Map<String, LinkedHashMap<String, BigDecimal>> map) {

		System.out.println("Words Vector");
		System.out.print("keys: " + map.size() + ", values: ");

		// only the size of the first vector is reported
		for (LinkedHashMap<String, BigDecimal> firstVector : map.values()) {
			System.out.println(firstVector.size());
			break;
		}

		for (Map.Entry<String, LinkedHashMap<String, BigDecimal>> page : map.entrySet()) {
			System.out.println(page.getKey());
			System.out.println("\t" + page.getValue());
		}

		System.out.println();

	}

	/**
	 * extract textual content from {@code <body>}
	 * 
	 * @throws ParserConfigurationException
	 * @throws SAXException
	 * @throws IOException
	 */
	private static void extractTextualContentFromBody() throws ParserConfigurationException, SAXException, IOException {

		extractTextualContentFromTag("body", textualContentFromBody);

		System.out.println("Textual Content from Body\n" + textualContentFromBody);
		System.out.println("Size: " + textualContentFromBody.size() + "\n");

	}

	/**
	 * extract textual content from titles, headings, anchors, and lists
	 * 
	 * @throws ParserConfigurationException
	 * @throws SAXException
	 * @throws IOException
	 */
	private static void extractTextualContentFromThal() throws ParserConfigurationException, SAXException, IOException {

		// the tag order defines the order of the words in the THAL vocabulary
		String[] thalTags = { "title", "h1", "h2", "h3", "h4", "h5", "h6", "a", "li", "ol", "ul" };
		for (String tag : thalTags) {
			extractTextualContentFromTag(tag, textualContentFromThal);
		}

		System.out.println("Textual Content from THAL\n" + textualContentFromThal);
		System.out.println("Size: " + textualContentFromThal.size() + "\n");

	}

	/**
	 * extract textual content from {@code <a>}
	 * 
	 * @throws ParserConfigurationException
	 * @throws SAXException
	 * @throws IOException
	 */
	private static void extractTextualContentFromAnchors()
			throws ParserConfigurationException, SAXException, IOException {

		extractTextualContentFromTag("a", textualContentFromAnchors);

		System.out.println("Textual Content from Anchors\n" + textualContentFromAnchors);
		System.out.println("Size: " + textualContentFromAnchors.size() + "\n");

	}

	/**
	 * extract textual content from {@code <strong>}, {@code <b>}, {@code <i>},
	 * {@code <u>}
	 * 
	 * @throws ParserConfigurationException
	 * @throws SAXException
	 * @throws IOException
	 */
	private static void extractTextualContentFromFont() throws ParserConfigurationException, SAXException, IOException {

		String[] fontTags = { "strong", "b", "i", "u" };
		for (String tag : fontTags) {
			extractTextualContentFromTag(tag, textualContentFromFont);
		}
	}

	/**
	 * extract textual content from lists
	 * 
	 * @throws ParserConfigurationException
	 * @throws SAXException
	 * @throws IOException
	 */
	private static void extractTextualContentFromLists()
			throws ParserConfigurationException, SAXException, IOException {

		String[] listTags = { "li", "ol", "ul" };
		for (String tag : listTags) {
			extractTextualContentFromTag(tag, textualContentFromLists);
		}
	}

	/**
	 * extract textual content from tables
	 * 
	 * @throws ParserConfigurationException
	 * @throws SAXException
	 * @throws IOException
	 */
	private static void extractTextualContentFromTables()
			throws ParserConfigurationException, SAXException, IOException {

		String[] tableTags = { "table", "tr", "td", "th" };
		for (String tag : tableTags) {
			extractTextualContentFromTag(tag, textualContentFromTables);
		}

	}

	/**
	 * extract textual content from titles
	 * 
	 * @throws ParserConfigurationException
	 * @throws SAXException
	 * @throws IOException
	 */
	private static void extractTextualContentFromTitle()
			throws ParserConfigurationException, SAXException, IOException {

		extractTextualContentFromTag("title", textualContentFromTitle);

	}

	/**
	 * extract textual content from headings
	 * 
	 * @throws ParserConfigurationException
	 * @throws SAXException
	 * @throws IOException
	 */
	private static void extractTextualContentFromHeadings()
			throws ParserConfigurationException, SAXException, IOException {

		String[] headingTags = { "h1", "h2", "h3", "h4", "h5", "h6" };
		for (String tag : headingTags) {
			extractTextualContentFromTag(tag, textualContentFromHeadings);
		}

	}

	/**
	 * reads the stop words and the stop keywords, one entry per line, from
	 * {@code stopwords_en_lextex.txt} and {@code stopwords_keywords.txt} (both
	 * resolved against the working directory)
	 * 
	 * @throws IOException
	 *             if one of the files cannot be read
	 */
	private static void getStopWords() throws IOException {

		stopWords = FileUtils.readLines(new File("stopwords_en_lextex.txt"), "utf-8");
		stopKeywords = FileUtils.readLines(new File("stopwords_keywords.txt"), "utf-8");
	}

	/**
	 * lists every file whose name ends with "html" under {@link #directory},
	 * descending into all sub-directories
	 * 
	 * @return the HTML files found
	 */
	private static Collection<File> listHtmlFiles() {
		return FileUtils.listFiles(new File(directory), FileFilterUtils.suffixFileFilter("html"),
				TrueFileFilter.INSTANCE);
	}

	/**
	 * extract textual content from a tag, for every page of the directory
	 * <p>
	 * NOTE(review): only the FIRST element with the given tag name is scraped on
	 * each page ({@code item(0)}); confirm this is intended for repeated tags such
	 * as {@code <a>} or {@code <li>}.
	 * 
	 * @param tag
	 *            tag name to look up (case-sensitive, as in any XML DOM)
	 * @param output
	 *            vocabulary list the new words are appended to
	 * @throws ParserConfigurationException
	 * @throws SAXException
	 * @throws IOException
	 */
	private static void extractTextualContentFromTag(String tag, List<String> output)
			throws ParserConfigurationException, SAXException, IOException {

		for (File file : listHtmlFiles()) {

			// parse the listed file itself: rebuilding the path as directory + name
			// fails for files in sub-directories or when the directory has no
			// trailing separator
			Document d = createDocument(file);
			textScraper(d, d.getElementsByTagName(tag).item(0), output);

		}

	}

	/**
	 * scrape the textual content of node and, recursively, of its children, adding
	 * to visited every token that is not empty, not a stop word, not containing a
	 * stop keyword, not numeric and not already present
	 * 
	 * @param d
	 *            the owning document (not used by the scraper itself)
	 * @param node
	 *            the node to scrape; may be null, in which case nothing happens
	 * @param visited
	 *            the vocabulary collected so far
	 */
	private static void textScraper(Document d, Node node, List<String> visited) {

		if (node == null) {
			return;
		}

		String rawText = node.getTextContent();
		if (rawText == null || rawText.isEmpty()) {
			return;
		}

		for (String s : processWord(rawText).split(" ")) {

			if (s.length() == 0) {
				continue;
			}

			if (!stopWords.contains(s) // s is NOT a stop word
					&& !containedInAnyStopKeywords(s) // s is NOT a keyword of any kind
					&& !visited.contains(s) // s is NOT already present
					&& !isNumeric(s) // s is NOT a numeric value
			) {
				visited.add(s);
			}
		}

		// the text of a parent is the plain concatenation of its children's text,
		// so the children are visited too in order to get their own tokens
		NodeList nl = node.getChildNodes();
		for (int i = 0; i < nl.getLength(); i++) {
			textScraper(d, nl.item(i), visited);
		}

	}

	/**
	 * true if s contains any of the stop keywords
	 * 
	 * @param s
	 *            the token to check
	 * @return true if at least one non-empty stop keyword is a substring of s
	 */
	private static boolean containedInAnyStopKeywords(String s) {

		for (String k : stopKeywords) {
			// a blank line in the keyword file must not match: every string
			// contains "", which would discard every single word
			if (k == null || k.isEmpty()) {
				continue;
			}
			if (s.contains(k)) {
				return true;
			}
		}

		return false;
	}

	/**
	 * clean the text x: every non-word character becomes a blank and the result is
	 * lower-cased
	 * 
	 * @param x
	 *            the raw text
	 * @return the cleaned text, to be split on single blanks
	 */
	private static String processWord(String x) {
		x = x.replace("[", " ");
		x = x.replace("]", " ");
		x = x.replace("-", " ");
		x = x.replace("\n", " ");
		x = x.replace("\t", " ");
		x = x.replaceAll("\\s+", " ");
		x = x.replaceAll("\\W", " ");
		// Locale.ROOT keeps the result independent from the default locale
		return x.replaceAll("[(){}|*,#'$\".:;!?<>%]", " ").toLowerCase(Locale.ROOT);
	}

	/**
	 * clean the page content: brackets, dashes, whitespace and common punctuation
	 * become blanks and the result is lower-cased
	 * 
	 * @param x
	 *            the raw page text
	 * @return the cleaned page text
	 */
	private static String processPageContent(String x) {
		x = x.replace("[", " ");
		x = x.replace("]", " ");
		x = x.replace("-", " ");
		x = x.replace("\n", " ");
		x = x.replace("\t", " ");
		x = x.replaceAll("\\s+", " ");
		// Locale.ROOT keeps the result independent from the default locale
		return x.replaceAll("[(){}|,#'$\".:;!?<>%]", " ").toLowerCase(Locale.ROOT);
	}

	/**
	 * check whether s is a number, i.e. whether {@link Double#parseDouble(String)}
	 * accepts it
	 * 
	 * @param s
	 *            the string to check; may be null
	 * @return true if s can be parsed as a double, false otherwise (null included)
	 */
	public static boolean isNumeric(String s) {
		if (s == null) {
			// parseDouble(null) throws NullPointerException, not NumberFormatException
			return false;
		}
		try {
			Double.parseDouble(s);
			return true;
		} catch (NumberFormatException nfe) {
			return false;
		}
	}

	/**
	 * calculate {@code <body>} words frequencies
	 * 
	 * @return page file name -> (BODY word -> frequency)
	 * @throws ParserConfigurationException
	 * @throws SAXException
	 * @throws IOException
	 */
	private static Map<String, LinkedHashMap<String, BigDecimal>> calculateWordsBodyFrequency()
			throws ParserConfigurationException, SAXException, IOException {

		return calculateWordsFrequency(textualContentFromBody);

	}

	/**
	 * calculate title, headings, anchors and lists words frequencies
	 * 
	 * @return page file name -> (THAL word -> frequency)
	 * @throws ParserConfigurationException
	 * @throws SAXException
	 * @throws IOException
	 */
	private static Map<String, LinkedHashMap<String, BigDecimal>> calculateWordsThalFrequency()
			throws ParserConfigurationException, SAXException, IOException {

		return calculateWordsFrequency(textualContentFromThal);

	}

	/**
	 * calculate, for every page of the directory, the frequency of each word of the
	 * given vocabulary
	 * <p>
	 * Pages are keyed by their simple file name, so two pages with the same name
	 * in different sub-directories overwrite each other.
	 * 
	 * @param words
	 *            the vocabulary; its order is the order of each frequency vector
	 * @return page file name -> (word -> frequency)
	 * @throws ParserConfigurationException
	 * @throws SAXException
	 * @throws IOException
	 */
	private static Map<String, LinkedHashMap<String, BigDecimal>> calculateWordsFrequency(List<String> words)
			throws ParserConfigurationException, SAXException, IOException {

		Map<String, LinkedHashMap<String, BigDecimal>> wordsMap = new LinkedHashMap<String, LinkedHashMap<String, BigDecimal>>();

		for (File file : listHtmlFiles()) {

			// parse and clean each page once, not once per vocabulary word
			String pageText = readPageText(file);
			int totalWords = countWords(pageText);

			LinkedHashMap<String, BigDecimal> wordsFrequencyMap = new LinkedHashMap<String, BigDecimal>();
			for (String t : words) {
				wordsFrequencyMap.put(t, wordFrequency(pageText, totalWords, t));
			}

			wordsMap.put(file.getName(), wordsFrequencyMap);

		}

		return wordsMap;

	}

	/**
	 * reads the cleaned text content of the first {@code <html>} element of a page
	 * 
	 * @param file
	 *            the page to read
	 * @return the cleaned text, or an empty string if the page has no such element
	 * @throws ParserConfigurationException
	 * @throws SAXException
	 * @throws IOException
	 */
	private static String readPageText(File file) throws ParserConfigurationException, SAXException, IOException {

		Document d = createDocument(file);

		Node html = d.getElementsByTagName("html").item(0);
		String fullText = (html == null) ? null : html.getTextContent();

		return processPageContent(fullText == null ? "" : fullText);
	}

	/**
	 * counts the non-empty, blank-separated tokens of a text
	 * 
	 * @param text
	 *            the cleaned page text
	 * @return the number of words
	 */
	private static int countWords(String text) {
		int count = 0;
		for (String token : text.split(" ")) {
			// consecutive blanks produce empty tokens, which are not words
			if (!token.isEmpty()) {
				count++;
			}
		}
		return count;
	}

	/**
	 * calculate the frequency of word in a page, as a percentage of the page's
	 * word count
	 * <p>
	 * NOTE(review): occurrences are counted as substrings (so "art" also matches
	 * inside "party"); confirm whether whole-word matching is wanted.
	 * 
	 * @param pageText
	 *            the cleaned text of the page
	 * @param totalWords
	 *            the number of words of the page
	 * @param word
	 *            the word to look for
	 * @return occurrences * 100 / totalWords, or zero for a page without words
	 */
	private static BigDecimal wordFrequency(String pageText, int totalWords, String word) {

		if (totalWords == 0) {
			// avoids 0/0 = NaN, which BigDecimal rejects with NumberFormatException
			return BigDecimal.ZERO;
		}

		double wordCardinality = StringUtils.countMatches(pageText, word);

		return new BigDecimal(wordCardinality * 100 / totalWords);
	}

	/**
	 * auxiliary function: parses a page into a DOM document
	 * <p>
	 * NOTE(review): the parser uses the factory defaults, so DTDs and external
	 * entities referenced by a page are resolved. Harden the factory if the pages
	 * come from an untrusted source.
	 * 
	 * @param file
	 *            the page to parse
	 * @return the parsed document
	 * @throws ParserConfigurationException
	 * @throws SAXException
	 * @throws IOException
	 */
	private static Document createDocument(File file) throws ParserConfigurationException, SAXException, IOException {

		DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
		DocumentBuilder docBuilder = docFactory.newDocumentBuilder();

		return docBuilder.parse(file);
	}

	/**
	 * create the dataset for body frequencies: one instance per page, labelled with
	 * the page file name
	 * 
	 * @return the shared body dataset, rebuilt from the current frequency map
	 */
	public Dataset createDatasetBody() {

		return fillDataset(wordsBodyFrequenciesMap, dataBody);

	}

	/**
	 * create the dataset for thal frequencies: one instance per page, labelled with
	 * the page file name
	 * 
	 * @return the shared thal dataset, rebuilt from the current frequency map
	 */
	public Dataset createDatasetThal() {

		return fillDataset(wordsThalFrequenciesMap, dataThal);

	}

	/**
	 * rebuilds target with one dense instance per page of the frequency map
	 * 
	 * @param frequencies
	 *            page file name -> (word -> frequency)
	 * @param target
	 *            the dataset to rebuild
	 * @return target
	 */
	private static Dataset fillDataset(Map<String, LinkedHashMap<String, BigDecimal>> frequencies, Dataset target) {

		// start from scratch, otherwise a second call would duplicate every page
		target.clear();

		for (Map.Entry<String, LinkedHashMap<String, BigDecimal>> page : frequencies.entrySet()) {

			Collection<BigDecimal> v = page.getValue().values();
			double[] features = new double[v.size()];
			int count = 0;

			for (BigDecimal bd : v) {
				features[count] = bd.doubleValue();
				count++;
			}

			Instance instance = new DenseInstance(features, page.getKey());
			target.add(instance);

		}

		return target;

	}

}