import static java.lang.System.out;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeSet;

import com.aliasi.tokenizer.EnglishStopTokenizerFactory;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;

/**
 * Worked examples of basic text cleaning: normalizing case, stripping
 * punctuation and digits, tokenizing, and removing stop words (with plain
 * collections and with LingPipe).
 *
 * <p>Every example prints its input and its result to standard output.
 * None of the methods accept {@code null} text.
 */
public class SimpleStringCleaning {

	/**
	 * Default location of the stop word list (one word per line) used by the
	 * single-argument stop word examples.
	 */
	private static final String DEFAULT_STOP_WORD_PATH =
			"C://Jenn Personal//Packt Data Science//Chapter 3 Data Cleaning//stopwords.txt";

	/**
	 * Matches a run of characters that separates two words: anything that is
	 * not a word character ({@code \W}: whitespace, punctuation, ...) plus
	 * digits. Underscores are word characters and are therefore kept.
	 */
	private static final String WORD_BREAK_REGEX = "[\\W\\d]+";

	/**
	 * Runs one of the cleaning examples on a short sample text. The other
	 * examples are left commented out so they can be enabled one at a time.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {

		String dirtyText = "Call me Ishmael. Some years ago- never mind how"
				+ " long precisely - having little or no money in my purse,"
				+ " and nothing particular to interest me on shore, I thought"
				+ " I would sail about a little and see the watery part of the world.";

		//Example 1 - basic clean (regex & String class methods)
		//simpleClean(dirtyText);

		//Example 2 - clean and put in array (split)
		//simpleCleanToArray(dirtyText);

		//Example 3 - join
		//cleanAndJoin(dirtyText);

		//Example 4 - simple remove stop words
		//removeStopWords(dirtyText);

		//Example 5 - remove stop words with removeAll
		//removeStopWordsRemoveAll(dirtyText);

		//Example 6 - remove stop words with LingPipe
		removeStopWithLing(dirtyText);
	}

	/**
	 * Lower-cases the text and replaces every run of punctuation, digits and
	 * whitespace with a single space, then trims the ends.
	 *
	 * <p>One replacement handles both jobs the regex has to do: dropping the
	 * unwanted characters and collapsing the gaps they leave behind. That
	 * includes tabs and line breaks, not only literal spaces.
	 *
	 * @param text the text to clean; must not be {@code null}
	 * @return the cleaned text, words separated by single spaces
	 */
	public static String simpleClean(String text){
		out.println("Dirty text: " + text);
		String cleaned = text.toLowerCase().replaceAll(WORD_BREAK_REGEX, " ").trim();
		out.println("Cleaned text: " + cleaned);
		return cleaned;
	}

	/**
	 * Lower-cases the text and splits it into words, treating every run of
	 * punctuation, digits and whitespace as a separator.
	 *
	 * <p>Separators split words rather than being deleted, so
	 * {@code "hello,world"} yields {@code hello} and {@code world}, and
	 * {@code "I'm"} yields {@code i} and {@code m}.
	 *
	 * @param text the text to clean; must not be {@code null}
	 * @return the words in order; empty (never containing empty strings) when
	 *         the text holds no words
	 */
	public static String[] simpleCleanToArray(String text){
		out.println("Dirty text: " + text);
		String[] cleanText = tokenize(text);
		out.print("Cleaned text: ");
		for(String clean : cleanText){
			out.print(clean + " ");
		}
		out.println();
		return cleanText;
	}

	/**
	 * Lower-cases the text, splits it into words and joins them back together
	 * with single spaces.
	 *
	 * @param text the text to clean; must not be {@code null}
	 * @return the words separated by single spaces, with no leading or
	 *         trailing space; empty when the text holds no words
	 */
	public static String cleanAndJoin(String text){
		out.println("Dirty text: " + text);
		String cleanText = String.join(" ", tokenize(text));
		out.println("Cleaned text: " + cleanText);
		return cleanText;
	}

	/**
	 * Cleans the text with {@link #simpleCleanToArray(String)} and prints it
	 * without the stop words listed in the default stop word file.
	 *
	 * @param text the text to process; must not be {@code null}
	 */
	public static void removeStopWords(String text){
		removeStopWords(text, new File(DEFAULT_STOP_WORD_PATH));
	}

	/**
	 * Cleans the text with {@link #simpleCleanToArray(String)} and prints it
	 * without the stop words listed in the given file. Stop words are
	 * lower-cased before comparison because the cleaned text is lower case.
	 *
	 * <p>If the file cannot be found a message is written to standard error
	 * and nothing else is printed.
	 *
	 * @param text the text to process; must not be {@code null}
	 * @param stopWordFile file holding one stop word per line
	 */
	public static void removeStopWords(String text, File stopWordFile){
		try {
			//read the stop list first so a missing file is reported before any output
			Set<String> stopWords = loadStopWords(stopWordFile, true);
			ArrayList<String> words = new ArrayList<String>(Arrays.asList(simpleCleanToArray(text)));
			out.println("Original clean text: " + words.toString());
			words.removeAll(stopWords);
			out.println("Text without stop words: " + words.toString());
		} catch (FileNotFoundException e) {
			System.err.println("Could not read stop words from " + stopWordFile + ": " + e.getMessage());
		}
	}

	/**
	 * Splits the raw text on single spaces and prints it without the stop
	 * words listed in the default stop word file.
	 *
	 * @param text the text to process; must not be {@code null}
	 */
	public static void removeStopWordsRemoveAll(String text){
		removeStopWordsRemoveAll(text, new File(DEFAULT_STOP_WORD_PATH));
	}

	/**
	 * Splits the raw text on single spaces and prints it without the stop
	 * words listed in the given file.
	 *
	 * <p>Neither the text nor the stop words are normalized here: matching is
	 * case-sensitive and punctuation stays attached to the words, so
	 * {@code "The"} or {@code "me,"} will not match the stop words
	 * {@code the} or {@code me}.
	 *
	 * <p>If the file cannot be found a message is written to standard error
	 * after the input text has been echoed.
	 *
	 * @param text the text to process; must not be {@code null}
	 * @param stopWordFile file holding one stop word per line
	 */
	public static void removeStopWordsRemoveAll(String text, File stopWordFile){
		out.println(text);
		try {
			Set<String> stopWords = loadStopWords(stopWordFile, false);
			ArrayList<String> remaining = new ArrayList<String>(Arrays.asList(text.split(" ")));
			remaining.removeAll(stopWords);
			out.println("Clean words: ");
			for(String word : remaining){
				out.print(word + " ");
			}
			out.println();
		} catch (FileNotFoundException e) {
			System.err.println("Could not read stop words from " + stopWordFile + ": " + e.getMessage());
		}
	}

	/**
	 * Prints the text, then prints its lower-cased tokens as produced by
	 * LingPipe's {@code IndoEuropeanTokenizerFactory} wrapped in an
	 * {@code EnglishStopTokenizerFactory}, each token followed by a space.
	 *
	 * <p>The text is lower-cased first so that capitalized words are compared
	 * in the same case as the rest of the text.
	 *
	 * @param text the text to process; must not be {@code null}
	 */
	public static void removeStopWithLing(String text){
		out.println(text);
		char[] chars = text.toLowerCase().trim().toCharArray();
		TokenizerFactory stopFiltering = new EnglishStopTokenizerFactory(IndoEuropeanTokenizerFactory.INSTANCE);
		Tokenizer tokens = stopFiltering.tokenizer(chars, 0, chars.length);
		for(String token : tokens){
			out.print(token + " ");
		}
	}

	/**
	 * Lower-cases the text and splits it on {@link #WORD_BREAK_REGEX},
	 * discarding the empty token that {@code split} produces when the text
	 * starts with a separator or is empty.
	 */
	private static String[] tokenize(String text){
		String[] pieces = text.toLowerCase().split(WORD_BREAK_REGEX);
		ArrayList<String> tokens = new ArrayList<String>(pieces.length);
		for(String piece : pieces){
			if(!piece.isEmpty()){
				tokens.add(piece);
			}
		}
		return tokens.toArray(new String[0]);
	}

	/**
	 * Reads a stop word file (one word per line) into a set, optionally
	 * lower-casing each line. The file is always closed before returning.
	 */
	private static Set<String> loadStopWords(File stopWordFile, boolean lowerCase) throws FileNotFoundException {
		Set<String> stopWords = new TreeSet<String>();
		try (Scanner reader = new Scanner(stopWordFile)) {
			while(reader.hasNextLine()){
				String line = reader.nextLine();
				stopWords.add(lowerCase ? line.toLowerCase() : line);
			}
		}
		return stopWords;
	}
}