package vn.vitk.tok;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.spark.Accumulator;
import org.apache.spark.AccumulatorParam;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

import scala.Tuple2;
import vn.vitk.util.SparkContextFactory;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.extractors.ArticleExtractor;

/**
 * @author Phuong LE-HONG
 * <p>
 * Mar 11, 2016, 10:20:06 AM
 * <p>
 * Fast and simple reimplementation of the <code>vnTokenizer</code> tool.
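 * <p>
 * A minimal usage sketch (file and directory names below are illustrative):
 * <pre>
 * Tokenizer tokenizer = new Tokenizer("local[*]", "lexicon.xml", "regexp.txt");
 * String result = tokenizer.tokenizeOneLine(line);       // line is a Vietnamese sentence
 * tokenizer.tokenize("input.txt", "output-directory");   // tokenize a whole file
 * </pre>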
 */
public class Tokenizer implements Serializable {
	
	private static final long serialVersionUID = 216079852616487675L;
	
	private transient JavaSparkContext jsc;
	private Lexicon lexicon = new Lexicon();
	private Map<String, Pattern> patterns = new TreeMap<String, Pattern>();
	private TextNormalizationFunction normalizationFunction = new TextNormalizationFunction();
	private PhraseGraph graph = new PhraseGraph();
	private Bigrams bigram = null;
	private WhitespaceClassifier classifier = null;
	// whitespace contexts accumulated during segmentation (whitespace classification mode only)
	private Accumulator<List<WhitespaceContext>> contexts;
	private PipelineModel model = null;
	// broadcast predictions for the accumulated whitespace contexts
	private Broadcast<Row[]> prediction = null;
	// running index into the broadcast predictions
	private int counter = 0;
	
	private boolean verbose = false;
	
	/**
	 * Creates a Vietnamese tokenizer.
	 * @param master the Spark master URL (for example, <code>local[*]</code>)
	 * @param lexiconFileName the lexicon file
	 * @param regexpFileName the file of named regular expression patterns
	 */
	public Tokenizer(String master, String lexiconFileName, String regexpFileName) {
		jsc = SparkContextFactory.create(master);
		lexicon = new Lexicon().load(lexiconFileName);
		if (verbose) 
			System.out.println("#(nodes of the lexicon) = " + lexicon.numNodes());
		
		List<String> lines = jsc.textFile(regexpFileName).collect();
		for (String line : lines) {
			line = line.trim();
			if (!line.startsWith("#")) { // ignore comment lines
				String[] s = line.split("\\s+");
				if (s.length == 2) {
					patterns.put(s[0], Pattern.compile(s[1]));
				}
			}
		}
	}
	
	/**
	 * Creates a Vietnamese tokenizer which uses a bigram language model to 
	 * select the most likely segmentation.
	 * @param master the Spark master URL
	 * @param lexiconFileName the lexicon file
	 * @param regexpFileName the file of named regular expression patterns
	 * @param bigramFileName the bigram language model file
	 */
	public Tokenizer(String master, String lexiconFileName, String regexpFileName, String bigramFileName) {
		this(master, lexiconFileName, regexpFileName);
		bigram = new Bigrams(bigramFileName);
	}

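	/**
	 * Creates a Vietnamese tokenizer which classifies whitespace positions with a 
	 * logistic regression model.
	 * @param master the Spark master URL
	 * @param lexiconFileName the lexicon file
	 * @param regexpFileName the file of named regular expression patterns
	 * @param whitespaceModelFileName the trained whitespace classification model
	 * @param lr flag for the logistic regression approach (not used by the constructor body)
	 */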
	public Tokenizer(String master, String lexiconFileName, String regexpFileName, String whitespaceModelFileName, boolean lr) {
		this(master, lexiconFileName, regexpFileName);
		classifier = new WhitespaceClassifier(lexicon, patterns);
		model = classifier.load(whitespaceModelFileName);
		contexts = jsc.accumulator(new LinkedList<WhitespaceContext>(), new WhitespaceContextAccumulatorParam());
	}
	
	/**
	 * Reads the content of a text file and normalizes its lines.
	 * @param fileName
	 * @return an RDD of normalized text lines.
	 */
	public JavaRDD<String> readTextFile(String fileName) {
		JavaRDD<String> input = jsc.textFile(fileName);
		return input.map(normalizationFunction);
	}
	
	/**
	 * Tokenizes an RDD of text lines and returns an RDD of results.
	 * @param input
	 * @return an RDD of tokenized text lines.
	 */
	public JavaRDD<String> tokenize(JavaRDD<String> input) {
		if (verbose) {
			// print some basic statistics about the input, including the 
			// max, min and average line lengths in syllables
			JavaRDD<Integer> wordCount = input.map(new Function<String, Integer>() {
				private static final long serialVersionUID = 7214093453452927565L;
				@Override
				public Integer call(String line) throws Exception {
					return line.split("\\s+").length;
				}
				
			});
			Comparator<Integer> comp = new IntegerComparator();
			System.out.println("Max line length (in syllables) = " + wordCount.max(comp));
			System.out.println("Min line length (in syllables) = " + wordCount.min(comp));
			float totalCount = wordCount.reduce(new Function2<Integer, Integer, Integer>() {
				private static final long serialVersionUID = 1L;
				@Override
				public Integer call(Integer v1, Integer v2) throws Exception {
					return v1 + v2;
				}
			});
			System.out.println("Avg line length (in syllables) = " + (totalCount) / input.count());
		}
		
		JavaRDD<String> output = null;
		if (classifier == null) {
			// use phrase graph approach (shortest paths and bigram model)
			// to segment phrases
			output = input.map(new SegmentationFunction());
		} else {
			// use logistic regression approach to segment phrases
			JavaRDD<String> s = input.map(new SegmentationFunction());
			// make sure that the preceding lazy computation has been evaluated
			// so that whitespace contexts have been properly accumulated
			System.out.println("Number of text lines = " + s.count());
			System.out.println("Number of contexts = " + contexts.value().size());
			// use the whitespace classification approach (logistic regression model)
			JavaRDD<WhitespaceContext> jrdd = jsc.parallelize(contexts.value());
			DataFrame df0 = (new SQLContext(jsc)).createDataFrame(jrdd, WhitespaceContext.class);
			DataFrame df1 = model.transform(df0);
			prediction = jsc.broadcast(df1.select("prediction").collect());
			if (df1.count() > 0) {
				output = s.map(new WhitespaceClassificationFunction());
			}
			else { 
				System.err.println("Empty data frame!");
			}
		}
		if (verbose) {
			// print the number of non-space characters of the input and output datasets
			System.out.println("#(non-space characters of input) = " + numCharacters(input));
			if (output != null) {
				System.out.println("#(non-space characters of output) = " + numCharacters(output));
			}
		}
		return output;
	}
	
	
	/**
	 * Tokenizes a single line of text.
	 * @param line a line of text
	 * @return the tokenized line.
	 */
	public String tokenizeOneLine(String line) {
		List<String> list = new ArrayList<String>();
		list.add(line);
		JavaRDD<String> input = jsc.parallelize(list);
		JavaRDD<String> output = tokenize(input);
		return output.first();
	}
	
	/**
	 * Tokenizes a text file and returns the tokenized lines.
	 * @param fileName
	 * @return a list of tokenized lines.
	 */
	public List<String> tokenize(String fileName) {
		JavaRDD<String> input = readTextFile(fileName);
		JavaRDD<String> output = tokenize(input);
		return output.collect();
	}
	
	/**
	 * Tokenizes a text file and saves the result to an output directory by using 
	 * Spark save as text file utility.
	 * @param inputFileName
	 * @param outputDirectory
	 */
	public void tokenize(String inputFileName, String outputDirectory) {
		JavaRDD<String> input = readTextFile(inputFileName);
		JavaRDD<String> output = tokenize(input);
		output.saveAsTextFile(outputDirectory);
	}
	
	
	/**
	 * Tokenizes a text file and writes the result to a writer. 
	 * @param inputFileName
	 * @param writer
	 * @return tokenization result.
	 */
	public List<String> tokenize(String inputFileName, PrintWriter writer) {
		JavaRDD<String> input = readTextFile(inputFileName);
		JavaRDD<String> output = tokenize(input);
		List<String> lines = output.collect();
		for (String line : lines) {
			writer.write(line);
			writer.write('\n');
		}
		writer.flush();
		return lines;
	}
	
	/**
	 * Tokenizes the text content of a URL and writes the result to a writer.
	 * @param url
	 * @param writer
	 * @return tokenization result, or <code>null</code> if the content cannot be processed.
	 */
	public List<String> tokenize(URL url, PrintWriter writer) {
		try {
			System.out.println("Extracting the text content of the URL...");
			String text = ArticleExtractor.INSTANCE.getText(new InputStreamReader(url.openStream(), "UTF-8"));
			if (verbose) {
				System.out.println("URL text content:");
				System.out.println(text);
			}
			System.out.println("Tokenizing the content...");
			JavaRDD<String> input = jsc.parallelize(Arrays.asList(text.split("\\n+")));
			JavaRDD<String> output = tokenize(input.map(normalizationFunction));
			List<String> lines = output.collect();
			for (String line : lines) {
				writer.write(line);
				writer.write('\n');
			}
			writer.flush();
			return lines;
		} catch (BoilerpipeProcessingException e) {
			e.printStackTrace();
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return null;
	}
	
	
	/**
	 * Counts the number of non-space characters in a dataset. This utility method 
	 * is used to check the tokenization result.
	 * @param lines
	 * @return the number of non-space characters.
	 */
	int numCharacters(JavaRDD<String> lines) {
		JavaRDD<Integer> lengths = lines.map(new Function<String, Integer>() {
			private static final long serialVersionUID = -2189399343462982586L;
			@Override
			public Integer call(String line) throws Exception {
				line = line.replaceAll("[\\s_]+", "");
				return line.length();
			}
		});
		return lengths.reduce(new Function2<Integer, Integer, Integer>() {
			private static final long serialVersionUID = -8438072946884289401L;

			@Override
			public Integer call(Integer e0, Integer e1) throws Exception {
				return e0 + e1;
			}
		});
	}
	
	/**
	 * Adds more words to the lexicon.
	 * @param words a list of words.
	 */
	public void addWords(List<String> words) {
		addWords(words.toArray(new String[words.size()]));
	}
	
	/**
	 * Adds more words to the lexicon.
	 * @param words an array of words.
	 */
	public void addWords(String[] words) {
		for (String word : words)
			lexicon.addWord(word);
	}
	
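	/**
	 * Segments a line of text. The longest match among the regular expression patterns is 
	 * repeatedly taken from the beginning of the remaining text; phrase tokens are then segmented 
	 * on the phrase graph or, when a whitespace classifier is in use, bracketed and deferred to it, 
	 * while name, unit and other tokens are post-processed and kept as single tokens.
	 */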
	class SegmentationFunction implements Function<String, String> {
		private static final long serialVersionUID = 7206190528458789338L;

		@Override
		public String call(String sentence) throws Exception {
			Iterable<Tuple2<String, String>> tokens = segment(sentence);
			StringBuilder sb = new StringBuilder(64);
			for (Tuple2<String, String> token : tokens) {
				String content = token._2();
				if (!token._1().equals("phrase")) {
					content = content.replace(' ', '_');
				}
				sb.append(content);
				sb.append(' ');
			}
			return sb.toString().trim();
		}
		
		/**
		 * Segments a sentence into an iterable of (type, token) pairs.
		 * @param sentence
		 * @return an iterable of (type, token) pairs.
		 */
		private Iterable<Tuple2<String, String>> segment(String sentence) {
			List<Tuple2<String, String>> tokens = new ArrayList<Tuple2<String, String>>();
			sentence = sentence.trim();
			if (sentence.length() == 0) {
				return tokens;
			}
			String s = sentence;
			while (true) {
				int maxLen = 0;
				String nextToken = "";
				String type = "";
				// greedily search for the longest match among all patterns 
				// at the beginning of 's'
				for (String patternName : patterns.keySet()) {
					Pattern pattern = patterns.get(patternName);
					Matcher matcher = pattern.matcher(s);
					if (matcher.lookingAt()) {
						int len = matcher.end() - matcher.start();
						if (maxLen < len) {
							maxLen = len;
							nextToken = matcher.group();
							type = patternName;
						}
					}
				}
				nextToken = nextToken.trim();
				if (nextToken.length() > 0) {
					s = s.substring(maxLen).trim();
					if (type.contains("name") && s.length() > 0) {
						Tuple2<String, String> tup = processName(nextToken, s);
						if (tup._1().length() != nextToken.length()) {
							nextToken = tup._1();
							s = tup._2();
							type = "word";
						}
						tokens.add(new Tuple2<String, String>(type, nextToken));
					} else if (type.contains("unit") && s.length() > 0) {
						Tuple2<String, String> tup = processUnit(nextToken, s);
						if (tup._1().length() > nextToken.length()) {
							nextToken = tup._1();
							s = tup._2();
							type = "unit";
						}
						tokens.add(new Tuple2<String, String>(type, nextToken));
					} else if (type.contains("phrase")) {
						if (nextToken.indexOf(' ') > 0) { // multi-syllabic phrase
							if (classifier != null) {
								contexts.add(classifier.makeContexts(nextToken));
								tokens.add(new Tuple2<String, String>("phrase", "[" + nextToken + "]"));
							} else {
								// segment the phrase using a phrase graph  
								List<String> words = tokenizePhrase(nextToken);
								if (words != null) {
									for (int i = 0; i < words.size(); i++) {
										tokens.add(new Tuple2<String, String>("word", words.get(i)));
									}
								} else {
									System.out.println("Error when tokenizing phrase: " + nextToken);
								}
							}
						} else { // mono-syllabic phrase, no need to tokenize it
							tokens.add(new Tuple2<String, String>("word", nextToken));
						}
					} else { // next token is neither a name, a unit nor a phrase
						tokens.add(new Tuple2<String, String>(type, nextToken));
					}
				} else {
					if (s.trim().length() > 0) {
						System.out.println("Unprocessed substring: " + s);
					}
					break;
				}
				if (s.length() == 0) {
					break;
				}
			}
			return tokens;
		}
		
		private Tuple2<String, String> processName(String currentToken, String s) {
			// If this is a name pattern, we process it further to capture 2 cases: 
			//
			// 1. It should be merged with the next syllable, like in "Thủ tướng" or "Bộ Giáo dục." where 
			// the name pattern captures only the first part "Thủ" or "Bộ Giáo".
			// We try to combine the last syllable of the current token with the first token of s 
			// to see whether they may form a word or not. Note that the first token of s may contain 
			// delimiters (like "dục." in the example above), so we need to remove them 
			// beforehand if necessary.
			int j = s.indexOf(' ');
			String nextSyllable = (j > 0) ? s.substring(0, j) : s;
			// the next syllable can be either "dục" or "dục."; find its last alphabetic character 
			// so as to leave the trailing non-alphabetic characters out.
			int u = nextSyllable.length();
			while (u > 0 && !Character.isAlphabetic(nextSyllable.charAt(--u)));
			nextSyllable = nextSyllable.substring(0, u+1);
			
			int k = currentToken.lastIndexOf(' ');
			String lastSyllable = (k > 0) ? currentToken.substring(k+1) : currentToken;
			String nextTokenPrefix = (k > 0) ? currentToken.substring(0, k+1) : "";  
			String w = lastSyllable.toLowerCase() + ' ' + nextSyllable;
			if (lexicon.hasWord(w)) {
				currentToken = nextTokenPrefix + lastSyllable + ' ' + nextSyllable;
				s = s.substring(nextSyllable.length()).trim();
			}
			// 2. It should be divided into two parts if the first syllable of the name 
			// is a name prefix like "Ông", "Bà", "Anh", "Em", etc.
			j = currentToken.indexOf(' ');
			if (j > 0) {
				String firstSyllable = currentToken.substring(0, j);
				Matcher matcher = patterns.get("prefix").matcher(firstSyllable);
				if (matcher.matches()) {
					StringBuilder sb = new StringBuilder(currentToken.substring(j+1));
					sb.append(' ');
					sb.append(s);
					s = sb.toString();
					currentToken = firstSyllable;
				}
			}
			return new Tuple2<String, String>(currentToken, s);
		}
		
		private Tuple2<String, String> processUnit(String currentToken, String s) {
			// "[đồng/đô] [la Mỹ...]" => [đồng/đô la] [Mỹ...] 
			// "[đồng/cổ phiếu] [...]"
			String lastSyllable = currentToken.substring(currentToken.indexOf('/') + 1);
			int j = s.indexOf(' ');
			String nextSyllable = (j > 0) ? s.substring(0, j) : s;
			// the next syllable can be either "phiếu" or "phiếu."; find its last alphabetic character 
			// so as to leave the trailing non-alphabetic characters out.
			int u = nextSyllable.length();
			while (u > 0 && !Character.isAlphabetic(nextSyllable.charAt(--u)));
			nextSyllable = nextSyllable.substring(0, u+1);
			
			if (lexicon.hasWord(lastSyllable + ' ' + nextSyllable)) {
				currentToken = currentToken + ' ' + nextSyllable;
				s = s.substring(nextSyllable.length()).trim();
			}
			return new Tuple2<String, String>(currentToken, s);
		}
		
		/**
		 * Tokenizes a phrase.
		 * @param phrase
		 * @return a list of tokens, or <code>null</code> if the phrase cannot be segmented.
		 */
		private List<String> tokenizePhrase(String phrase) {
			graph.makeGraph(phrase);
			List<LinkedList<Integer>> paths = graph.shortestPaths();
			if (paths.size() > 0) {
//				// print out overlap groups
//				List<Tuple2<Integer, Integer>> ambiguities = graph.overlaps();
//				for (Tuple2<Integer, Integer> tup : ambiguities) {
//					System.out.println(graph.words(tup));
//				}
				LinkedList<Integer> selectedPath = paths.get(paths.size()-1);
				if (bigram != null) {
					int best = graph.select(paths);
					selectedPath = paths.get(best);
				}
				return graph.words(selectedPath);
			}
			if (verbose) {
				System.out.println("Cannot tokenize the following phrase: [" + phrase +"]");
			}
			return null;
		}
		
	}
	
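	/**
	 * A graph over the syllables of a phrase: vertex <code>i</code> marks the position before 
	 * the <code>i</code>-th syllable, and an edge <code>(i, j)</code> means that syllables 
	 * <code>i..j-1</code> form a word of the lexicon. A shortest path from vertex 0 to 
	 * vertex <code>n</code> is a segmentation of the phrase into the fewest possible words.
	 */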
	class PhraseGraph implements Serializable {
		private static final long serialVersionUID = 6761055345566557524L;
		private String[] syllables;
		private int n;
		
		/**
		 * For each vertex v, we store a list of vertices u where (u, v) is an edge
		 * of the graph. This is used to recursively search for all paths on the graph.  
		 */
		private Map<Integer, LinkedList<Integer>> edges = new HashMap<Integer, LinkedList<Integer>>();

		void makeGraph(String phrase) {
			edges.clear();
			syllables = phrase.split("\\s+");
			n = syllables.length;
			if (n > 128) {
				System.out.println("WARNING: Phrase too long (>= 128 syllables), tokenization may be slow...");
				System.out.println(phrase);
			}
			for (int j = 0; j <= n; j++) {
				edges.put(j, new LinkedList<Integer>());
			}
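			// add an edge (i, j+1) whenever the syllables i..j form a word of the lexicon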
			for (int i = 0; i < n; i++) {
				String token = syllables[i];
				int j = i;
				while (j < n) {
					if (lexicon.hasWord(token)) {
						edges.get(j+1).add(i);
					}
					j++;
					if (j < n) {
						token = token + ' ' + syllables[j];
					}
				}
			}
			// make sure that the graph is connected by adding adjacent 
			// edges if necessary
			for (int i = n; i > 0; i--) {
				if (edges.get(i).size() == 0) { // i cannot be reached from any previous node
					edges.get(i).add(i-1);
				}
			}
		}

		/**
		 * Finds all shortest paths from the first node to the last node
		 * of this graph. 
		 * @return a list of paths, each path is a linked list of vertices.
		 */
		public List<LinkedList<Integer>> shortestPaths() {
			Dijkstra dijkstra = new Dijkstra(edges);
			List<LinkedList<Integer>> allPaths = dijkstra.shortestPaths();
			if (verbose) {
				if (allPaths.size() > 16) {
					StringBuilder phrase = new StringBuilder();
					for (String syllable : syllables) {
						phrase.append(syllable);
						phrase.append(' ');
					}
					System.out.printf("This phrase is too ambiguous, giving %d shortest paths!\n\t%s\n", 
							allPaths.size(), phrase.toString().trim());
				}
			}
			return allPaths;
		}
		
		/**
		 * Gets the list of words specified by a given path.
		 * @param path
		 * @return a list of words, or <code>null</code> if the path has fewer than two vertices.
		 */
		public List<String> words(LinkedList<Integer> path) {
			int m = path.size();
			if (m <= 1) 
				return null;
			Integer[] a = path.toArray(new Integer[m]);
			StringBuilder[] tok = new StringBuilder[m-1];
			int i;
			for (int j = 0; j < m-1; j++) {
				// get the token from a[j] to a[j+1] (exclusive)
				tok[j] = new StringBuilder();
				i = a[j];
				tok[j].append(syllables[i]);
				for (int k = a[j]+1; k < a[j+1]; k++) {
					tok[j].append(' ');
					tok[j].append(syllables[k]);
				}
			}
			List<String> result = new LinkedList<String>();
			for (StringBuilder sb : tok) {
				result.add(sb.toString());
			}
			return result;
		}
		
		/**
		 * Gets the sub-sequence of syllables specified by a segment, which marks 
		 * the beginning and the end positions.
		 * @param segment
		 * @return the syllables of the segment, joined by spaces.
		 */
		public String words(Tuple2<Integer, Integer> segment) {
			StringBuilder sb = new StringBuilder();
			for (int i = segment._1(); i < segment._2(); i++) {
				sb.append(syllables[i]);
				sb.append(' ');
			}
			return sb.toString().trim();
		}
		
		/**
		 * Finds all overlapping syllable groups of this graph. A syllable group is overlapping if 
		 * it defines multiple shortest paths, e.g., 4 nodes with two paths [0, 1, 3] and [0, 2, 3]. 
		 * @return a list of syllable groups, each a tuple of begin and end positions.
		 */
		public List<Tuple2<Integer, Integer>> overlaps() {
			List<Tuple2<Integer, Integer>> result = new ArrayList<Tuple2<Integer, Integer>>();
			if (n >= 4) {
				int i = n-1;
				while (i >= 3) {
					if (edges.get(i).contains(i-1) && edges.get(i).contains(i-2)
							&& edges.get(i-1).contains(i-3) && edges.get(i-2).contains(i-3)) {
						result.add(new Tuple2<Integer, Integer>(i-3, i));
						i = i - 4;
					} else {
						i = i - 1;
					}
				}
			}
			return result;
		}
		
		@Override
		public String toString() {
			return edges.toString();
		}
		
		/**
		 * Selects the most likely segmentation from a list 
		 * of different segmentations.
		 * @param paths
		 * @return the index of the most likely path in <code>paths</code>.
		 */
		public int select(List<LinkedList<Integer>> paths) {
			if (bigram != null) {
				int maxIdx = 0;
				// log probabilities are negative, so start from negative infinity
				double maxVal = Double.NEGATIVE_INFINITY;
				// find the maximum of log probabilities of segmentations
				for (int j = 0; j < paths.size(); j++) {
					LinkedList<Integer> path = paths.get(j);
					List<String> words = words(path);
					words.add(0, "<s>");
					words.add("</s>");
					double p = 0d;
					for (int w = 1; w < words.size(); w++)
						p += bigram.logConditionalProb(words.get(w-1), words.get(w));
					if (p > maxVal) {
						maxVal = p;
						maxIdx = j;
					}
				}
				return maxIdx;
			}
			return 0;
		}
	}

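	/**
	 * Normalizes a line of Vietnamese text: tone marks on the vowel pairs "oa", "oe" and "uy" 
	 * are moved onto the second vowel, and an accented 'y' following one of the consonants 
	 * h, k, l, m, s, t is converted to the corresponding accented 'i'.
	 */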
	class TextNormalizationFunction implements Function<String, String> {
		private static final long serialVersionUID = 5727433453096616457L;
		private Map<String, String> vowels = new HashMap<String, String>();
		private Pattern pattern = Pattern.compile("[hklmst][yỳýỷỹỵ]");  
		private Map<Character, Character> ymap = new HashMap<Character, Character>();

		public TextNormalizationFunction() {
			// initialize the vowel map
			vowels.put("òa", "oà");
			vowels.put("óa", "oá");
			vowels.put("ỏa", "oả");
			vowels.put("õa", "oã");
			vowels.put("ọa", "oạ");
			vowels.put("òe", "oè");
			vowels.put("óe", "oé");
			vowels.put("ỏe", "oẻ");
			vowels.put("õe", "oẽ");
			vowels.put("ọe", "oẹ");
			vowels.put("ùy", "uỳ");
			vowels.put("úy", "uý");
			vowels.put("ủy", "uỷ");
			vowels.put("ũy", "uỹ");
			vowels.put("ụy", "uỵ");
			// initialize the y map
			ymap.put('y', 'y');
			ymap.put('ỳ', 'ì');
			ymap.put('ý', 'í');
			ymap.put('ỷ', 'ỉ');
			ymap.put('ỹ', 'ĩ');
			ymap.put('ỵ', 'ị');
		}
		
		@Override
		public String call(String phrase) throws Exception {
			// normalize all the vowels of the phrase
			for (String u : vowels.keySet()) {
				String v = vowels.get(u);
				phrase = phrase.replace(u, v);
			}
			// normalize i/y: an accented 'y' following one of the consonants 
			// "h, k, l, m, s, t" is converted to the corresponding accented 'i'.
			StringBuilder sb = new StringBuilder(phrase);
			Matcher matcher = pattern.matcher(phrase);
			while (matcher.find()) {
				int idx = matcher.start() + 1;
				sb = sb.replace(idx, matcher.end(), String.valueOf(ymap.get(sb.charAt(idx))));
			}
			return sb.toString();
		}
	}
	
	
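	/**
	 * Accumulator parameter that merges lists of whitespace contexts collected 
	 * across partitions during segmentation.
	 */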
	class WhitespaceContextAccumulatorParam implements AccumulatorParam<List<WhitespaceContext>> {

		private static final long serialVersionUID = -8140972589714419311L;
		
		@Override
		public List<WhitespaceContext> addInPlace(List<WhitespaceContext> l1,
				List<WhitespaceContext> l2) {
			l1.addAll(l2);
			return l1;
		}

		@Override
		public List<WhitespaceContext> zero(List<WhitespaceContext> list) {
			return new LinkedList<WhitespaceContext>();
		}

		@Override
		public List<WhitespaceContext> addAccumulator(
				List<WhitespaceContext> l1, List<WhitespaceContext> l2) {
			l1.addAll(l2);
			return l1;
		}

	}
	
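	/**
	 * Rewrites each bracketed phrase <code>[syllable1 syllable2 ...]</code> produced by the 
	 * segmentation step: for every whitespace position inside the brackets, the corresponding 
	 * broadcast prediction decides whether to keep a space (prediction 0) or to join the two 
	 * syllables with an underscore.
	 */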
	class WhitespaceClassificationFunction implements Function<String, String> {
		private static final long serialVersionUID = -8841083728315408933L;
		private Pattern phrase = Pattern.compile("\\[[\\p{Ll}\\s]+\\]");
		
		@Override
		public String call(String sentence) throws Exception {
			StringBuilder r = new StringBuilder(sentence);
			Matcher matcher = phrase.matcher(sentence);
			while (matcher.find()) {
				int u = matcher.start();
				int v = matcher.end();
				String s = sentence.substring(u+1, v-1);
				int i = 0;
				for (int j = 0; j < s.length(); j++)
					if (s.charAt(j) == ' ') i++;
				String[] syllables = s.split("\\s+");
				StringBuilder sb = new StringBuilder(s.length());
				for (int j = 0; j < i; j++) {
					sb.append(syllables[j]);
					if (prediction.value()[counter + j].getDouble(0) == 0) {
						sb.append(' ');
					} else {
						sb.append('_');
					}
				}
				sb.append(syllables[syllables.length-1]);
				counter += i;
				r.replace(u+1, v-1, sb.toString());
			}
			return r.toString();
		}
		
	}
	
	class IntegerComparator implements Comparator<Integer>, Serializable {
		
		private static final long serialVersionUID = 3285846060042662009L;

		@Override
		public int compare(Integer i, Integer j) {
			return Integer.compare(i, j);
		}
		
	}
	
	public void setVerbose(boolean verbose) {
		this.verbose = verbose;
	}
	
	/**
	 * For internal test only.
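	 * <p>
	 * The recognized options are defined below; a possible set of arguments (the launch 
	 * command itself depends on how the application is submitted to Spark) is:
	 * <pre>
	 * -m local[*] -i input.txt -o output-directory -v
	 * </pre>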
	 * @param args
	 * @throws MalformedURLException 
	 */
	public static void main(String[] args) throws MalformedURLException {
		String master = "local[*]";
		String inputFileName = "";
		String outputFileName = "";
		String url = "";
		
		Options options = new Options();
		options.addOption("m", true, "master");
		options.addOption("i", true, "input file name)");
		options.addOption("o", true, "output file name");
		options.addOption("u", true, "input URL");
		options.addOption("v", false, "verbose");
		options.addOption("s", false, "whitespace classification");
		CommandLineParser parser = new PosixParser();
		CommandLine cm;
		try {
			cm = parser.parse(options, args);
			if (cm.hasOption("m")) {
				master = cm.getOptionValue("m");
			}
			String dataFolder = "/export/dat/tok";
			Tokenizer tokenizer = null;
			if (cm.hasOption("s")) {
				// use whitespace classification
				tokenizer = new Tokenizer(master, dataFolder + "/lexicon.xml", 
						dataFolder + "/regexp.txt", dataFolder + "/whitespace.model", true);
			} else {
				// use a bigram model
				tokenizer = new Tokenizer(master, dataFolder + "/lexicon.xml", 
						dataFolder + "/regexp.txt", dataFolder + "/syllables2M.arpa");
			}
			if (cm.hasOption("v")) {
				tokenizer.setVerbose(true);
			}
			if (cm.hasOption("i")) {
				inputFileName = cm.getOptionValue("i");
			}
			if (cm.hasOption("u")) {
				url = cm.getOptionValue("u");
			}
			if (inputFileName.length() == 0 && url.length() == 0) {
				System.err.println("Either an input file or an URL must be provided!");
				System.exit(1);
			} else if (inputFileName.length() > 0) {
				if (cm.hasOption("o")) {
					outputFileName = cm.getOptionValue("o");
					tokenizer.tokenize(inputFileName, outputFileName);
				} else {
					try {
						PrintWriter writer = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"));
						tokenizer.tokenize(inputFileName, writer);
					} catch (UnsupportedEncodingException e) {
						e.printStackTrace();
					}
				}
			} else {
				try {
					PrintWriter writer = null;
					if (cm.hasOption("o")) {
						outputFileName = cm.getOptionValue("o");
						writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outputFileName), "UTF-8"));
						tokenizer.tokenize(new URL(url), writer);
						writer.close();
					} else {
						writer = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"));
						tokenizer.tokenize(new URL(url), writer);
					}
				} catch (UnsupportedEncodingException | FileNotFoundException e) {
					e.printStackTrace();
				}
			}
		} catch (ParseException e) {
			e.printStackTrace();
		}
		System.out.println("Done.");
	}
}