/*******************************************************************************
 * Copyright 2014 A3 lab (Dipartimento di Informatica, Università di Pisa)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package it.acubelab.tagme.preprocessing;

import it.acubelab.Chars;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.lang.MutableString;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.codec.DecoderException;
import org.apache.commons.codec.net.URLCodec;


public class WikiTextExtractor {

	/**
	 * Size thresholds, in characters, for abstracts. MIN_ABSTRACT_CHARS is the amount of
	 * accumulated text after which {@link #removeStructure(MutableString, boolean)} stops
	 * reading further lines when only the abstract is requested. MAX_ABSTRACT_CHARS is not
	 * referenced inside this class.
	 */
	public static final int
		MIN_ABSTRACT_CHARS = 100,
		MAX_ABSTRACT_CHARS = 300;
	
	/** Tags that are deleted together with everything between the opening and the closing tag. */
	static final String[] GARBAGE_TAGS =
		{"ref", "gallery", "timeline", "noinclude", "pre", "table", "source",
		"tr", "td", "ul", "li", "ol", "dl", "dt", "dd", "menu", "dir"};
	
	/** Tags whose opening and closing markers are deleted while the wrapped content is kept. */
	static final String[] WRAPPER_TAGS =
		{"nowiki", "cite", "hiero", "div", "font", "span", "strong",
        "strike", "blockquote", "tt", "var", "sup", "sub", "big", "small",
        "center", "h1", "h2", "h3", "em", "b", "i", "u", "a", "s", "p"};
	
	/** Tags deleted when they appear alone, either self-closed or as a stray/unbalanced tag. */
	static final String[] SINGLE_TAGS =
		{"references", "ref", "img", "br", "hr", "li", "dt", "dd"};
	
	/** Tags deleted with their content; compiled with the same pattern shape as GARBAGE_TAGS. */
	static final String[] PLACEHOLDER_TAGS = {"code", "math"};
	
	public static final String[][] HTML_ENTITIES = {
	{" "," "}, {"¡","\u00A1"}, {"¢","\u00A2"},
    {"£","\u00A3"}, {"¤","\u00A4"}, {"¥","\u00A5"},
    {"¦","\u00A6"}, {"§","\u00A7"}, {"¨","\u00A8"},
    {"©","\u00A9"}, {"ª","\u00AA"}, {"«","\u00AB"},
    {"¬","\u00AC"}, {"­","\u00AD"}, {"®","\u00AE"},
    {"¯","\u00AF"}, {"°","\u00B0"}, {"±","\u00B1"},
    {"²","\u00B2"}, {"³","\u00B3"}, {"´","\u00B4"},
    {"µ","\u00B5"}, {"¶","\u00B6"}, {"·","\u00B7"},
    {"¸","\u00B8"}, {"¹","\u00B9"}, {"º","\u00BA"},
    {"»","\u00BB"}, {"¼","\u00BC"}, {"½","\u00BD"},
    {"¾","\u00BE"}, {"¿","\u00BF"}, {"À","\u00C0"},
    {"Á","\u00C1"}, {"Â","\u00C2"}, {"Ã","\u00C3"},
    {"Ä","\u00C4"}, {"Å","\u00C5"}, {"Æ","\u00C6"},
    {"Ç","\u00C7"}, {"È","\u00C8"}, {"É","\u00C9"},
    {"Ê","\u00CA"}, {"Ë","\u00CB"}, {"Ì","\u00CC"},
    {"Í","\u00CD"}, {"Î","\u00CE"}, {"Ï","\u00CF"},
    {"Ð","\u00D0"}, {"Ñ","\u00D1"}, {"Ò","\u00D2"},
    {"Ó","\u00D3"}, {"Ô","\u00D4"}, {"Õ","\u00D5"},
    {"Ö","\u00D6"}, {"×","\u00D7"}, {"Ø","\u00D8"},
    {"Ù","\u00D9"}, {"Ú","\u00DA"}, {"Û","\u00DB"},
    {"Ü","\u00DC"}, {"Ý","\u00DD"}, {"Þ","\u00DE"},
    {"ß","\u00DF"}, {"à","\u00E0"}, {"á","\u00E1"},
    {"â","\u00E2"}, {"ã","\u00E3"}, {"ä","\u00E4"},
    {"å","\u00E5"}, {"æ","\u00E6"}, {"ç","\u00E7"},
    {"è","\u00E8"}, {"é","\u00E9"}, {"ê","\u00EA"},
    {"ë","\u00EB"}, {"ì","\u00EC"}, {"í","\u00ED"},
    {"î","\u00EE"}, {"ï","\u00EF"}, {"ð","\u00F0"},
    {"ñ","\u00F1"}, {"ò","\u00F2"}, {"ó","\u00F3"},
    {"ô","\u00F4"}, {"õ","\u00F5"}, {"ö","\u00F6"},
    {"÷","\u00F7"}, {"ø","\u00F8"}, {"ù","\u00F9"},
    {"ú","\u00FA"}, {"û","\u00FB"}, {"ü","\u00FC"},
    {"ý","\u00FD"}, {"þ","\u00FE"}, {"ÿ","\u00FF"},
    {"ƒ","\u0192"}, {"Α","\u0391"}, {"Β","\u0392"},
    {"Γ","\u0393"}, {"Δ","\u0394"}, {"Ε","\u0395"},
    {"Ζ","\u0396"}, {"Η","\u0397"}, {"Θ","\u0398"},
    {"Ι","\u0399"}, {"Κ","\u039A"}, {"Λ","\u039B"},
    {"Μ","\u039C"}, {"Ν","\u039D"}, {"Ξ","\u039E"},
    {"Ο","\u039F"}, {"Π","\u03A0"}, {"Ρ","\u03A1"},
    {"Σ","\u03A3"}, {"Τ","\u03A4"}, {"Υ","\u03A5"},
    {"Φ","\u03A6"}, {"Χ","\u03A7"}, {"Ψ","\u03A8"},
    {"Ω","\u03A9"}, {"α","\u03B1"}, {"β","\u03B2"},
    {"γ","\u03B3"}, {"δ","\u03B4"}, {"ε","\u03B5"},
    {"ζ","\u03B6"}, {"η","\u03B7"}, {"θ","\u03B8"},
    {"ι","\u03B9"}, {"κ","\u03BA"}, {"λ","\u03BB"},
    {"μ","\u03BC"}, {"ν","\u03BD"}, {"ξ","\u03BE"},
    {"ο","\u03BF"}, {"π","\u03C0"}, {"ρ","\u03C1"},
    {"ς","\u03C2"}, {"σ","\u03C3"}, {"τ","\u03C4"},
    {"υ","\u03C5"}, {"φ","\u03C6"}, {"χ","\u03C7"},
    {"ψ","\u03C8"}, {"ω","\u03C9"}, {"ϑ","\u03D1"},
    {"ϒ","\u03D2"}, {"ϖ","\u03D6"}, {"•","\u2022"},
    {"…","\u2026"}, {"′","\u2032"}, {"″","\u2033"},
    {"‾","\u203E"}, {"⁄","\u2044"}, {"℘","\u2118"},
    {"ℑ","\u2111"}, {"ℜ","\u211C"}, {"™","\u2122"},
    {"ℵ","\u2135"}, {"←","\u2190"}, {"↑","\u2191"},
    {"→","\u2192"}, {"↓","\u2193"}, {"↔","\u2194"},
    {"↵","\u21B5"}, {"⇐","\u21D0"}, {"⇑","\u21D1"},
    {"⇒","\u21D2"}, {"⇓","\u21D3"}, {"⇔","\u21D4"},
    {"∀","\u2200"}, {"∂","\u2202"}, {"∃","\u2203"},
    {"∅","\u2205"}, {"∇","\u2207"}, {"∈","\u2208"},
    {"∉","\u2209"}, {"∋","\u220B"}, {"∏","\u220F"},
    {"∑","\u2211"}, {"−","\u2212"}, {"∗","\u2217"},
    {"√","\u221A"}, {"∝","\u221D"}, {"∞","\u221E"},
    {"∠","\u2220"}, {"∧","\u2227"}, {"∨","\u2228"},
    {"∩","\u2229"}, {"∪","\u222A"}, {"∫","\u222B"},
    {"∴","\u2234"}, {"∼","\u223C"}, {"≅","\u2245"},
    {"≈","\u2248"}, {"≠","\u2260"}, {"≡","\u2261"},
    {"≤","\u2264"}, {"≥","\u2265"}, {"⊂","\u2282"},
    {"⊃","\u2283"}, {"⊄","\u2284"}, {"⊆","\u2286"},
    {"⊇","\u2287"}, {"⊕","\u2295"}, {"⊗","\u2297"},
    {"⊥","\u22A5"}, {"⋅","\u22C5"}, {"⌈","\u2308"},
    {"⌉","\u2309"}, {"⌊","\u230A"}, {"⌋","\u230B"},
    {"⟨","\u2329"}, {"⟩","\u232A"}, {"◊","\u25CA"},
    {"♠","\u2660"}, {"♣","\u2663"}, {"♥","\u2665"},
    {"♦","\u2666"}, {""","\""}, {"<","\u003C"},
    {">","\u003E"}, {"Œ","\u0152"}, {"œ","\u0153"},
    {"Š","\u0160"}, {"š","\u0161"}, {"Ÿ","\u0178"},
    {"ˆ","\u02C6"}, {"˜","\u02DC"}, {" ","\u2002"},
    {" ","\u2003"}, {" ","\u2009"}, {"‌","\u200C"},
    {"‍","\u200D"}, {"‎","\u200E"}, {"‏","\u200F"},
    {"–","\u2013"}, {"—","\u2014"}, {"‘","\u2018"},
    {"’","\u2019"}, {"‚","\u201A"}, {"“","\u201C"},
    {"”","\u201D"}, {"„","\u201E"}, {"†","\u2020"},
    {"‡","\u2021"}, {"‰","\u2030"}, {"‹","\u2039"},
    {"›","\u203A"}, {"€","\u20AC"}};
	
	// All patterns below are compiled once in the constructor.

	// HTML comments: "<!--" up to the first "-->"
	Pattern patHtmlComments;
	// one pattern per GARBAGE_TAGS entry: opening tag, content and closing tag
	Vector<Pattern> patsGarbageTags = new Vector<Pattern>();
	// two patterns per WRAPPER_TAGS entry: the opening tag, then the closing tag
	Vector<Pattern> patsWrapperTags = new Vector<Pattern>();
	// two patterns per SINGLE_TAGS entry: the self-closed form, then the stray/malformed form
	Vector<Pattern> patsSingleTags = new Vector<Pattern>();
	// one pattern per PLACEHOLDER_TAGS entry: opening tag, content and closing tag
	Vector<Pattern> patsPlaceHolderTags = new Vector<Pattern>();
	// an innermost "{...}" group, i.e. one containing no further "{"
	Pattern patTable;
	// well-formed wiki link "[[...]]" with no "[" inside
	Pattern patGoodWikiLink;
	// wiki link missing one opening bracket: "[...]]"
	Pattern patBadLeftWikiLink;
	// wiki link missing one closing bracket: "[[...]"
	Pattern patBadRightWikiLink;
	// bracketed external link: "[http...]"
	Pattern patHttpLink;
	// bare http/https/ftp URL written without brackets
	Pattern patBadHttpLink;
	// bold span delimited by three apostrophes; group 2 is the span including its delimiters
	Pattern patBold;
	// italic span delimited by two apostrophes; group 2 is the span including its delimiters
	Pattern patItalic;
	// bold/italic span preceded by a word character and an apostrophe;
	// both are compiled in the constructor but not used elsewhere in this class
	Pattern patAposBold;
	Pattern patAposItalic;
	// decimal numeric character references such as "&#233;"
	Pattern patNumericHtmlEntities;
	// runs of two or more spaces
	Pattern patMultiSpace;
	// runs of four or more dots
	Pattern patMultiDot;
	
	
	public WikiTextExtractor()
	{
		patHtmlComments = Pattern.compile("<!--.*?-->", Pattern.DOTALL);
		
		for(String tag : GARBAGE_TAGS)
			patsGarbageTags.add(Pattern.compile("<\\s*"+tag+"(\\s*| [^/]+?)>.*?<\\s*/\\s*"+tag+"\\s*>", Pattern.DOTALL|Pattern.CASE_INSENSITIVE));
		
		for(String tag : WRAPPER_TAGS){
			//left
			patsWrapperTags.add(Pattern.compile("<\\s*"+tag+"(\\s*| [^/]+?)>", Pattern.DOTALL|Pattern.CASE_INSENSITIVE));
			//right
			patsWrapperTags.add(Pattern.compile("<\\s*/\\s*"+tag+"\\s*>", Pattern.DOTALL|Pattern.CASE_INSENSITIVE));
		}
		
		for(String tag : SINGLE_TAGS)
		{
			//good
			patsSingleTags.add(Pattern.compile("<\\s*"+tag+"(\\s*| .+?)/\\s*>", Pattern.DOTALL|Pattern.CASE_INSENSITIVE));
			//bad
			patsSingleTags.add(Pattern.compile("<\\s*(/|\\\\)?\\s*"+tag+"(\\s*| [^/]+?)\\\\?\\s*>", Pattern.DOTALL|Pattern.CASE_INSENSITIVE));
		}
			
		for(String tag : PLACEHOLDER_TAGS)
			patsPlaceHolderTags.add(Pattern.compile("<\\s*"+tag+"(\\s*| [^/]+?)>.*?<\\s*/\\s*"+tag+"\\s*>", Pattern.DOTALL|Pattern.CASE_INSENSITIVE));
		
		patTable = Pattern.compile("\\{[^\\{]*?\\}", Pattern.DOTALL);
		
		patGoodWikiLink = Pattern.compile("\\[\\[[^\\[]*?\\]\\]", Pattern.DOTALL);
		patBadLeftWikiLink = Pattern.compile("\\[[^\\[]*?\\]\\]", Pattern.DOTALL);
		patBadRightWikiLink = Pattern.compile("\\[\\[[^\\[]*?\\]", Pattern.DOTALL);
		
		patHttpLink = Pattern.compile("\\[http.*?\\]", Pattern.DOTALL);
		//some idiot contributors put http links without brakets!!!
		patBadHttpLink = Pattern.compile("(http|https|ftp)\\://([a-zA-Z0-9\\-\\.]+)(:[0-9]*)?/?([a-zA-Z0-9\\-\\._\\?\\,\\'/\\\\+&amp;%\\$#\\=~])*[^\\.\\,\\)\\(\\s]");
		
		patBold = Pattern.compile("(\\w')??('''.+?''')", Pattern.DOTALL);
		patItalic = Pattern.compile("(\\w')??(''.+?'')", Pattern.DOTALL);
		patAposBold = Pattern.compile("\\w'('''.+?''')", Pattern.DOTALL);
		patAposItalic = Pattern.compile("\\w'(''.+?'')", Pattern.DOTALL);
		
		patNumericHtmlEntities = Pattern.compile("&#\\d+?;");
		
		patMultiSpace = Pattern.compile(" {2,}");
		patMultiDot = Pattern.compile("\\.{4,}");
	}
	
	/**
	 * Callback that decides what happens to each wiki link met while cleaning a text.
	 */
	public interface LinkHandler
	{
		/**
		 * Processes one link occurrence.
		 * @param input the text being cleaned; implementations may edit it in place
		 * @param start offset of the first character of the matched link markup
		 * @param end offset just past the matched link markup
		 * @param link the parsed link, or {@code WikiLink.EMPTY}
		 * @return the offset in {@code input} from which the search for the next link resumes
		 */
		int handleLink(MutableString input, int start, int end, WikiLink link);

		/**
		 * @return true if leftover double square brackets must be stripped once all links
		 * have been handled
		 */
		boolean removeTrailinBrackets();
	}
	/**
	 * Handler that replaces the markup of every link with its anchor text and removes
	 * empty links altogether; leftover brackets are stripped afterwards.
	 */
	public static final LinkHandler ANCHOR_REPLACER = new LinkHandler() {
		@Override
		public boolean removeTrailinBrackets() {
			return true;
		}
		@Override
		public int handleLink(MutableString input, int start, int end, WikiLink link) {
			if (link != WikiLink.EMPTY) {
				// keep only the visible text and resume right after it
				input.replace(start, end, link.anchor);
				return start + link.anchor.length();
			}
			input.delete(start, end);
			return start;
		}
	};
	/**
	 * Handler that leaves every non-empty link untouched in the text and only removes
	 * empty links; leftover brackets are not stripped.
	 */
	public static final LinkHandler NO_REPLACER = new LinkHandler() {
		@Override
		public boolean removeTrailinBrackets() {
			return false;
		}
		@Override
		public int handleLink(MutableString input, int start, int end, WikiLink link) {
			if (link != WikiLink.EMPTY) {
				// nothing to change: skip past the link markup
				return end;
			}
			input.delete(start, end);
			return start;
		}
	};
	/**
	 * Handler that records every non-empty link it is given and otherwise behaves exactly
	 * like {@link #ANCHOR_REPLACER}.
	 */
	private static class LinkCatcher implements LinkHandler{
		/** Links collected so far, in the order they were handled. */
		List<WikiLink> links = new ArrayList<WikiLink>();
		@Override
		public int handleLink(MutableString input, int start, int end, WikiLink link) {
			boolean realLink = link != WikiLink.EMPTY;
			if (realLink) {
				links.add(link);
			}
			return ANCHOR_REPLACER.handleLink(input, start, end, link);
		}
		@Override
		public boolean removeTrailinBrackets() {
			return ANCHOR_REPLACER.removeTrailinBrackets();
		}
	}

	
	
	/**
	 * String convenience overload of {@link #clean(MutableString, LinkHandler)}.
	 * @param input the raw wiki text
	 * @param handler callback applied to every wiki link
	 * @return the cleaned text
	 */
	public String clean(String input, LinkHandler handler)
	{
		MutableString raw = new MutableString(input);
		MutableString cleaned = clean(raw, handler);
		return cleaned.toString();
	}
	
	/**
	 * Removes most of the MediaWiki and HTML markup from a text, NOT ALL of it: line-oriented
	 * structure (headings, list bullets, ...) is left in place. Use also
	 * {@link WikiTextExtractor#removeStructure(MutableString, boolean)} to get a clean text.
	 * <p>
	 * The input is left untouched; all the work is done on a copy. In order, the method
	 * un-escapes angle brackets, drops HTML comments and the configured tags, drops
	 * templates/tables (up to three nesting levels), drops external links, unwraps bold and
	 * italic markup, resolves HTML entities, tidies spacing and punctuation, and finally hands
	 * every wiki link to the handler.
	 * <p>
	 * Numeric character references that do not fit a single Java {@code char}, including
	 * those with too many digits to be parsed as an {@code int}, are deleted.
	 *
	 * @param input the raw wiki text
	 * @param handler callback deciding what to do with each wiki link
	 * @return a new string holding the cleaned text
	 */
	public MutableString clean (MutableString input, LinkHandler handler)
	{
		MutableString output = new MutableString(input.length());
		output.append(input);
		output.loose();
		//make tags readable
		output.replace("&lt;", "<").replace("&gt;",">");
		//doubled angle brackets become guillemets (U+00AB, U+00BB) so they are not taken for tags
		output.replace("<<", "\u00AB").replace(">>","\u00BB");
		
		//delete html comments
		output = Chars.delete(patHtmlComments, output);
		
		//delete html garbage tags
		for(Pattern p : patsGarbageTags)
			output = Chars.delete(p,output);
		
		//delete html wrapper tags
		for(Pattern p : patsWrapperTags)
			output = Chars.delete(p,output);
		
		//delete single tags
		for(Pattern p : patsSingleTags)
			output = Chars.delete(p,output);
		
		//delete placeholder tags
		for(Pattern p : patsPlaceHolderTags)
			output = Chars.delete(p,output);
		
		//delete templates and tables: every delimiter is first reduced to a single brace,
		//then innermost groups are removed. max 3 nested table/templates
		output.replace("{{start box}}", "{");
		output.replace("{{end box}}", "}");
		output.replace("{{", "{").replace("}}", "}");
		output.replace("{|", "{").replace("|}", "}");
		for(int depth=0; depth<3; depth++)
			output = Chars.delete(patTable, output);
		
		//delete http link
		output = Chars.delete(patHttpLink, output);
		output = Chars.delete(patBadHttpLink, output);
		output.replace("[]", "");
		
		Matcher m;
		int start;

		//handle bold and italic: keep the text, drop the apostrophe delimiters
		m = patBold.matcher(output);
		start = 0;
		while(m.find(start)){
			String bolded = m.group(2);
			bolded = bolded.substring(3, bolded.length()-3);
			output.replace(m.start(2), m.end(2), bolded);
			start = m.start(2)+bolded.length();
		}
		m = patItalic.matcher(output);
		start = 0;
		while(m.find(start)){
			String ital = m.group(2);
			ital = ital.substring(2, ital.length()-2);
			output.replace(m.start(2), m.end(2), ital);
			start = m.start(2)+ital.length();
		}
		//unbalanced leftovers
		output.replace("''''", "").replace("'''","").replace("''", "");

		//handle special chars
		output.replace("&amp;", "&").replace("&quot;&quot;","&quot;");
		for(String[] ent : HTML_ENTITIES)
			output.replace(ent[0], ent[1]);
		//handle special numeric chars
		m = patNumericHtmlEntities.matcher(output);
		start = 0;
		while(m.find(start))
		{
			String strcode = m.group();
			int code;
			try {
				code = Integer.parseInt(strcode.substring(2, strcode.length()-1));
			} catch (NumberFormatException tooManyDigits) {
				//more digits than an int can hold: certainly not representable, drop it below
				code = Integer.MAX_VALUE;
			}
			if (code >= 0x10000){
				output.delete(m.start(), m.end());
				start = m.start();
			} else {
				output.replace(m.start(), m.end(), ""+(char)code);
				start = m.start()+1;
			}
		}
		
		//handle some errors
		output.replace('\t',' ');
		output = Chars.replace(patMultiSpace, output, " ");
		output = Chars.replace(patMultiDot, output, "...");
		output.replace(" ,",",").replace(" .",".");
		output.replace(" ;",";").replace(" :",":");
		output.replace(",,",",").replace(",.",".");
		output.replace("( ","(").replace(" )",")");
		output.replace("[ ","[").replace(" ]","]");
		output.replace("\u00AB ","\u00AB").replace(" \u00BB","\u00BB");
		
		//handle good wikilink
		applyLinkHandler(output, patGoodWikiLink, 2, 2, handler);
		//often there are nested links (it is not allowed by Wiki syntax but we found them)
		applyLinkHandler(output, patGoodWikiLink, 2, 2, handler);
		//handle bad left wikilink
		applyLinkHandler(output, patBadLeftWikiLink, 1, 2, handler);
		//handle bad right wikilink
		applyLinkHandler(output, patBadRightWikiLink, 2, 1, handler);
		
		//delete trailing brackets
		if (handler.removeTrailinBrackets())
			output.replace("[[", "").replace("]]","");

		return output;
	}

	/**
	 * Hands every match of a link pattern to the handler, resuming each search from the
	 * offset the handler returns (the handler may have edited the text).
	 * @param text the text being cleaned
	 * @param linkPattern pattern matching one link, brackets included
	 * @param openLen number of opening bracket characters in a match
	 * @param closeLen number of closing bracket characters in a match
	 * @param handler callback applied to each parsed link
	 */
	private static void applyLinkHandler(MutableString text, Pattern linkPattern, int openLen, int closeLen, LinkHandler handler)
	{
		Matcher m = linkPattern.matcher(text);
		int start = 0;
		while(m.find(start)){
			MutableString link = text.substring(m.start()+openLen, m.end()-closeLen);
			WikiLink l = WikiLink.parse(link);
			start = handler.handleLink(text, m.start(), m.end(), l);
		}
	}
	
	/**
	 * Flattens the line structure of an already cleaned wiki text into one run of sentences.
	 * Each non-blank line is trimmed and handled according to its first character:
	 * headings lose their surrounding '=' and spaces, list and indentation markers are
	 * stripped, lines starting with '{' or '|' are skipped, lines starting with '.' or '-'
	 * lose all their dots and hyphens, and any other line is copied unless it ends with '}'.
	 * A full stop is added after a line that does not already end with punctuation.
	 *
	 * @param input the cleaned text, read line by line
	 * @param onlyAbstract if true, stop as soon as more than MIN_ABSTRACT_CHARS characters
	 *        have been collected
	 * @return the flattened text, without the trailing separator
	 */
	public MutableString removeStructure(MutableString input, boolean onlyAbstract)
	{
		final String closingPunctuation = ":.;,-";
		MutableString result = new MutableString(2048);
		MutableString lineBuffer = new MutableString(1024);
		FastBufferedReader reader = new FastBufferedReader(input);

		try {
			while (reader.readLine(lineBuffer) != null)
			{
				// enough text for an abstract: drop the last character and stop
				if (onlyAbstract && result.length() > MIN_ABSTRACT_CHARS) {
					result.deleteCharAt(result.length()-1);
					return result;
				}

				MutableString current = lineBuffer.trim();
				int len = current.length();
				if (len == 0) {
					continue;
				}
				char[] raw = current.array();

				switch (current.charAt(0))
				{
				case '=': {
					// heading: peel '=' and spaces from both ends
					int from = 0;
					while (from < len && (raw[from] == ' ' || raw[from] == '=')) {
						from++;
					}
					int to = len-1;
					while (to >= 0 && (raw[to] == ' ' || raw[to] == '=')) {
						to--;
					}
					if (from < to) {
						result.append(current.subSequence(from, to+1));
						result.append(". ");
					}
					break;
				}
				case '*':
				case '#':
				case ':':
				case ';': {
					// list item or indented line: skip the leading markers
					final String markers = "*#:; ";
					int from = 0;
					while (from < len && markers.indexOf(raw[from]) >= 0) {
						from++;
					}
					if (from < len-1) {
						result.append(current.subSequence(from, len));
						if (closingPunctuation.indexOf(result.lastChar()) < 0) {
							result.append('.');
						}
						result.append(' ');
					}
					break;
				}
				case '{':
				case '|':
					// table or template residue
					break;
				case '.':
				case '-': {
					current.delete(new char[]{'.','-'});
					if (current.length() > 0) {
						result.append(current);
						if (closingPunctuation.indexOf(result.lastChar()) < 0) {
							result.append('.');
						}
						result.append(' ');
					}
					break;
				}
				default: {
					if (current.lastChar() != '}') {
						result.append(current);
						if (closingPunctuation.indexOf(result.lastChar()) < 0) {
							result.append('.');
						}
						result.append(' ');
					}
				}
				}
			}
		} catch (IOException e) {}

		if (result.length() > 0) {
			result.deleteCharAt(result.length()-1);
		}
		return result;
	}
	
	/**
	 * Collects, for every bulleted line (a line starting with '*'), the first wiki link the
	 * line contains. Lines that are blank, not bulleted, or made of bullets only are skipped.
	 * <p>
	 * NOTE(review): links can only be found if the text still contains its link markup,
	 * i.e. it was presumably cleaned with a handler that leaves links in place.
	 *
	 * @param cleanText the text to scan, read line by line
	 * @return the first link of each bulleted line, in document order; never null
	 */
	public List<WikiLink> extractDisambiguationLinks(MutableString cleanText)
	{
		FastBufferedReader tokenizer = new FastBufferedReader(cleanText);
		MutableString buffer = new MutableString(1024);
		List<WikiLink> links = new ArrayList<WikiLink>();
		
		try {
			while(tokenizer.readLine(buffer) != null)
			{
				buffer.trim();
				if (buffer.length() == 0 || buffer.charAt(0) != '*') continue;
				
				//count the leading bullets...
				int start = 1;
				while(start<buffer.length() && buffer.charAt(start)=='*') start++;
				//...and strip THEM, keeping the item text (deleting the tail instead
				//would leave only the bullets and no link could ever be found)
				buffer.delete(0, start).trim();
				
				if (buffer.length() == 0) continue;
				
				List<WikiLink> lineLinks = extractLinkFromCleanedLine(buffer);
				if (!lineLinks.isEmpty()) links.add(lineLinks.get(0));
			}
		} catch (IOException ignored) {
			//best effort: the reader wraps an in-memory string, so a failure is not
			//expected; whatever was collected so far is returned
		}
		
		return links;
		
	}
	
	

	
	
	/**
	 * Extracts the abstract of a page: links are replaced by their anchors, markup is
	 * removed, and the flattened text is cut once it is long enough for an abstract.
	 * @param input the raw wiki text
	 * @return the plain-text abstract
	 */
	public MutableString extractAbstract(MutableString input)
	{
		return removeStructure(clean(input, ANCHOR_REPLACER), true);
	}
	/**
	 * Extracts the plain text of a whole page: links are replaced by their anchors, markup
	 * is removed and the line structure is flattened.
	 * @param input the raw wiki text
	 * @return the plain text of the page
	 */
	public MutableString extractPage(MutableString input)
	{
		return removeStructure(clean(input, ANCHOR_REPLACER), false);
	}
	/**
	 * Extracts the plain text of a whole page and, at the same time, gathers the wiki
	 * links met while cleaning it.
	 * @param input the raw wiki text
	 * @param links output list to which every non-empty link found is appended
	 * @return the plain text of the page
	 */
	public MutableString extractPageAndLink(MutableString input, List<WikiLink> links)
	{
		LinkCatcher collector = new LinkCatcher();
		MutableString withoutMarkup = clean(input, collector);
		links.addAll(collector.links);
		MutableString plain = removeStructure(withoutMarkup, false);
		return plain;
	}
	/**
	 * String convenience overload of {@link #extractLinkFromPage(MutableString)}.
	 * @param input the raw wiki text
	 * @return the non-empty links found in the page
	 */
	public List<WikiLink> extractLinkFromPage(String input)
	{
		MutableString page = new MutableString(input);
		return extractLinkFromPage(page);
	}
	/**
	 * Runs the full cleaning pass over a raw page only to gather its wiki links; the
	 * cleaned text itself is discarded.
	 * @param input the raw wiki text
	 * @return the non-empty links handled during cleaning, in the order they were handled
	 */
	public List<WikiLink> extractLinkFromPage(MutableString input)
	{
		LinkCatcher collector = new LinkCatcher();
		clean(input, collector);
		List<WikiLink> found = collector.links;
		return found;
	}
	
	/**
	 * String convenience overload of {@link #extractLinkFromCleanedLine(MutableString)}.
	 * @param input a line of text that still contains wiki link markup
	 * @return the non-empty links found in the line
	 */
	public List<WikiLink> extractLinkFromCleanedLine(String input)
	{
		MutableString line = new MutableString(input);
		return extractLinkFromCleanedLine(line);
	}
	/**
	 * Extracts the wiki links contained in a piece of text. The text is scanned on a private
	 * copy, so the argument is not modified. Well-formed links are searched twice (the second
	 * pass catches links that were nested inside another one), then links missing an opening
	 * bracket, then links missing a closing bracket; each link found is replaced by its
	 * anchor before the next search.
	 *
	 * @param input text that still contains wiki link markup
	 * @return the non-empty links, in the order they were found; never null
	 */
	public List<WikiLink> extractLinkFromCleanedLine(MutableString input)
	{
		MutableString output = new MutableString(input);
		output.loose();
		ArrayList<WikiLink> ll = new ArrayList<WikiLink>();
		
		//good links, twice because of nested links
		collectLinks(output, patGoodWikiLink, 2, 2, ll);
		collectLinks(output, patGoodWikiLink, 2, 2, ll);
		//handle bad left wikilink
		collectLinks(output, patBadLeftWikiLink, 1, 2, ll);
		//handle bad right wikilink
		collectLinks(output, patBadRightWikiLink, 2, 1, ll);
		return ll;
	}

	/**
	 * Adds to {@code found} every non-empty link matched by a pattern, replacing each match
	 * with its anchor. Every search restarts from the offset returned by the handler: the
	 * text changes under the matcher, so continuing a previous scan would use stale offsets.
	 * @param text the working copy being scanned and edited
	 * @param linkPattern pattern matching one link, brackets included
	 * @param openLen number of opening bracket characters in a match
	 * @param closeLen number of closing bracket characters in a match
	 * @param found list receiving the links
	 */
	private static void collectLinks(MutableString text, Pattern linkPattern, int openLen, int closeLen, List<WikiLink> found)
	{
		LinkHandler handler = ANCHOR_REPLACER;
		Matcher m = linkPattern.matcher(text);
		int start = 0;
		while(m.find(start))
		{
			WikiLink l = WikiLink.parse(text.substring(m.start()+openLen, m.end()-closeLen));
			if (l != WikiLink.EMPTY) found.add(l);
			start = handler.handleLink(text, m.start(), m.end(), l);
		}
	}
	
	/** Decoder for percent-encoded page names. */
	static URLCodec cc = new URLCodec();
	/** Detects a percent escape: '%' followed by two hexadecimal digits. */
	static Pattern isEncoded = Pattern.compile("%[0-9a-fA-F]{2}");
	/** Runs of two or more whitespace characters. */
	static Pattern multiSpace = Pattern.compile("\\s{2,}");

	/**
	 * Normalizes a page name: underscores become spaces, runs of whitespace collapse to one
	 * space, the first character is upper-cased and percent escapes are decoded.
	 * <p>
	 * The argument is normalized in place; when escapes are decoded the result is a new
	 * string, normalized again because decoding can reveal underscores, repeated spaces or
	 * a lower-case first letter. A literal '+' is preserved. If the name cannot be decoded
	 * it is returned normalized but still encoded.
	 *
	 * @param page the page name, possibly empty
	 * @return the normalized page name
	 */
	public static MutableString normalizePageName(MutableString page)
	{
		page = normalizeSpacingAndCase(page);
		if (page.length() == 0 || !isEncoded.matcher(page).find()) return page;
		
		try {
			//the codec reads '+' as an encoded space: escape it first so it survives
			String decoded = cc.decode(page.toString().replace("+", "%2B"));
			return normalizeSpacingAndCase(new MutableString(decoded));
		} catch (DecoderException ignored) {
			//not a valid encoding after all: keep the name as it is
			return page;
		}
	}

	/** Applies the underscore, whitespace and first-letter rules to a page name. */
	private static MutableString normalizeSpacingAndCase(MutableString page)
	{
		if (page.length() == 0) return page;
		page = page.replace('_', ' ');
		page = Chars.replace(multiSpace, page, " ");
		if (page.length() == 0) return page;
		page.setCharAt(0, Character.toUpperCase(page.firstChar()));
		return page;
	}
}