/*******************************************************************************
 * Copyright 2014 A3 lab (Dipartimento di Informatica, Università di Pisa)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package it.acubelab.tagme.preprocessing;

import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.lang.MutableString;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;

import org.apache.log4j.Logger;

/**
 * Line-oriented scanner for a Wikipedia XML dump.
 * <p>
 * The input is not processed with an XML parser: each line is trimmed and then
 * compared against the tag literals declared in this class. Consequently a tag
 * is recognised only when it opens a (trimmed) line, and a {@code <title>} or
 * {@code <id>} element is recognised only when it is opened and closed on the
 * same line. This class performs no decoding of XML entities.
 * <p>
 * For every {@code <page>} element a new {@link WikiArticle} is filled with the
 * title, the id and the text found inside it, and is handed to
 * {@link #processArticle(WikiArticle)} when the closing {@code </page>} line is
 * read. Subclasses override {@link #processArticle(WikiArticle)} to consume the
 * articles, and may override {@link #start()} and {@link #stop()} to manage
 * additional resources.
 * <p>
 * Instances keep the parsing state in fields, so a single instance must not be
 * used to parse two files at the same time.
 */
public class WikipediaArticleParser {

	static Logger log = Logger.getLogger(WikipediaArticleParser.class);
	
	/** Position of the scanner with respect to the page currently being read. */
	private enum State {
		/** Outside of any page: waiting for a line starting with {@link #PAGE_START}. */
		IDLE,
		/** Inside a page, but not inside its text element. */
		PAGE,
		/** Inside a text element that spans several lines. */
		TEXT
	}
	
	/** Tag literals matched against the beginning or the end of each trimmed line. */
	public static final String
		PAGE_START = "<page>",
		PAGE_END = "</page>",
		ID_START = "<id>",
		ID_END = "</id>",
		TITLE_START = "<title>",
		TITLE_END = "</title>",
		TEXT_START = "<text xml:space=\"preserve\">",
		TEXT_END = "</text>";
	
	
	/** File being parsed; set by {@link #parse(File)} before {@link #start()} is called. */
	File input;
	/** Reader over {@link #input}, opened by {@link #start()} and closed by {@link #stop()}. */
	FastBufferedReader reader;
	/** Article under construction; replaced each time a new page begins. */
	WikiArticle current;
	/** Current state of the scanner. */
	State state;
	
	/**
	 * Scans the given dump file and calls {@link #processArticle(WikiArticle)}
	 * once for every page that is closed by a {@code </page>} line.
	 * <p>
	 * {@link #start()} is invoked before the first line is read. {@link #stop()}
	 * is invoked when scanning ends, whether it ends normally or because an
	 * exception was thrown, so that the underlying file is always released.
	 *
	 * @param f the dump file to read; it is decoded as UTF-8
	 * @throws IOException if the file cannot be opened or read, if
	 *         {@link #processArticle(WikiArticle)} fails, or if {@link #stop()}
	 *         fails after a successful scan
	 */
	public void parse(File f) throws IOException
	{
		this.input = f;
		// Kept outside the try block: when opening fails there is nothing to
		// release, and stop() would act on a missing or stale reader.
		start();
		
		boolean completed = false;
		try {
			MutableString line = new MutableString(1024);
			
			while(reader.readLine(line) != null)
			{
				// Trimming applies to every line, body lines included.
				line.trim();
				switch(state)
				{
				case IDLE:
					if (line.startsWith(PAGE_START)){
						state = State.PAGE;
						current = new WikiArticle();
					}
					break;
				case PAGE:
					// The title/id guards make only the first <title> and the first
					// <id> of a page count; later ones (e.g. nested under other
					// elements of the page) are skipped.
					// NOTE(review): this presumes a new WikiArticle has an empty
					// title and a negative id -- confirm against WikiArticle.
					if (line.startsWith(TITLE_START) && line.endsWith(TITLE_END) && current.title().length() == 0){
						String title = line.substring(TITLE_START.length(), line.length()-TITLE_END.length()).toString();
						current.setTitle(title);
					} else if (line.startsWith(ID_START) && line.endsWith(ID_END) && current.id() < 0){
						String id = line.substring(ID_START.length(), line.length()-ID_END.length()).toString();
						current.setId(Integer.parseInt(id));
					} else if (line.startsWith(TEXT_START)) {
						if (line.endsWith(TEXT_END)){
							// Whole text on a single line: no state change needed.
							current.setBody(line.substring(TEXT_START.length(), line.length()-TEXT_END.length()));
						} else {
							// Text continues on the following lines: accumulate it.
							current.setBody(new MutableString(2048));
							current.body().append(line.substring(TEXT_START.length()));
							current.body().append('\n');
							state = State.TEXT;
						}
					} else if (line.startsWith(PAGE_END)){
						processArticle(current);
						state = State.IDLE;
					}
					break;
				case TEXT:
					if (line.endsWith(TEXT_END)){
						// Last line of the text: drop the closing tag, add no newline.
						state = State.PAGE;
						current.body().append(line.substring(0, line.length()-TEXT_END.length()));
					} else {
						current.body().append(line);
						current.body().append('\n');
					}
					break;
				}
			}
			completed = true;
		} finally {
			if (completed) {
				// Normal termination: a failure while closing is reported to the caller.
				stop();
			} else {
				// An exception is already propagating: release the resources anyway,
				// but do not let a secondary failure replace the original exception.
				try {
					stop();
				} catch (IOException e) {
					log.warn("Unable to release resources after a failed parse of " + f, e);
				}
			}
		}
		
	}
	
	/**
	 * Opens {@link #input} as a UTF-8 character stream and resets the scanner
	 * to {@link State#IDLE}. Called by {@link #parse(File)} before reading.
	 *
	 * @throws IOException if the file cannot be opened
	 */
	protected void start() throws IOException
	{
		reader = new FastBufferedReader(new InputStreamReader(new FileInputStream(input), Charset.forName("UTF-8")));
		state = State.IDLE;
	}
	/**
	 * Closes the reader opened by {@link #start()}. Called by
	 * {@link #parse(File)} when scanning ends, also when it ends with an
	 * exception; overriding implementations should therefore tolerate being
	 * invoked after a partial scan.
	 *
	 * @throws IOException if closing the reader fails
	 */
	protected void stop() throws IOException
	{
		reader.close();
	}
	/**
	 * Callback invoked by {@link #parse(File)} for each page, at the moment its
	 * closing {@code </page>} line is read. This implementation does nothing.
	 *
	 * @param a the article built from the page just read
	 * @throws IOException if the article cannot be processed; the exception
	 *         interrupts the scan and is propagated by {@link #parse(File)}
	 */
	public void processArticle(WikiArticle a) throws IOException
	{}
	

	
}