package com.cybozu.labs.nutch.plugin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;

import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;

/**
 * 
 * Language Detection Extension for Apache Nutch
 *   using Language Detection Library ( http://code.google.com/p/language-detection/ ).
 * 
 * The HTMLLanguageParser and LanguageQueryFilter extensions of Nutch's
 * standard language-identifier plugin can be used without modification,
 * so this plugin only provides an extension of LanguageIdentifier.
 * 
 * @author Nakatani Shuyo
 *
 */
public class LanguageDetectionFilter implements IndexingFilter {
	/** Default value of "langdetect.textsize" when the configuration does not set it. */
	private static final int TEXTSIZE_UPPER_LIMIT_DEFAULT = 10000;

	/**
	 * Set once the language profiles have been loaded successfully.
	 * DetectorFactory is used through static methods only, so its profiles
	 * are shared by every instance of this filter; per the language-detection
	 * library, loading the same profiles a second time is rejected as a
	 * duplicate. Guarded by the class lock (see {@link #loadProfilesOnce}).
	 */
	private static boolean profilesLoaded = false;

	/** Configuration passed to {@link #setConf}; null until then. */
	private Configuration conf = null;
	/** Profile loading failure recorded by {@link #setConf}, rethrown by {@link #filter}. */
	private LangDetectException cause = null;
	/** Value passed to {@code Detector.setMaxTextLength} for each detection. */
	private int textsize_upper_limit;

	/**
	 * Constructor with no parameters (for generation by reflection)
	 */
	public LanguageDetectionFilter() {
	}

	/**
	 * Adds a "lang" field to the document.
	 *
	 * The language recorded in the parse metadata ({@code Metadata.LANGUAGE})
	 * is used when present; otherwise it is detected from the page title and
	 * text. If no language is obtained, "unknown" is stored.
	 *
	 * @param doc     document to add the "lang" field to
	 * @param parse   parse result providing metadata, title and text
	 * @param url     not used by this filter
	 * @param datum   not used by this filter
	 * @param inlinks not used by this filter
	 * @return the same {@code doc} instance, with the "lang" field added
	 * @throws IndexingException if {@link #setConf} has not been called,
	 *         if loading the language profiles failed, or if detection failed
	 */
	public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
			CrawlDatum datum, Inlinks inlinks) throws IndexingException {
		if (conf == null) {
			throw new IndexingException("Not Yet Initialization.");
		}
		if (cause != null) {
			throw new IndexingException("Initialization Failed.", cause);
		}

		String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
		if (lang == null) {
			lang = detectLanguage(parse.getData().getTitle(), parse.getText());
		}
		if (lang == null) {
			lang = "unknown";
		}

		doc.add("lang", lang);
		return doc;
	}

	/**
	 * Runs the language detector over the title and body text.
	 *
	 * @param title page title, may be null
	 * @param body  page text, may be null
	 * @return the value returned by {@code Detector.detect()}
	 * @throws IndexingException wrapping any LangDetectException from the detector
	 */
	private String detectLanguage(String title, String body) throws IndexingException {
		StringBuilder text = new StringBuilder();
		// Skip null parts: StringBuilder.append(null) would add the literal
		// word "null" and feed it to the detector as if it were page content.
		if (title != null) {
			text.append(title);
		}
		text.append(" ");
		if (body != null) {
			text.append(body);
		}
		try {
			Detector detector = DetectorFactory.create();
			detector.setMaxTextLength(textsize_upper_limit);
			detector.append(text.toString());
			return detector.detect();
		} catch (LangDetectException e) {
			throw new IndexingException("Detection failed.", e);
		}
	}

	/**
	 * Registers the "lang" field as stored and untokenized for the Lucene backend.
	 *
	 * @param conf configuration the field options are written to
	 */
	public void addIndexBackendOptions(Configuration conf) {
		LuceneWriter.addFieldOptions("lang", LuceneWriter.STORE.YES,
				LuceneWriter.INDEX.UNTOKENIZED, conf);
	}

	/**
	 * Stores the configuration and, on the first call for this instance,
	 * reads "langdetect.textsize" and loads the language profiles from
	 * "langdetect.profile.dir".
	 *
	 * A profile loading failure is not thrown here; it is kept and reported
	 * as an IndexingException when {@link #filter} is called.
	 *
	 * @param conf configuration to read the settings from
	 */
	public void setConf(Configuration conf) {
		if (this.conf == null) {
			textsize_upper_limit = conf.getInt("langdetect.textsize", TEXTSIZE_UPPER_LIMIT_DEFAULT);
			try {
				loadProfilesOnce(conf.get("langdetect.profile.dir"));
			} catch (LangDetectException e) {
				// afterward throw when filter() is called
				cause = e;
			}
		}
		this.conf = conf;
	}

	/**
	 * Loads the language profiles into DetectorFactory unless a previous
	 * call (from any instance of this filter) already succeeded.
	 *
	 * Only the first successful load takes effect; a later instance
	 * configured with a different directory does not reload.
	 *
	 * @param profileDir directory containing the language profiles
	 * @throws LangDetectException if DetectorFactory fails to load the profiles
	 */
	private static synchronized void loadProfilesOnce(String profileDir)
			throws LangDetectException {
		if (!profilesLoaded) {
			DetectorFactory.loadProfile(profileDir);
			// Only mark as loaded after success so a failed load can be retried.
			profilesLoaded = true;
		}
	}

	/**
	 * Returns the configuration last passed to {@link #setConf}.
	 *
	 * @return the stored configuration, or null if {@link #setConf} has not been called
	 */
	public Configuration getConf() {
		return this.conf;
	}
}