package it.cnr.isti.hpc.dexter.lucene;

/**
 *  Copyright 2012 Salvatore Trani
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

import it.cnr.isti.hpc.dexter.entity.EntityMatch;
import it.cnr.isti.hpc.dexter.entity.EntityMatchList;
import it.cnr.isti.hpc.dexter.spot.SpotMatch;
import it.cnr.isti.hpc.dexter.spot.clean.SpotManager;
import it.cnr.isti.hpc.dexter.spot.cleanpipe.cleaner.QuotesCleaner;
import it.cnr.isti.hpc.dexter.spot.cleanpipe.cleaner.UnderscoreCleaner;
import it.cnr.isti.hpc.dexter.spot.cleanpipe.cleaner.UnicodeCleaner;
import it.cnr.isti.hpc.dexter.util.DexterParams;
import it.cnr.isti.hpc.log.ProgressLogger;
import it.cnr.isti.hpc.text.Text;
import it.cnr.isti.hpc.wikipedia.article.Article;
import it.cnr.isti.hpc.wikipedia.article.ArticleSummarizer;
import it.cnr.isti.hpc.wikipedia.article.Link;
import it.cnr.isti.hpc.wikipedia.article.Template;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * LuceneHelper provides utilities for indexing, retrieving, and ranking
 * Wikipedia articles.
 * 
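 * A minimal usage sketch (the query text is a placeholder, and the returned
 * wiki-ids depend on the indexed collection):
 * 
 * <pre>{@code
 * LuceneHelper helper = LuceneHelper.getDexterLuceneHelper();
 * // wiki-ids of the top 10 articles whose summary contains the phrase
 * List<Integer> ids = helper.query("solar system", "summary", 10);
 * for (int wikiId : ids) {
 * 	Article a = helper.getArticleSummary(wikiId);
 * }
 * }</pre>
 * 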
 * @author Diego Ceccarelli <[email protected]>
 * 
 *         Created on Aug 27, 2013
 */
public class LuceneHelper {

	protected static final String LUCENE_ARTICLE_DEFAULT_FIELD = "content";

	protected static final String LUCENE_ARTICLE_ID = "wiki-id";
	protected static final String LUCENE_ARTICLE_WIKI_TITLE = "wiki-title";
	protected static final String LUCENE_ARTICLE_TITLE = "title";
	protected static final String LUCENE_ARTICLE_TYPE = "type";
	protected static final String LUCENE_ARTICLE_LIST = "list";
	protected static final String LUCENE_ARTICLE_INFOBOX = "infobox";
	protected static final String LUCENE_ARTICLE_EMPH = "emph";
	protected static final String LUCENE_ARTICLE_SECTIONS = "sections";
	protected static final String LUCENE_ARTICLE_DESCRIPTIONS = "desc";
	protected static final String LUCENE_ARTICLE_LINKS = "link";
	protected static final String LUCENE_ARTICLE_CONTENT = "content";
	protected static final String LUCENE_ARTICLE_SUMMARY = "summary";

	/**
	 * Logger for this class
	 */
	private static final Logger logger = LoggerFactory
			.getLogger(LuceneHelper.class);

	/**
	 * The Lucene analyzer
	 */
	protected final StandardAnalyzer ANALYZER = new StandardAnalyzer(
			Version.LUCENE_41, CharArraySet.EMPTY_SET);

	/**
	 * Singleton
	 */
	protected static LuceneHelper dexterHelper;

	protected Directory index;
	protected IndexWriter writer;
	protected IndexSearcher searcher;
	protected final IndexWriterConfig config;
	protected final ArticleSummarizer summarizer;

	private static DexterParams params = DexterParams.getInstance();

	/**
	 * number of documents indexed
	 */
	protected final int collectionSize;

	protected static final FieldType STORE_TERM_VECTORS = new FieldType();
	protected static final FieldType STORE_TERM_VECTORS_NOT_STORED = new FieldType();

	static {
		STORE_TERM_VECTORS.setIndexed(true);
		STORE_TERM_VECTORS.setTokenized(true);
		STORE_TERM_VECTORS.setStored(true);
		STORE_TERM_VECTORS.setStoreTermVectors(true);
		STORE_TERM_VECTORS.freeze();

		STORE_TERM_VECTORS_NOT_STORED.setIndexed(true);
		STORE_TERM_VECTORS_NOT_STORED.setTokenized(true);
		STORE_TERM_VECTORS_NOT_STORED.setStored(false);
		STORE_TERM_VECTORS_NOT_STORED.setStoreTermVectors(true);
		STORE_TERM_VECTORS_NOT_STORED.freeze();
	}

	private static SpotManager cleaner = new SpotManager();

	protected final File wikiIdtToLuceneIdSerialization;
	protected static Map<Integer, Integer> wikiIdToLuceneId;

	static {
		cleaner.add(new UnicodeCleaner());
		cleaner.add(new UnderscoreCleaner());
		cleaner.add(new QuotesCleaner());
	}

	/**
	 * Opens or creates a Lucene index in the given directory
	 * 
	 * @param wikiIdtToLuceneIdSerialization
	 *            - the file containing the serialized mapping between wiki-ids
	 *            and Lucene document ids
	 * 
	 * @param indexPath
	 *            - the directory containing the Lucene index
	 */
	protected LuceneHelper(File wikiIdtToLuceneIdSerialization, File indexPath) {
		logger.info("opening lucene index in folder {}", indexPath);
		config = new IndexWriterConfig(Version.LUCENE_41, ANALYZER);
		this.wikiIdtToLuceneIdSerialization = wikiIdtToLuceneIdSerialization;

		BooleanQuery.setMaxClauseCount(1000);

		try {
			index = FSDirectory.open(indexPath);
			// writer.commit();
		} catch (Exception e) {
			logger.error("opening the index: {}", e.toString());
			System.exit(1);
		}

		summarizer = new ArticleSummarizer();
		writer = getWriter();
		collectionSize = writer.numDocs();
		wikiIdToLuceneId = Collections.emptyMap();
	}

	/**
	 * @return an index reader
	 */
	protected IndexReader getReader() {
		IndexReader reader = null;
		try {
			reader = DirectoryReader.open(index);
		} catch (Exception e) {
			logger.error("reading the index: {} ", e.toString());
			System.exit(1);
		}
		return reader;
	}

	protected IndexSearcher getSearcher() {
		if (searcher != null)
			return searcher;
		IndexReader reader = getReader();
		searcher = new IndexSearcher(reader);
		return searcher;
	}

	/**
	 * @return true if the dexter lucene index exists, false otherwise
	 */
	public static boolean hasDexterLuceneIndex() {
		File luceneFolder = params.getIndexDir();
		return luceneFolder.exists();
	}

	/**
	 * Returns the singleton instance of Dexter's Lucene helper.
	 * 
	 * @return the singleton instance of Dexter's Lucene helper
	 */
	public static LuceneHelper getDexterLuceneHelper() {
		if (dexterHelper == null) {
			File luceneFolder = params.getIndexDir();
			File serializedWikiFile = params.getWikiToIdFile();
			dexterHelper = new LuceneHelper(serializedWikiFile, luceneFolder);
		}
		return dexterHelper;
	}

	/**
	 * Builds the map containing the conversion from the Wikipedia ids to the
	 * Lucene ids by scanning the index.
	 */
	protected void parseWikiIdToLuceneId() {
		logger.warn("no index wikiID -> lucene found - I'll generate");
		IndexReader reader = getReader();
		wikiIdToLuceneId = new HashMap<Integer, Integer>(reader.numDocs());
		ProgressLogger pl = new ProgressLogger(
				"creating wiki2lucene, read {} docs", 100000);
		int numDocs = reader.numDocs();
		for (int i = 0; i < numDocs; i++) {
			pl.up();
			try {
				Document doc = reader.document(i);
				IndexableField f = doc.getField(LUCENE_ARTICLE_ID);
				Integer wikiId = Integer.valueOf(f.stringValue());
				wikiIdToLuceneId.put(wikiId, i);
			} catch (CorruptIndexException e) {
				logger.error("reading the document in position {}: {}", i,
						e.toString());
			} catch (IOException e) {
				logger.error("reading the document in position {}: {}", i,
						e.toString());
			}

		}

	}

	/**
	 * Dumps the map containing the conversion from the Wikipedia ids to the
	 * Lucene Ids.
	 */
	protected void dumpWikiIdToLuceneId() {

		try {
			// Serializes to a file
			ObjectOutput out = new ObjectOutputStream(new FileOutputStream(
					wikiIdtToLuceneIdSerialization));
			out.writeObject(wikiIdToLuceneId);
			out.close();
		} catch (IOException e) {
			logger.info("dumping incoming links in a file ({})", e.toString());
			System.exit(-1);
		}
	}

	/**
	 * Loads the map containing the conversion from the Wikipedia ids to the
	 * Lucene Ids.
	 */
	@SuppressWarnings("unchecked")
	public void loadWikiIdToLuceneId() {

		if (!wikiIdtToLuceneIdSerialization.exists()) {
			logger.info("{} not exists, generating",
					wikiIdtToLuceneIdSerialization);
			parseWikiIdToLuceneId();
			logger.info("storing");
			dumpWikiIdToLuceneId();
			return;
		}

		logger.info("loading wiki id to lucene id ");
		try {

			InputStream is = new BufferedInputStream(new FileInputStream(
					wikiIdtToLuceneIdSerialization));
			@SuppressWarnings("resource")
			ObjectInput oi = new ObjectInputStream(is);
			wikiIdToLuceneId = (Map<Integer, Integer>) oi.readObject();

		} catch (Exception e) {
			logger.info("reading serialized object ({})", e.toString());
			System.exit(-1);
		}
		logger.info("done ");
	}

	/**
	 * @return the Lucene id of an article, given its wikiId
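	 *         (returns -1 when the wiki-id is not present in the mapping)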
	 */
	protected int getLuceneId(int wikiId) {
		if (wikiIdToLuceneId.isEmpty()) {
			loadWikiIdToLuceneId();
		}

		if (!wikiIdToLuceneId.containsKey(wikiId))
			return -1;
		return wikiIdToLuceneId.get(wikiId);
	}

	/**
	 * Returns the TFIDF-similarity between a given string and an article
	 * 
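	 * A minimal usage sketch (the context text and wiki-id are placeholders;
	 * helper is the instance returned by getDexterLuceneHelper(), and the
	 * query is built the same way as in rankBySimilarity):
	 * 
	 * <pre>{@code
	 * // QueryParser.parse(...) declares ParseException
	 * Query q = new QueryParser(Version.LUCENE_41, "content",
	 * 		new StandardAnalyzer(Version.LUCENE_41)).parse(QueryParser
	 * 		.escape("some context text about the article"));
	 * float score = helper.getSimilarity(q, 42); // 42 is a placeholder wiki-id
	 * }</pre>
	 * 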
	 * @param query
	 *            - the query to compare with the article
	 * @param wikiId
	 *            - the id of the article to compare with the query
	 * @return the TFIDF-similarity between the query and wikiId
	 */
	public float getSimilarity(Query query, int wikiId) {
		searcher = getSearcher();
		int docId = getLuceneId(wikiId);
		Explanation e = null;
		try {
			e = searcher.explain(query, docId);
		} catch (IOException e1) {
			logger.error("getting similarity between text and doc {} ", wikiId);
			return 0;
		}
		return e.getValue();
	}

	/**
	 * Returns the cosine similarity between two documents
	 * 
	 * @param x
	 *            - the WikiId of the first document
	 * @param y
	 *            - the WikiId of the second document
	 * 
	 * @return a double between 0 (not similar) and 1 (same content),
	 *         representing the similarity between the 2 documents
	 */
	public double getCosineSimilarity(int x, int y) {
		return getCosineSimilarity(x, y, LUCENE_ARTICLE_DEFAULT_FIELD);
	}

	/**
	 * Returns the cosine similarity between two documents
	 * 
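	 * A minimal usage sketch (the wiki-ids are placeholders; the field must
	 * have been indexed with term vectors, as "content" is):
	 * 
	 * <pre>{@code
	 * LuceneHelper helper = LuceneHelper.getDexterLuceneHelper();
	 * double sim = helper.getCosineSimilarity(1111, 2222, "content");
	 * }</pre>
	 * 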
	 * @param x
	 *            - the WikiId of the first document
	 * @param y
	 *            - the WikiId of the second document
	 * @param field
	 *            - the field on which to compute the similarity
	 * 
	 * @return a double between 0 (not similar) and 1 (same content),
	 *         representing the similarity between the 2 documents
	 */
	public double getCosineSimilarity(int x, int y, String field) {

		IndexReader reader = getReader();
		Terms tfvX = null;
		Terms tfvY = null;
		try {
			tfvX = reader.getTermVector(getLuceneId(x), field);
			tfvY = reader.getTermVector(getLuceneId(y), field);

			// try {
			// tfvX = reader.document(idX).getBinaryValue("asd")
			// getTermFreqVectors(idX);
			// tfvY = reader.getTermFreqVectors(idY);
		} catch (IOException e) {
			logger.error("computing cosine similarity ({}) ", e.toString());
			System.exit(-1);
		}

		Map<String, Integer> xfrequencies = new HashMap<String, Integer>();
		Map<String, Integer> yfrequencies = new HashMap<String, Integer>();
		TermsEnum xtermsEnum = null;
		try {
			xtermsEnum = tfvX.iterator(null);

			BytesRef text;

			while ((text = xtermsEnum.next()) != null) {
				String term = text.utf8ToString();
				int freq = (int) xtermsEnum.totalTermFreq();
				xfrequencies.put(term, freq);
			}

			TermsEnum ytermsEnum = tfvY.iterator(null);
			while ((text = ytermsEnum.next()) != null) {
				String term = text.utf8ToString();
				int freq = (int) ytermsEnum.totalTermFreq();
				yfrequencies.put(term, freq);
			}

		} catch (IOException e) {
			logger.error("computing cosine similarity ({}) ", e.toString());
			System.exit(-1);
		}
		Map<String, Double> xTfidf = new HashMap<String, Double>();
		Map<String, Double> yTfidf = new HashMap<String, Double>();
		double xnorm = tfidfVector(xTfidf, xfrequencies, field);
		double ynorm = tfidfVector(yTfidf, yfrequencies, field);

		double dotproduct = 0;

		for (Map.Entry<String, Double> k : xTfidf.entrySet()) {
			if (yTfidf.containsKey(k.getKey())) {
				logger.info("key {} ", k.getKey());
				logger.info("key x {} y {} ", k.getValue(),
						yTfidf.get(k.getKey()));
				dotproduct += k.getValue() * yTfidf.get(k.getKey());
				logger.info("dotproduct {} ", dotproduct);
			}

		}
		return dotproduct / (xnorm * ynorm);

	}

	/**
	 * Builds the TFIDF vector and returns its L2 norm.
	 * 
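	 * For a term with frequency tf and document frequency df in the given
	 * field, the score is tf * (log2(N / df + 1) + 1), where N is the number
	 * of indexed documents; the returned value is the Euclidean norm of these
	 * scores.
	 * 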
	 * @param tfidf
	 *            - the map that this method populates with the TFIDF score of
	 *            each term
	 * @param freq
	 *            - the vector containing for each term its frequency
	 * @param field
	 *            - the field on which to compute the inverse document frequency
	 * 
	 * @return the norm of the TFIDF vector
	 * 
	 */
	private double tfidfVector(Map<String, Double> tfidf,
			Map<String, Integer> freq, String field) {
		IndexReader reader = getReader();

		double norm = 0;
		for (Map.Entry<String, Integer> entry : freq.entrySet()) {
			Term t = new Term(field, entry.getKey());
			int df = 0;
			try {
				df = reader.docFreq(t);
			} catch (IOException e) {
				logger.error("computing tfidfVector ({}) ", e.toString());
				System.exit(-1);
			}
			double idf = Math.log(collectionSize / (double) df + 1)
					/ Math.log(2) + 1;
			double tfidfValue = entry.getValue() * idf;
			norm += tfidfValue * tfidfValue;
			tfidf.put(entry.getKey(), tfidfValue);
		}
		return Math.sqrt(norm);

	}

	/**
	 * Converts an article to a Lucene Document
	 * 
	 * @param a
	 *            - a Wikipedia Article to index
	 * @return the Lucene Document representing the Wikipedia Article
	 */
	private Document toLuceneDocument(Article a) {
		Document d = new Document();
		d.add(new TextField(LUCENE_ARTICLE_TITLE, a.getTitle(), Field.Store.YES));
		d.add(new IntField(LUCENE_ARTICLE_ID, a.getWid(), Field.Store.YES));
		d.add(new StringField(LUCENE_ARTICLE_WIKI_TITLE, a.getWikiTitle(),
				Field.Store.YES));
		d.add(new StringField(LUCENE_ARTICLE_TYPE, String.valueOf(a.getType()),
				Field.Store.YES));
		for (List<String> l : a.getLists()) {
			for (String e : l)
				d.add(new TextField(LUCENE_ARTICLE_LIST, e, Field.Store.NO));
		}
		Template t = a.getInfobox();
		d.add(new TextField(LUCENE_ARTICLE_INFOBOX, t.getName(),
				Field.Store.YES));
		for (String e : t.getDescription()) {
			d.add(new TextField(LUCENE_ARTICLE_INFOBOX, e, Field.Store.YES));
		}
		for (String e : a.getHighlights()) {
			d.add(new Field(LUCENE_ARTICLE_EMPH, e, STORE_TERM_VECTORS));
		}
		for (String e : a.getSections()) {
			d.add(new TextField(LUCENE_ARTICLE_SECTIONS, e, Field.Store.NO));
		}

		for (Link e : a.getLinks()) {
			d.add(new Field(LUCENE_ARTICLE_DESCRIPTIONS, cleaner.clean(e
					.getDescription()), STORE_TERM_VECTORS));
			d.add(new Field(LUCENE_ARTICLE_LINKS, cleaner.clean(e.getCleanId()
					.replace('_', ' ')), STORE_TERM_VECTORS));

		}

		d.add(new Field(LUCENE_ARTICLE_CONTENT, cleaner.clean(a.getText()),
				STORE_TERM_VECTORS));
		d.add(new Field(LUCENE_ARTICLE_SUMMARY, summarizer.getSummary(a),
				STORE_TERM_VECTORS));
		return d;
	}

	/**
	 * Indexes a Wikipedia Article
	 * 
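	 * A minimal usage sketch (it mirrors the testing helper
	 * addDocument(int, String) below; the id and text are placeholders, and
	 * helper is the instance returned by getDexterLuceneHelper()):
	 * 
	 * <pre>{@code
	 * Article a = new Article();
	 * a.setWid(42); // placeholder wiki-id
	 * a.setParagraphs(Arrays.asList("some placeholder text"));
	 * helper.addDocument(a);
	 * helper.commit(); // make the document searchable
	 * }</pre>
	 * 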
	 * @param a
	 *            the article to index
	 */
	public void addDocument(Article a) {
		writer = getWriter();
		logger.debug("add doc {} ", a.getTitle());
		try {
			writer.addDocument(toLuceneDocument(a));
			// writer.addDocument(doc);
		} catch (Exception e) {
			logger.error("exception indexing a document: {} ({})",
					a.getTitle(), e.toString());
			e.printStackTrace();
			System.exit(1);
		}
		logger.debug("added doc {}", a.getWid());
	}

	/**
	 * Indexes a minimal Wikipedia Article (used only for testing)
	 * 
	 * @param id
	 *            - the id of the Wikipedia Article
	 * @param content
	 *            - the text of the Wikipedia Article
	 */
	protected void addDocument(int id, String content) {
		Article a = new Article();
		a.setWid(id);
		a.setParagraphs(Arrays.asList(content));
		addDocument(a);
	}

	/**
	 * Clears the index
	 */
	public void clearIndex() {
		logger.info("delete all the documents indexed");
		try {
			writer.deleteAll();
			writer.commit();
		} catch (IOException e) {
			logger.error("deleting the index: {}", e.toString());
			System.exit(1);
		}
	}

	public void commit() {
		try {
			writer.commit();
			// logger.info("commited, index contains {} documents", writer
			// .getReader().numDocs());
		} catch (Exception e) {
			logger.error("committing: {}", e.toString());
			System.exit(1);
		}

	}

	private Document getDoc(int wikiId) {
		IndexReader reader = getReader();

		// System.out.println("get docId "+pos);
		if (wikiId <= 0)
			return null;
		int docId = getLuceneId(wikiId);
		if (docId < 0) {
			logger.warn("no id for wikiId {}", wikiId);

			return null;
		}
		logger.debug("get wikiId {}  ->  docId {}", wikiId, docId);
		Document doc = null;
		try {
			doc = reader.document(docId);
		} catch (Exception e) {
			logger.error("retrieving doc in position {} {}", docId,
					e.toString());
			System.exit(-1);
		}

		return doc;
	}

	/**
	 * @param query
	 *            - a query
	 * @param field
	 *            - the field in which to search for the query
	 * @return the number of documents containing the query text in the given
	 *         field
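	 * 
	 *         A minimal usage sketch (the phrase and field are placeholders;
	 *         helper is the instance returned by getDexterLuceneHelper()):
	 * 
	 *         <pre>{@code
	 * int hits = helper.getFreq("divina commedia", "content");
	 * }</pre>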
	 */
	public int getFreq(String query, String field) {
		Query q = null;
		searcher = getSearcher();
		TopScoreDocCollector collector = TopScoreDocCollector.create(1, true);

		// try {

		Text t = new Text(query).disableStopwords();
		PhraseQuery pq = new PhraseQuery();
		int i = 0;
		for (String term : t.getTerms()) {
			pq.add(new Term(field, term), i++);
		}
		q = pq;
		logger.debug(q.toString());
		// } catch (ParseException e) {
		// logger.error("querying the index: {} ", e.toString());
		// return -1;
		// }
		try {
			searcher.search(q, collector);
		} catch (IOException e) {
			logger.error("querying the index: {} ", e.toString());
			return -1;
		}
		return collector.getTotalHits();
	}

	/**
	 * @param query
	 *            - a query
	 * @return the number of documents containing the query text in the default
	 *         field (content)
	 */
	public int getFreq(String query) {
		return getFreq(query, LUCENE_ARTICLE_DEFAULT_FIELD);
	}

	public int getFreqFromSummary(String query) {
		return getFreq(query, LUCENE_ARTICLE_SUMMARY);
	}

	private IndexWriter getWriter() {
		if (writer == null)
			try {
				writer = new IndexWriter(index, config);
			} catch (CorruptIndexException e1) {
				logger.error("creating the index: {}", e1.toString());
				System.exit(-1);
			} catch (LockObtainFailedException e1) {
				logger.error("creating the index: {}", e1.toString());
				System.exit(-1);
			} catch (IOException e1) {
				logger.error("creating the index: {}", e1.toString());
				System.exit(-1);
			}
		return writer;
	}

	/**
	 * @return the number of documents indexed
	 */
	public int numDocs() {
		IndexReader reader = getReader();

		return reader.numDocs();

	}

	public void closeWriter() {
		try {
			writer.close();
		} catch (IOException e) {
			logger.error("closing the writer: {}", e.toString());
			System.exit(-1);
		}
	}

	/**
	 * @param query
	 *            the query text; only articles containing it as a phrase
	 *            (consecutive terms) are returned.
	 * @param field
	 *            the field on which the query must be performed (summary,
	 *            content, title ..).
	 * @param n
	 *            the max number of results to produce.
	 * @return the wiki-ids of the top articles matching the query
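	 * 
	 *         A minimal usage sketch (the phrase is a placeholder; helper is
	 *         the instance returned by getDexterLuceneHelper()):
	 * 
	 *         <pre>{@code
	 * // wiki-ids of the top 10 articles whose title contains the phrase
	 * List<Integer> ids = helper.query("divina commedia", "title", 10);
	 * }</pre>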
	 */
	public List<Integer> query(String query, String field, int n) {
		searcher = getSearcher();
		TopScoreDocCollector collector = TopScoreDocCollector.create(n, true);
		List<Integer> results = new ArrayList<Integer>();
		Query q = null;

		try {
			q = new QueryParser(Version.LUCENE_41, field, new StandardAnalyzer(
					Version.LUCENE_41)).parse("\"" + query + "\"");
		} catch (ParseException e) {
			logger.error("querying the index: {} ", e.toString());
			return results;
		}

		try {
			searcher.search(q, collector);
		} catch (IOException e) {
			logger.error("querying the index: {} ", e.toString());
			return results;
		}

		ScoreDoc[] hits = collector.topDocs().scoreDocs;
		for (int i = 0; i < hits.length; ++i) {
			int docId = hits[i].doc;
			results.add(getWikiId(docId));
		}

		logger.debug("query {} docs {}", query, results);
		return results;
	}

	/**
	 * @param query
	 *            the query text; only articles containing it as a phrase
	 *            (consecutive terms) are returned.
	 * @param field
	 *            the field on which the query must be performed (summary,
	 *            content, title ..).
	 * @return the wiki-ids of the top articles matching the query (at most
	 *         10000 results)
	 */
	public List<Integer> query(String query, String field) {
		return query(query, field, 10000);
	}

	/**
	 * @return the wiki-ids of the top articles matching the query in the
	 *         default field (content)
	 */
	public List<Integer> query(String query) {
		return query(query, LUCENE_ARTICLE_DEFAULT_FIELD);
	}

	/**
	 * Retrieves an article from the index
	 * 
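	 * A minimal usage sketch (the wiki-id is a placeholder; helper is the
	 * instance returned by getDexterLuceneHelper()):
	 * 
	 * <pre>{@code
	 * Article a = helper.getArticle(42); // placeholder wiki-id
	 * String title = a.getTitle();
	 * }</pre>
	 * 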
	 * @param id
	 *            - the Wikipedia Id of the Article
	 * @return the article from the index (title, wiki-title, summary and
	 *         content)
	 */
	public Article getArticle(int id) {
		Article a = new Article();
		a.setWikiId(id);

		Document d = getDoc(id);
		if (d != null) {
			List<String> paragraphs = new ArrayList<String>();
			paragraphs.add(d.getField(LUCENE_ARTICLE_CONTENT).stringValue());
			a.setTitle(d.getField(LUCENE_ARTICLE_TITLE).stringValue());
			a.setWikiTitle(d.getField(LUCENE_ARTICLE_WIKI_TITLE).stringValue());
			a.setSummary(d.getField(LUCENE_ARTICLE_SUMMARY).stringValue());

			a.setParagraphs(paragraphs);
		}
		//
		return a;

	}

	/**
	 * Retrieves only the article summary and the title from the index
	 * 
	 * @param id
	 *            - the Wikipedia Id of the Article
	 * @return the article from the index, containing only its title,
	 *         wiki-title and summary
	 */
	public Article getArticleSummary(int id) {
		Article a = new Article();
		a.setWikiId(id);

		Document d = getDoc(id);
		if (d != null) {
			a.setWikiTitle(d.getField(LUCENE_ARTICLE_WIKI_TITLE).stringValue());
			a.setTitle(d.getField(LUCENE_ARTICLE_TITLE).stringValue());
			a.setSummary(d.getField(LUCENE_ARTICLE_SUMMARY).stringValue());
		}
		//
		return a;

	}

	public int getWikiId(int luceneId) {
		IndexReader reader = getReader();

		// System.out.println("get docId "+pos);

		Document doc = null;
		try {
			doc = reader.document(luceneId);
		} catch (Exception e) {
			logger.error("retrieving doc in position {} {}", luceneId,
					e.toString());
			System.exit(-1);
		}
		return Integer.parseInt(doc.get(LUCENE_ARTICLE_ID));
	}

	/**
	 * 
	 * Scores a list of entities by their similarity with the string context;
	 * the scores can then be used to rank the entities.
	 * 
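	 * A minimal usage sketch (the spot and the candidate entity list are
	 * assumed to come from an earlier spotting step, they are not built here;
	 * helper is the instance returned by getDexterLuceneHelper()):
	 * 
	 * <pre>{@code
	 * // each entity gets a score of 0.5 plus its similarity with the context,
	 * // when the entity is present in the index
	 * helper.rankBySimilarity(spot, candidates, contextText, "content");
	 * }</pre>
	 * 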
	 * @param spot
	 *            - the spot for which the entities are scored
	 * @param eml
	 *            - the entity list to score
	 * @param context
	 *            - the context text; entities are scored based on their
	 *            similarity with the context.
	 * @param field
	 *            - score each entity based on the similarity between the text
	 *            of this field and the context.
	 * 
	 */
	@SuppressWarnings("null")
	public void rankBySimilarity(SpotMatch spot, EntityMatchList eml,
			String context, String field) {

		if (context.trim().isEmpty()) {
			logger.warn("no context for spot {}", spot.getMention());
			return;
		}

		Query q = null;

		try {
			// removing all not alphanumerical chars
			context = context.replaceAll("[^A-Za-z0-9 ]", " ");

			q = new QueryParser(Version.LUCENE_41, "content",
					new StandardAnalyzer(Version.LUCENE_41)).parse(QueryParser
					.escape(context));
		} catch (ParseException e) {
			// q is still null here, so it cannot be inspected
			logger.error("parsing the context query: {} ", e.toString());
			return;
		}

		for (EntityMatch e : eml) {
			Integer luceneId = getLuceneId(e.getId());
			float score = 0.5f;
			// smoothing
			if (luceneId == null || luceneId < 0) {
				// logger.warn("no docs in lucene for wiki id {}, ignoring",
				// e.id());
			} else {
				score += getSimilarity(q, e.getId());

			}
			e.setScore(score);
		}

		return;

	}

	/**
	 * 
	 * Scores a list of entities by their similarity (computed on the full
	 * text) with the string context.
	 * 
	 * @param spot
	 *            - the spot for which the entities are scored
	 * @param eml
	 *            - the entity list to score
	 * @param context
	 *            - the context text; entities are scored based on their
	 *            similarity with the context.
	 * 
	 * 
	 */
	public void rankBySimilarity(SpotMatch spot, EntityMatchList eml,
			String context) {
		rankBySimilarity(spot, eml, context, LUCENE_ARTICLE_DEFAULT_FIELD);
		return;

	}

}