// Java source code of TripleIndexCreatorContext

package org.aksw.agdistis.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.jena.query.Query;
import org.apache.jena.query.QueryExecutionFactory;
import org.apache.jena.query.QueryFactory;
import org.apache.jena.query.QuerySolution;
import org.apache.jena.query.ResultSet;
import org.apache.jena.query.Syntax;
import org.apache.jena.riot.WebContent;
import org.apache.jena.sparql.engine.http.QueryEngineHTTP;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.Version;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.RDFParser;
import org.openrdf.rio.helpers.RDFHandlerBase;
import org.openrdf.rio.turtle.TurtleParser;
import org.slf4j.LoggerFactory;

import info.aduna.io.FileUtil;

/**
 * Builds a Lucene index with one document per subject URI and accumulates the
 * objects of all triples with that subject in the {@link #FIELD_NAME_CONTEXT}
 * field.
 *
 * <p>
 * All index resources (writer, reader, searcher, directory) and the
 * configuration values are held in static fields, so every instance of this
 * class shares the same index state. The class performs no synchronization.
 */
public class TripleIndexCreatorContext {

	private static final org.slf4j.Logger log = LoggerFactory.getLogger(TripleIndexCreatorContext.class);

	public static final String N_TRIPLES = "NTriples";
	public static final String TTL = "ttl";
	public static final Version LUCENE_VERSION = Version.LUCENE_44;

	private static Analyzer urlAnalyzer;
	private static Analyzer literalAnalyzer;
	private static DirectoryReader ireader;
	private static IndexWriter iwriter;
	private static MMapDirectory directory;
	private static IndexSearcher isearcher;
	private static String nodeType;
	private static String baseURI;
	private static String endpoint;
	public static final String FIELD_NAME_CONTEXT = "CONTEXT";
	public static final String FIELD_NAME_SURFACE_FORM = "SURFACE_FORM";
	public static final String FIELD_NAME_URI = "URI";
	public static final String FIELD_NAME_URI_COUNT = "URI_COUNT";

	/**
	 * Command line entry point. Reads the configuration from
	 * {@code src/main/resources/config/agdistis.properties} (each value can be
	 * overridden by an {@code AGDISTIS_*} environment variable), indexes the TTL
	 * files of the configured folder and then applies the TTL files of its
	 * {@code update} subfolder.
	 *
	 * @param args
	 *            must be empty; any argument aborts the run with an error message
	 */
	public static void main(String args[]) {
		if (args.length > 0) {
			log.error("TripleIndexCreator works without parameters. Please use agdistis.properties File");
			return;
		}
		TripleIndexCreatorContext ic = new TripleIndexCreatorContext();
		try {
			log.info("For using DBpedia we suggest you downlaod the following file: " + "labels_<LANG>.ttl, "
					+ "redirects_transitive_<LANG>.ttl, " + "instance_types_<LANG>.ttl, "
					+ "mappingbased_properties_<LANG>.ttl, " + "specific_mappingbased_properties_<LANG>.ttl,"
					+ "disambiguations_<LANG>.ttl." + ""
					+ "Please download them into one folder and configure it in the agdistis.properties File."
					+ "For further information have a look at our wiki: https://github.com/AKSW/AGDISTIS/wiki");

			Properties prop = new Properties();
			// try-with-resources: the configuration stream was previously never closed
			try (InputStream input = new FileInputStream("src/main/resources/config/agdistis.properties")) {
				prop.load(input);
			}

			String index = readSetting("AGDISTIS_INDEX_BY_CONTEXT", prop, "index_bycontext");
			log.info("The index will be here: " + index);
			nodeType = readSetting("AGDISTIS_NODE_TYPE", prop, "nodeType");
			String folder = readSetting("AGDISTIS_FOLDER_WITH_TTL_FILES", prop, "folderWithTTLFiles");
			if (index == null || folder == null) {
				log.error("Missing configuration: 'index_bycontext' and 'folderWithTTLFiles' must both be set.");
				return;
			}
			log.info("Getting triple data from: " + folder);
			List<File> listOfFiles = listTtlFiles(new File(folder));

			String folderUpdate = folder + "/update/";
			log.info("Getting triple data from: " + folderUpdate);
			List<File> listOfFiles2 = listTtlFiles(new File(folderUpdate));

			baseURI = readSetting("AGDISTIS_BASE_URI", prop, "baseURI");
			log.info("Setting Base URI to: " + baseURI);

			endpoint = readSetting("AGDISTIS_ENDPOINT", prop, "endpoint");
			log.info("Setting Endpoint to: " + endpoint);

			ic.createIndex(listOfFiles, index, baseURI);
			if (directory == null) {
				// createIndex logs its own failures; without a directory nothing can be updated
				log.error("Index directory could not be opened, skipping update step.");
				return;
			}
			refreshSearcher();
			ic.updateIndex(listOfFiles2, baseURI, endpoint);
			ic.close();
			log.info("Finished");
		} catch (IOException e) {
			log.error("Error while creating index. Maybe the index is corrupt now.", e);
		} finally {
			// close() is idempotent, so this only releases what is still open after a failure
			try {
				ic.close();
			} catch (IOException e) {
				log.error("Error while closing index resources.", e);
			}
		}
	}

	/**
	 * Returns the value of the environment variable {@code envName} or, if it is
	 * not set, the value of the property {@code key}.
	 *
	 * @return the configured value, or {@code null} if neither source defines it
	 */
	private static String readSetting(String envName, Properties prop, String key) {
		String fromEnvironment = System.getenv(envName);
		return fromEnvironment != null ? fromEnvironment : prop.getProperty(key);
	}

	/**
	 * Collects the direct children of {@code folder} whose name ends with
	 * {@code "ttl"}.
	 *
	 * @return the matching files; empty if the folder cannot be listed
	 */
	private static List<File> listTtlFiles(File folder) {
		List<File> ttlFiles = new ArrayList<File>();
		File[] entries = folder.listFiles();
		if (entries == null) {
			// listFiles() yields null for a missing path or a non-directory
			log.warn("Cannot list files in " + folder + " (missing or not a directory), skipping it.");
			return ttlFiles;
		}
		for (File entry : entries) {
			if (entry.getName().endsWith("ttl")) {
				ttlFiles.add(entry);
			}
		}
		return ttlFiles;
	}

	/**
	 * Makes the last committed index state visible to {@link #isearcher}. Opens
	 * the first reader on {@link #directory} if none exists; otherwise reopens the
	 * current reader only if the index changed and closes the replaced reader so
	 * that no reader is leaked.
	 *
	 * @throws IOException
	 *             if the index cannot be read
	 */
	private static void refreshSearcher() throws IOException {
		DirectoryReader current = ireader;
		DirectoryReader refreshed = (current == null) ? DirectoryReader.open(directory)
				: DirectoryReader.openIfChanged(current);
		if (refreshed == null) {
			// index unchanged since the current reader was opened
			return;
		}
		ireader = refreshed;
		isearcher = new IndexSearcher(refreshed);
		if (current != null) {
			current.close();
		}
	}

	/**
	 * Opens (and if necessary creates) the index in {@code idxDirectory} and
	 * indexes every file of {@code files} whose extension is {@link #TTL}.
	 * Resources left open by an earlier call are closed first. Failures are
	 * logged, not thrown.
	 *
	 * @param files
	 *            candidate files; files with another extension are skipped
	 * @param idxDirectory
	 *            path of the index directory
	 * @param baseURI
	 *            base URI handed to the Turtle parser
	 */
	public void createIndex(List<File> files, String idxDirectory, String baseURI) {
		try {
			// release writer/reader/directory of a previous run before replacing them
			close();

			urlAnalyzer = new SimpleAnalyzer(LUCENE_VERSION);
			literalAnalyzer = new LiteralAnalyzer(LUCENE_VERSION);
			Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
			mapping.put(FIELD_NAME_URI, urlAnalyzer);
			mapping.put(FIELD_NAME_SURFACE_FORM, literalAnalyzer);
			mapping.put(FIELD_NAME_URI_COUNT, literalAnalyzer);
			mapping.put(FIELD_NAME_CONTEXT, literalAnalyzer);
			PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(urlAnalyzer, mapping);

			File indexDirectory = new File(idxDirectory);
			// mkdirs() also creates missing parent directories
			if (!indexDirectory.mkdirs() && !indexDirectory.isDirectory()) {
				throw new IOException("Cannot create index directory: " + indexDirectory);
			}
			directory = new MMapDirectory(indexDirectory);
			IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, perFieldAnalyzer);
			iwriter = new IndexWriter(directory, config);
			iwriter.commit();
			// open the searcher right away so the first lookup does not hit a null searcher
			refreshSearcher();
			for (File file : files) {
				String type = FileUtil.getFileExtension(file.getName());
				// constant first: the extension lookup may yield null
				if (TTL.equals(type)) {
					indexTTLFile(file, baseURI);
				}
				iwriter.commit();
			}
		} catch (Exception e) {
			log.error("Error while creating TripleIndex.", e);
		}
	}

	/**
	 * Parses one Turtle file and feeds every statement to an
	 * {@link OnlineStatementHandler}. The file is passed to the parser as a byte
	 * stream and is closed when parsing ends.
	 */
	private void indexTTLFile(File file, String baseURI)
			throws RDFParseException, RDFHandlerException, FileNotFoundException, IOException {
		log.info("Start parsing: " + file);
		RDFParser parser = new TurtleParser();
		parser.setRDFHandler(new OnlineStatementHandler());
		parser.setStopAtFirstError(false);
		try (InputStream in = new FileInputStream(file)) {
			parser.parse(in, baseURI);
		}
		log.info("Finished parsing: " + file);
	}

	/**
	 * Statement handler that writes each parsed statement to the index, commits
	 * and refreshes the searcher, so the next statement can find the document
	 * that was just written.
	 */
	private class OnlineStatementHandler extends RDFHandlerBase {
		@Override
		public void handleStatement(Statement st) {
			String subject = st.getSubject().stringValue();
			String predicate = st.getPredicate().stringValue();
			String object = st.getObject().stringValue();
			try {
				addDocumentToIndex(subject, predicate, object, st.getObject() instanceof URI);
				iwriter.commit();
				refreshSearcher();
			} catch (IOException e) {
				log.error("Error while indexing statement: " + st, e);
			}
		}
	}

	/**
	 * Adds a new document for {@code subject}, or, if a document with that URI is
	 * already indexed, replaces it by a copy whose context is extended by
	 * {@code object}.
	 *
	 * @param subject
	 *            subject URI, used as document key
	 * @param predicate
	 *            predicate URI (only logged)
	 * @param object
	 *            object value of the triple
	 * @param isUri
	 *            whether the object is a URI rather than a literal
	 * @throws IOException
	 *             if the index cannot be read or written
	 */
	private void addDocumentToIndex(String subject, String predicate, String object, boolean isUri) throws IOException {
		log.info("here again");
		// search() never throws and never returns null
		List<Triple> triples = search(subject, null, null, 100);

		if (triples.isEmpty()) {
			Document doc = new Document();
			log.debug(subject + " " + predicate + " " + object);
			doc.add(new StringField(FIELD_NAME_URI, subject, Store.YES));
			doc.add(new TextField(FIELD_NAME_SURFACE_FORM, object, Store.YES));
			doc.add(new TextField(FIELD_NAME_URI_COUNT, "1", Store.YES));
			doc.add(new TextField(FIELD_NAME_CONTEXT, object, Store.YES));
			iwriter.addDocument(doc);
			return;
		}

		// getFromIndex() stores the Lucene document id in Triple.subject, the URI in
		// Triple.predicate and the context in Triple.object
		Triple existing = triples.get(0);
		String docID = existing.subject;
		log.info(triples.toString());
		if (isUri) {
			object = contextForUriObject(subject, object);
		}
		String remainContext = existing.object.concat(" " + object);
		log.info(remainContext);
		Document hitDoc = isearcher.doc(Integer.parseInt(docID));
		Document newDoc = new Document();
		newDoc.add(new StringField(FIELD_NAME_URI, existing.predicate, Store.YES));
		newDoc.add(new TextField(FIELD_NAME_SURFACE_FORM, hitDoc.get(FIELD_NAME_SURFACE_FORM), Store.YES));
		newDoc.add(new TextField(FIELD_NAME_URI_COUNT, "1", Store.YES));
		newDoc.add(new TextField(FIELD_NAME_CONTEXT, remainContext, Store.YES));
		iwriter.updateDocument(new Term(FIELD_NAME_URI, subject), newDoc);
	}

	/**
	 * Determines the text appended to the context for a URI object: without a
	 * configured endpoint the object URI with {@link #nodeType} removed, otherwise
	 * the label returned by {@link #sparql(String)}. If the endpoint has no label,
	 * the stripped URI is used so that no {@code "null"} text ends up in the index.
	 */
	private String contextForUriObject(String subject, String object) {
		String stripped = (nodeType == null) ? object : object.replace(nodeType, "");
		if (endpoint == null || endpoint.isEmpty()) {
			log.info("endpoint empty");
			return stripped;
		}
		// NOTE(review): the label of the subject is queried although the object URI is
		// being described; kept as in the original code - confirm whether
		// sparql(object) was intended.
		String label = sparql(subject);
		log.info("endpoint working");
		return label != null ? label : stripped;
	}

	/**
	 * Indexes every file of {@code files} whose extension is {@link #TTL} into the
	 * index opened by {@link #createIndex(List, String, String)}. Failures are
	 * logged, not thrown.
	 *
	 * @param files
	 *            candidate files; files with another extension are skipped
	 * @param baseURI
	 *            base URI handed to the Turtle parser
	 * @param endpoint
	 *            SPARQL endpoint used for label lookups; {@code null} or empty
	 *            disables the lookups
	 */
	public void updateIndex(List<File> files, String baseURI, String endpoint) {
		log.info("UpdateIndexBegin");
		// the parameter used to be ignored because it shadows the static field
		TripleIndexCreatorContext.endpoint = endpoint;
		try {
			for (File file : files) {
				String type = FileUtil.getFileExtension(file.getName());
				if (TTL.equals(type)) {
					indexTTLFile(file, baseURI);
				}
				iwriter.commit();
			}
		} catch (Exception e) {
			log.error("Error while updating TripleIndex.", e);
		}

	}

	/**
	 * Looks up the indexed documents whose URI field equals {@code subject}.
	 *
	 * @param subject
	 *            URI to look up; {@code null} produces a query without clauses
	 * @param predicate
	 *            not used
	 * @param object
	 *            not used
	 * @param maxNumberOfResults
	 *            maximum number of hits to return
	 * @return one {@link Triple} per hit holding (document id, URI, context);
	 *         empty if the lookup fails or no searcher is open; never
	 *         {@code null}
	 */
	public List<Triple> search(String subject, String predicate, String object, int maxNumberOfResults) {
		BooleanQuery bq = new BooleanQuery();
		try {
			if (subject != null && subject.equals("http://aksw.org/notInWiki")) {
				log.error(
						"A subject 'http://aksw.org/notInWiki' is searched in the index. That is strange and should not happen");
			}
			if (subject != null) {
				TermQuery tq = new TermQuery(new Term(FIELD_NAME_URI, subject));
				bq.add(tq, BooleanClause.Occur.MUST);
			}
			return getFromIndex(maxNumberOfResults, bq);
		} catch (Exception e) {
			log.error(e.getLocalizedMessage() + " -> " + subject, e);
			return new ArrayList<Triple>();
		}
	}

	/**
	 * Runs {@code bq} against the current searcher and converts every hit into a
	 * {@link Triple} of (Lucene document id, URI field, context field).
	 *
	 * @return the hits; empty if no searcher has been opened yet
	 * @throws IOException
	 *             if the index cannot be read
	 */
	private List<Triple> getFromIndex(int maxNumberOfResults, BooleanQuery bq) throws IOException {
		List<Triple> triples = new ArrayList<Triple>();
		if (isearcher == null) {
			return triples;
		}
		ScoreDoc[] hits = isearcher.search(bq, null, maxNumberOfResults).scoreDocs;
		for (ScoreDoc hit : hits) {
			Document hitDoc = isearcher.doc(hit.doc);
			String s = String.valueOf(hit.doc);
			String p = hitDoc.get(FIELD_NAME_URI);
			String o = hitDoc.get(FIELD_NAME_CONTEXT);
			triples.add(new Triple(s, p, o));
		}
		log.debug("\t finished asking index...");
		return triples;
	}

	/**
	 * Asks the configured SPARQL endpoint for the English {@code rdfs:label} of
	 * {@code subject}.
	 *
	 * @param subject
	 *            URI of the resource; inserted into the query text unescaped
	 * @return the lexical form of the last label in the result, or {@code null}
	 *         if the result is empty
	 */
	public String sparql(String subject) {

		String ontology_service = endpoint;

		// English rdfs:label values of the given resource
		String endpointsSparql = "select ?label where {<" + subject
				+ "> <http://www.w3.org/2000/01/rdf-schema#label> ?label FILTER (lang(?label) = 'en')} LIMIT 100";

		Query sparqlQuery = QueryFactory.create(endpointsSparql, Syntax.syntaxARQ);
		QueryEngineHTTP qexec = (QueryEngineHTTP) QueryExecutionFactory.sparqlService(ontology_service, sparqlQuery);
		try {
			qexec.setModelContentType(WebContent.contentTypeRDFXML);
			ResultSet results = qexec.execSelect();
			String property = null;
			while (results.hasNext()) {
				QuerySolution qs = results.next();
				property = qs.getLiteral("?label").getLexicalForm();
			}
			return property;
		} finally {
			// release the HTTP resources of the query execution
			qexec.close();
		}

	}

	/**
	 * Closes writer, reader and directory. Each resource is closed even if
	 * closing an earlier one fails, and the fields are reset so that calling this
	 * method again is a no-op.
	 *
	 * @throws IOException
	 *             if closing a resource fails
	 */
	public void close() throws IOException {
		try {
			if (iwriter != null) {
				iwriter.close();
			}
		} finally {
			iwriter = null;
			try {
				if (ireader != null) {
					ireader.close();
				}
			} finally {
				ireader = null;
				isearcher = null;
				try {
					if (directory != null) {
						directory.close();
					}
				} finally {
					directory = null;
				}
			}
		}
	}
}