package org.aksw.agdistis.util; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Properties; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.SimpleAnalyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.util.Version; import org.openrdf.model.Statement; import org.openrdf.model.URI; import org.openrdf.rio.RDFHandlerException; import org.openrdf.rio.RDFParseException; import org.openrdf.rio.RDFParser; import org.openrdf.rio.helpers.RDFHandlerBase; import org.openrdf.rio.turtle.TurtleParser; import org.slf4j.LoggerFactory; import info.aduna.io.FileUtil; public class TripleIndexCreator { private static org.slf4j.Logger log = LoggerFactory.getLogger(TripleIndexCreator.class); public static final String N_TRIPLES = "NTriples"; public static final String TTL = "ttl"; public static final String TSV = "tsv"; public static final Version LUCENE_VERSION = Version.LUCENE_44; private Analyzer urlAnalyzer; private Analyzer literalAnalyzer; private DirectoryReader ireader; private IndexWriter iwriter; private MMapDirectory directory; public static void main(String args[]) { if (args.length > 0) { log.error("TripleIndexCreator works without parameters. Please use agdistis.properties File"); return; } try { log.info("For using DBpedia we suggest you downlaod the following file: " + "labels_<LANG>.ttl, " + "redirects_transitive_<LANG>.ttl, " + "instance_types_<LANG>.ttl, " + "mappingbased_properties_<LANG>.ttl, " + "specific_mappingbased_properties_<LANG>.ttl," + "disambiguations_<LANG>.ttl." + "" + "Please download them into one folder and configure it in the agdistis.properties File." + "For further information have a look at our wiki: https://github.com/AKSW/AGDISTIS/wiki"); Properties prop = new Properties(); InputStream input = new FileInputStream("src/main/resources/config/agdistis.properties"); prop.load(input); String envIndex = System.getenv("AGDISTIS_INDEX"); String index = envIndex != null ? envIndex : prop.getProperty("index"); log.info("The index will be here: " + index); String envFolderWithTtlFiles = System.getenv("AGDISTIS_FOLDER_WITH_TTL_FILES"); String folder = envFolderWithTtlFiles != null ? envFolderWithTtlFiles : prop.getProperty("folderWithTTLFiles"); log.info("Getting triple data from: " + folder); List<File> listOfFiles = new ArrayList<File>(); for (File file : new File(folder).listFiles()) { if (file.getName().endsWith("ttl")) { listOfFiles.add(file); } } String envSurfaceFormTsv = System.getenv("AGDISTIS_SURFACE_FORM_TSV"); String surfaceFormTSV = envSurfaceFormTsv != null ? envSurfaceFormTsv : prop.getProperty("surfaceFormTSV"); log.info("Getting surface forms from: " + surfaceFormTSV); File file = new File(surfaceFormTSV); if (file.exists()) { listOfFiles.add(file); } String envBaseUri = System.getenv("AGDISTIS_BASE_URI"); String baseURI = envBaseUri != null ? envBaseUri : prop.getProperty("baseURI"); log.info("Setting Base URI to: " + baseURI); TripleIndexCreator ic = new TripleIndexCreator(); ic.createIndex(listOfFiles, index, baseURI); ic.close(); } catch (IOException e) { log.error("Error while creating index. Maybe the index is corrupt now.", e); } } public void createIndex(List<File> files, String idxDirectory, String baseURI) { try { urlAnalyzer = new SimpleAnalyzer(LUCENE_VERSION); literalAnalyzer = new LiteralAnalyzer(LUCENE_VERSION); Map<String, Analyzer> mapping = new HashMap<String, Analyzer>(); mapping.put(TripleIndex.FIELD_NAME_SUBJECT, urlAnalyzer); mapping.put(TripleIndex.FIELD_NAME_PREDICATE, urlAnalyzer); mapping.put(TripleIndex.FIELD_NAME_OBJECT_URI, urlAnalyzer); mapping.put(TripleIndex.FIELD_NAME_OBJECT_LITERAL, literalAnalyzer); PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(urlAnalyzer, mapping); File indexDirectory = new File(idxDirectory); indexDirectory.mkdir(); directory = new MMapDirectory(indexDirectory); IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, perFieldAnalyzer); iwriter = new IndexWriter(directory, config); iwriter.commit(); for (File file : files) { String type = FileUtil.getFileExtension(file.getName()); if (type.equals(TTL)) indexTTLFile(file, baseURI); if (type.equals(TSV)) indexTSVFile(file); iwriter.commit(); } iwriter.close(); ireader = DirectoryReader.open(directory); } catch (Exception e) { log.error("Error while creating TripleIndex.", e); } } private void indexTTLFile(File file, String baseURI) throws RDFParseException, RDFHandlerException, FileNotFoundException, IOException { log.info("Start parsing: " + file); RDFParser parser = new TurtleParser(); OnlineStatementHandler osh = new OnlineStatementHandler(); parser.setRDFHandler(osh); parser.setStopAtFirstError(false); if (baseURI == null) { parser.parse(new FileReader(file), ""); } else { parser.parse(new FileReader(file), baseURI); } log.info("Finished parsing: " + file); } private void indexTSVFile(File file) throws IOException { log.info("Start parsing: " + file); BufferedReader br = new BufferedReader(new FileReader(file)); while (br.ready()) { String[] line = br.readLine().split("\t"); String subject = line[0]; for (int i = 1; i < line.length; ++i) { String object = line[i]; Document doc = new Document(); doc.add(new StringField(TripleIndex.FIELD_NAME_SUBJECT, subject, Store.YES)); doc.add(new StringField(TripleIndex.FIELD_NAME_PREDICATE, "http://www.w3.org/2004/02/skos/core#altLabel", Store.YES)); doc.add(new TextField(TripleIndex.FIELD_NAME_OBJECT_LITERAL, object, Store.YES)); iwriter.addDocument(doc); } } br.close(); log.info("Finished parsing: " + file); } private void addDocumentToIndex(IndexWriter iwriter, String subject, String predicate, String object, boolean isUri) throws IOException { Document doc = new Document(); log.debug(subject + " " + predicate + " " + object); doc.add(new StringField(TripleIndex.FIELD_NAME_SUBJECT, subject, Store.YES)); doc.add(new StringField(TripleIndex.FIELD_NAME_PREDICATE, predicate, Store.YES)); if (isUri) { doc.add(new StringField(TripleIndex.FIELD_NAME_OBJECT_URI, object, Store.YES)); } else { doc.add(new TextField(TripleIndex.FIELD_NAME_OBJECT_LITERAL, object, Store.YES)); } iwriter.addDocument(doc); } public void close() throws IOException { if (ireader != null) { ireader.close(); } if (directory != null) { directory.close(); } } private class OnlineStatementHandler extends RDFHandlerBase { @Override public void handleStatement(Statement st) { String subject = st.getSubject().stringValue(); String predicate = st.getPredicate().stringValue(); String object = st.getObject().stringValue(); try { addDocumentToIndex(iwriter, subject, predicate, object, st.getObject() instanceof URI); } catch (IOException e) { e.printStackTrace(); } } } }