package pitt.search.lucene; import static pitt.search.semanticvectors.LuceneUtils.LUCENE_VERSION; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.FSDirectory; import pitt.search.semanticvectors.FlagConfig; import pitt.search.semanticvectors.utils.VerbatimLogger; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.nio.file.FileSystems; import java.nio.file.Files; import java.nio.file.Path; /** * This class takes as input a single text file with each line following the format * <subject>\t<predicate>\t<object> and produces a Lucene index, in which each * "document" is a single tab-delimited predication (or triple) with the fields subject, * predicate, and object. */ public class LuceneIndexFromTriples { private LuceneIndexFromTriples() {} static Path INDEX_DIR = FileSystems.getDefault().getPath("predication_index"); /** Index all text files under a directory. */ public static void main(String[] args) { String usage = "java pitt.search.lucene.LuceneIndexFromTriples [triples text file] "; if (args.length == 0) { System.err.println("Usage: " + usage); System.exit(1); } FlagConfig flagConfig = FlagConfig.getFlagConfig(args); // Allow for the specification of a directory to write the index to. if (flagConfig.luceneindexpath().length() > 0) { INDEX_DIR = FileSystems.getDefault().getPath(flagConfig.luceneindexpath()); } if (Files.exists(INDEX_DIR)) { throw new IllegalArgumentException( "Cannot save index to '" + INDEX_DIR + "' directory, please delete it first"); } try { // Create IndexWriter using WhiteSpaceAnalyzer without any stopword list. IndexWriterConfig writerConfig = new IndexWriterConfig(new WhitespaceAnalyzer()); IndexWriter writer = new IndexWriter(FSDirectory.open(INDEX_DIR), writerConfig); final File triplesTextFile = new File(args[0]); if (!triplesTextFile.exists() || !triplesTextFile.canRead()) { writer.close(); throw new IOException("Document file '" + triplesTextFile.getAbsolutePath() + "' does not exist or is not readable, please check the path"); } System.out.println("Indexing to directory '" +INDEX_DIR+ "'..."); indexDoc(writer, triplesTextFile); writer.close(); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } } /** * This class indexes the file passed as a parameter, writing to the index passed as a parameter. * Each predication is indexed as an individual document, with the fields "subject", "predicate", and "object" * @throws IOException */ static void indexDoc(IndexWriter fsWriter, File triplesTextFile) throws IOException { BufferedReader theReader = new BufferedReader(new FileReader(triplesTextFile)); int linecnt = 0; String lineIn; while ((lineIn = theReader.readLine()) != null) { java.util.StringTokenizer theTokenizer = new java.util.StringTokenizer(lineIn,"\t"); // Output progress counter. if( ( ++linecnt % 10000 == 0 ) || ( linecnt < 10000 && linecnt % 1000 == 0 ) ){ VerbatimLogger.info((linecnt) + " ... "); } try { if (theTokenizer.countTokens() < 3) { VerbatimLogger.warning( "Line in predication file does not have three delimited fields: " + lineIn + "\n"); continue; } String subject = theTokenizer.nextToken().trim().toLowerCase().replaceAll(" ", "_"); String predicate = theTokenizer.nextToken().trim().toUpperCase().replaceAll(" ", "_"); String object = theTokenizer.nextToken().trim().toLowerCase().replaceAll(" ", "_"); Document doc = new Document(); doc.add(new TextField("subject", subject, Field.Store.YES)); doc.add(new TextField("predicate", predicate, Field.Store.YES)); doc.add(new TextField("object", object, Field.Store.YES)); doc.add(new TextField("predication",subject+predicate+object, Field.Store.NO)); fsWriter.addDocument(doc); } catch (Exception e) { System.out.println(lineIn); e.printStackTrace(); } } VerbatimLogger.info("\n"); // Newline after line counter prints. theReader.close(); } }