/** Copyright 2008, Google Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Google Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**/

package pitt.search.lucene;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Date;

import static pitt.search.semanticvectors.LuceneUtils.LUCENE_VERSION;

/**
 * Index pairs of bilingual texts in a parallel corpus. See <a
 * href="http://code.google.com/p/semanticvectors/wiki/BilingualModels">
 * http://code.google.com/p/semanticvectors/wiki/BilingualModels</a>
 * for more thorough documentation of preparation of corpora and
 * creation of models.
 *
 * <p>The two corpus directories are expected to mirror each other: a file
 * in the first directory is paired with the file of the same relative name
 * in the second directory, and each pair becomes one Lucene document.
 */
public class IndexBilingualFiles {
  /** Location the index is written to; indexing refuses to start if it already exists. */
  static Path INDEX_DIR = FileSystems.getDefault().getPath("bilingual_index");

  /**
   * Path of the first corpus directory as given to the constructor. It is
   * also used verbatim as the suffix of the first contents field name
   * ({@code "contents_" + LANGUAGE1}).
   */
  String LANGUAGE1;

  /** Same as {@link #LANGUAGE1}, for the second corpus directory. */
  String LANGUAGE2;

  /**
   * Creates an indexer for a pair of parallel corpus directories.
   *
   * @param lang1 path of the directory holding the first language's files
   * @param lang2 path of the directory holding the second language's files
   */
  public IndexBilingualFiles(String lang1, String lang2) {
    LANGUAGE1 = lang1;
    LANGUAGE2 = lang2;
  }

  /**
   * Builds the bilingual index in {@link #INDEX_DIR} from the two corpus
   * directories and reports the elapsed time on standard output.
   *
   * <p>An {@link IOException} raised while indexing is reported on standard
   * output rather than propagated.
   *
   * @throws IllegalArgumentException if {@link #INDEX_DIR} already exists
   */
  private void runIndexer() {
    if (Files.exists(INDEX_DIR)) {
      throw new IllegalArgumentException(
          "Cannot save index to '" + INDEX_DIR + "' directory, please delete it first");
    }

    Date start = new Date();
    final File docDir1 = new File(LANGUAGE1);
    final File docDir2 = new File(LANGUAGE2);

    // Validate the corpus before the index directory is created; otherwise a
    // malformed corpus would leave an empty index behind that blocks the next run.
    checkCorpusDirectory(docDir1);
    checkCorpusDirectory(docDir2);

    IndexWriterConfig writerConfig = new IndexWriterConfig(new StandardAnalyzer());
    // try-with-resources closes the writer and the directory even if indexing fails.
    try (FSDirectory indexDirectory = FSDirectory.open(INDEX_DIR);
        IndexWriter writer = new IndexWriter(indexDirectory, writerConfig)) {
      System.out.println("Indexing to directory '" + INDEX_DIR + "'...");
      runDeepIndexer(docDir1, docDir2, writer);
      // No explicit optimize step is performed here; closing the writer
      // (at the end of this block) is what finalizes the index.
      System.out.println("Optimizing...");
    } catch (IOException e) {
      System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
      return;
    }

    Date end = new Date();
    System.out.println(end.getTime() - start.getTime() + " total milliseconds");
  }

  /**
   * Verifies that a corpus directory exists, is readable and is a directory.
   * Prints the failed test and terminates the JVM with status 1 otherwise.
   *
   * @param dir directory to check
   */
  private static void checkCorpusDirectory(File dir) {
    if (!dir.exists()) {
      System.out.println("Test directory exists failed: " + dir);
      System.exit(1);
    }
    if (!dir.canRead()) {
      System.out.println("Test readable failed: " + dir);
      System.exit(1);
    }
    if (!dir.isDirectory()) {
      System.out.println("Test is directory failed: " + dir);
      System.exit(1);
    }
  }

  /**
   * Recursively indexes every file pair found under two parallel directories.
   *
   * <p>Entries are paired by name, driven by the listing of {@code docDir1}.
   * A pair of sub-directories is descended into; any other pair is added to
   * the index as a single document. A failure on one pair is reported on
   * standard error and does not stop the remaining pairs.
   *
   * @param docDir1 directory of the first language
   * @param docDir2 directory of the second language
   * @param writer index writer the documents are added to
   */
  private void runDeepIndexer(File docDir1, File docDir2, IndexWriter writer) {
    // Run several tests to see if corpus is well formed.
    checkCorpusDirectory(docDir1);
    checkCorpusDirectory(docDir2);

    System.err.println("Trying to index files in directories:\n"
        + docDir1.getAbsolutePath() + "\n"
        + docDir2.getAbsolutePath());

    String[] files1 = docDir1.list();
    String[] files2 = docDir2.list();
    // File.list() returns null when the listing fails (e.g. an I/O error).
    if (files1 == null || files2 == null) {
      System.err.println("Could not list contents of directories; skipping:\n"
          + docDir1.getAbsolutePath() + "\n"
          + docDir2.getAbsolutePath());
      return;
    }
    // File.list() gives no ordering guarantee, so sort before comparing
    // the two listings position by position.
    Arrays.sort(files1);
    Arrays.sort(files2);

    if (!checkStringArraysEqual(files1, files2)) {
      System.err.println("Contents of directories don't match up; "
          + "not creating bilingual index.\n"
          + "Please check corpora contents, clean up your data, "
          + "and try again.");
      // Deliberately not fatal: indexing continues, and entries of docDir1
      // without a counterpart in docDir2 fail individually below.
    }

    for (String fileName : files1) {
      System.out.println("adding " + fileName);
      File newFile1 = new File(docDir1, fileName);
      File newFile2 = new File(docDir2, fileName);
      if (newFile1.isDirectory() && newFile2.isDirectory()) {
        runDeepIndexer(newFile1, newFile2, writer);
        // A directory pair has no contents of its own to index.
        continue;
      }
      try {
        writer.addDocument(fileBilingualDocument(newFile1, newFile2));
      } catch (IOException e) {
        System.err.println("Got exception with filepair: " + fileName);
        e.printStackTrace();
      }
    }
  }

  /**
   * Makes a Lucene document for a bilingual file pair.
   *
   * <p>The document has three fields:
   * <ul>
   * <li><code>filename</code>--path of {@code file1}, as a stored field;
   * to get the full path for each pair, add the language specific prefix.
   * <li><code>contents_LANGUAGE1</code>--containing the full contents of
   * the file in LANGUAGE1, as a Reader field; e.g., contents_en.
   * <li><code>contents_LANGUAGE2</code>--containing the full contents of
   * the file in LANGUAGE2, as a Reader field; e.g., contents_fr.
   * </ul>
   *
   * @param file1 file in the first language
   * @param file2 corresponding file in the second language
   * @return the document holding the file name and both contents fields
   * @throws java.io.IOException if either file cannot be opened for reading
   */
  protected Document fileBilingualDocument(File file1, File file2)
      throws java.io.IOException {
    // Open both readers up front so that a missing second file does not
    // leave the first reader open.
    // Note that FileReader expects the file to be in the system's default
    // encoding. If that's not the case searching for special characters
    // will fail.
    FileReader reader1 = new FileReader(file1);
    FileReader reader2;
    try {
      reader2 = new FileReader(file2);
    } catch (IOException e) {
      try {
        reader1.close();
      } catch (IOException closeFailure) {
        e.addSuppressed(closeFailure);
      }
      throw e;
    }

    // make a new, empty document
    Document doc = new Document();

    // Add the path of the file as a field named "filename". StoredField
    // keeps the value retrievable with the document; it is not tokenized.
    doc.add(new StoredField("filename", file1.getPath()));

    // Add the contents of the files to fields named "contents_LANGUAGE1"
    // and "contents_LANGUAGE2". A Reader is given, so that the text of the
    // file is tokenized and indexed, but not stored.
    // NOTE(review): the readers are presumably closed by Lucene once the
    // document has been consumed -- confirm against the Lucene version in use.
    doc.add(new TextField("contents_" + LANGUAGE1, reader1));
    doc.add(new TextField("contents_" + LANGUAGE2, reader2));

    // return the document
    return doc;
  }

  /**
   * Utility for checking if two lists of filenames are the same.
   *
   * <p>The comparison is positional. Empty arrays are reported as not
   * equal, as is a {@code null} array.
   *
   * @param array1 first list of names
   * @param array2 second list of names
   * @return {@code true} only if both arrays are non-null, non-empty, of the
   *     same length and equal element by element
   */
  static boolean checkStringArraysEqual(String[] array1, String[] array2) {
    if (array1 == null || array2 == null) {
      System.err.println("checkStringArraysEqual: one or both arrays are null!");
      return false;
    }
    if (array1.length != array2.length) {
      System.err.println("checkStringArraysEqual: arrays are of different lengths!");
      return false;
    }
    if (array1.length == 0) {
      System.err.println("checkStringArraysEqual: arrays are empty!");
      return false;
    }
    for (int i = 0; i < array1.length; ++i) {
      if (!array1[i].equals(array2[i])) {
        System.err.println("checkStringArraysEqual: following pairs differ: "
            + array1[i] + " " + array2[i]);
        return false;
      }
    }
    return true;
  }

  /**
   * Main function. Expects exactly two arguments, the two corpus
   * directories; prints a usage message and exits with status 1 otherwise.
   *
   * @param args command line arguments: {@code <directory1> <directory2>}
   */
  public static void main(String[] args) {
    String usage = "java pitt.search.lucene.IndexBilingualFiles <directory1> <directory2>";
    if (args.length != 2) {
      System.err.println("Usage: " + usage);
      System.exit(1);
    }
    IndexBilingualFiles indexer = new IndexBilingualFiles(args[0], args[1]);
    indexer.runIndexer();
  }
}