/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Contributors:
 *   Apache Software Foundation - org.apache.lucene.demo.IndexFiles
 *   Australian National University - adaptation to DaCapo test harness
 */
package org.dacapo.luindex;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.Arrays;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.store.FSDirectory;

/**
 * date: $Date: 2009-12-24 11:19:36 +1100 (Thu, 24 Dec 2009) $
 * id: $Id: Index.java 738 2009-12-24 00:19:36Z steveb-oss $
 */
public class Index {

  private final File scratch;

  public Index(File scratch) {
    this.scratch = scratch;
  }

  /**
   * Index all text files under a directory.
   */
  public void main(final File INDEX_DIR, final String[] args) throws IOException {
    IndexWriterConfig IWConfig = new IndexWriterConfig();
    IWConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IWConfig.setMergePolicy(new LogByteSizeMergePolicy());
    IndexWriter writer = new IndexWriter(FSDirectory.open(Paths.get(INDEX_DIR.getCanonicalPath())), IWConfig);

    for (int arg = 0; arg < args.length; arg++) {
      final File docDir = new File(args[arg]);
      if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
            + "' does not exist or is not readable, please check the path");
        throw new IOException("Cannot read from document directory");
      }

      indexDocs(writer, docDir);
      System.out.println("Optimizing...");
      writer.forceMerge(1);
    }
    writer.close();
  }

  /**
   * Index either a file or a directory tree.
   *
   * @param writer the IndexWriter to add documents to
   * @param file   the file or directory tree to index
   * @throws IOException
   */
  void indexDocs(IndexWriter writer, File file) throws IOException {
    /* Strip the absolute part of the path name from file name output */
    int scratchP = scratch.getCanonicalPath().length() + 1;

    /* do not try to index files that cannot be read */
    if (file.canRead()) {
      if (file.isDirectory()) {
        String[] files = file.list();
        // an IO error could occur
        if (files != null) {
          Arrays.sort(files);
          for (int i = 0; i < files.length; i++) {
            indexDocs(writer, new File(file, files[i]));
          }
        }
      } else {
        System.out.println("adding " + file.getCanonicalPath().substring(scratchP));
        try {
          Document doc = new Document();

          FieldType docFT = new FieldType();
          docFT.setTokenized(false);
          docFT.setStored(true);
          docFT.setIndexOptions(IndexOptions.DOCS);

          // Add the path of the file as a field named "path". Use a field that is
          // indexed (i.e. searchable), but don't tokenize the field into words.
          doc.add(new Field("path", file.getPath(), docFT));

          // Add the last modified date of the file as a field named "modified". Use
          // a field that is indexed (i.e. searchable), but don't tokenize the field
          // into words.
          doc.add(new Field("modified", DateTools.timeToString(file.lastModified(), DateTools.Resolution.MINUTE), docFT));

          // Add the contents of the file to a field named "contents". Specify a Reader,
          // so that the text of the file is tokenized and indexed, but not stored.
          // Note that FileReader expects the file to be in the system's default encoding.
          // If that's not the case searching for special characters will fail.
          docFT.setTokenized(true);
          docFT.setStored(false);
          doc.add(new Field("contents", new FileReader(file), docFT));

          writer.addDocument(doc);
        } catch (FileNotFoundException fnfe) {
          // At least on Windows, some temporary files raise this exception with an
          // "access denied" message; checking whether the file can be read doesn't help.
        }
      }
    }
  }
}
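/*
 * Usage sketch (illustrative only, not part of the benchmark harness): a minimal,
 * hypothetical driver showing how this class might be invoked. The directory paths
 * below are assumptions; in DaCapo the scratch directory and the document
 * directories are supplied by the harness.
 *
 *   File scratch = new File("/tmp/luindex-scratch");   // hypothetical scratch root
 *   File indexDir = new File(scratch, "index");        // where the Lucene index is written
 *   Index indexer = new Index(scratch);
 *   indexer.main(indexDir, new String[] { new File(scratch, "docs").getPath() });
 */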