/*
 * Copyright 2015 RONDHUIT Co.,LTD.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File
import java.io.FileInputStream
import java.io.InputStreamReader
import java.io.BufferedReader
import java.nio.file.FileSystems
import org.apache.lucene.index._
import org.apache.lucene.search.TermQuery
import org.nlp4l.core.analysis.Analyzer
import org.nlp4l.core.analysis.AnalyzerBuilder
import org.nlp4l.core._

import scalax.file.Path
import scalax.file.PathSet

// Directory in which the Lucene index is (re)created.
val index = "/tmp/index-ceeaus-all"

/**
 * Reads a text file and returns its lines in file order.
 *
 * @param fl       path of the file to read
 * @param encoding charset name used to decode the file
 * @return every line of the file, without line terminators
 */
def lines(fl: Path, encoding: String): List[String] = {
  val is = new FileInputStream(fl.path)
  try {
    // Build the reader inside the try so the stream is released even if the charset name is rejected.
    val br = new BufferedReader(new InputStreamReader(is, encoding))
    try {
      // readLine() returns null at end of input; collecting through an iterator avoids
      // the quadratic cost of repeatedly appending to an immutable List.
      Iterator.continually(br.readLine()).takeWhile(_ != null).toList
    } finally {
      br.close()
    }
  } finally {
    is.close()
  }
}

/**
 * Builds the document to be indexed for one corpus file.
 *
 * The "file" and "type" fields are taken from the fourth and third segments of the
 * file's path (files are listed from corpora/CEEAUS/PLAIN below). Japanese files are
 * decoded as "sjis" and stored in "body_ja"; all other files are decoded as "UTF-8"
 * and stored in both "body_en" and "body_ws".
 *
 * @param fl path of the corpus file
 * @param ja true when the file holds Japanese text
 * @return the document holding the file, type, cat and body field(s)
 */
def document(fl: Path, ja: Boolean): Document = {
  // String.split takes a regex: quote the separator so that "\" on Windows is matched literally.
  val ps: Array[String] = fl.path.split(java.util.regex.Pattern.quote(File.separator))
  val file = ps(3)
  val typ = ps(2)
  val cat = "all"
  val encoding = if (ja) "sjis" else "UTF-8"
  val body = lines(fl, encoding)
  val bodyFields =
    if (ja) Set(Field("body_ja", body))
    else Set(Field("body_en", body), Field("body_ws", body))
  Document(Set(Field("file", file), Field("type", typ), Field("cat", cat)) ++ bodyFields)
}

// delete existing Lucene index
val p = Path(new File(index))
p.deleteRecursively()

// write documents into an index
val schema = SchemaLoader.loadFile("examples/schema/ceeaus.conf")
val writer = IWriter(index, schema)

val c: PathSet[Path] = Path("corpora", "CEEAUS", "PLAIN").children()

// write English docs (every .txt file whose name does not contain "cjejus")
c.filter(e => e.name.indexOf("cjejus") < 0 && e.name.endsWith(".txt")).toList.sorted.foreach(g => writer.write(document(g, false)))

// write Japanese docs (every .txt file whose name contains "cjejus")
c.filter(e => e.name.indexOf("cjejus") >= 0 && e.name.endsWith(".txt")).toList.sorted.foreach(g => writer.write(document(g, true)))

writer.close

// search test
val searcher = ISearcher(index)
val results = searcher.search(query = new TermQuery(new Term("body_ja", "喫煙")), rows = 10)

results.foreach(doc => {
  printf("[DocID] %d: %s\n", doc.docId, doc.get("file"))
})

// search test for ch4
val results2 = searcher.search(query = new TermQuery(new Term("body_ws", "still,")), rows = 10)

results2.foreach(doc => {
  printf("[DocID] %d: %s\n", doc.docId, doc.get("file"))
})