/* * Copyright (c) 2014 RONDHUIT Co.,Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package com.rondhuit.w2v.lucene; import java.io.File; import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; import com.rondhuit.w2v.Config; import com.rondhuit.w2v.Corpus; public class LuceneIndexCorpus extends Corpus { private IndexReader reader; private final String field; private TopDocs topDocs; private final Analyzer analyzer; int tdPos; public LuceneIndexCorpus(Config config) throws IOException { super(config); LuceneIndexConfig liConfig = (LuceneIndexConfig)config; field = liConfig.getField(); analyzer = loadAnalyzer(liConfig.getAnalyzer()); Directory dir = FSDirectory.open(new File(liConfig.getIndexDir())); reader = DirectoryReader.open(dir); } static Analyzer loadAnalyzer(String fqcn){ try { return (Analyzer)Class.forName(fqcn).newInstance(); } catch (Exception e) { throw new RuntimeException(e); } } public LuceneIndexCorpus(Corpus cloneSrc) throws IOException { super(cloneSrc); LuceneIndexCorpus lic = (LuceneIndexCorpus)cloneSrc; config = lic.config; reader = lic.reader; field = lic.field; topDocs = lic.topDocs; analyzer = loadAnalyzer(((LuceneIndexConfig)config).getAnalyzer()); } @Override public void learnVocab() throws IOException { super.learnVocab(); final String field = ((LuceneIndexConfig)config).getField(); final Terms terms = MultiFields.getTerms(reader, field); final BytesRef maxTerm = terms.getMax(); final BytesRef minTerm = terms.getMin(); Query q = new TermRangeQuery(field, minTerm, maxTerm, true, true); IndexSearcher searcher = new IndexSearcher(reader); topDocs = searcher.search(q, Integer.MAX_VALUE); TermsEnum termsEnum = null; termsEnum = terms.iterator(termsEnum); termsEnum.seekCeil(new BytesRef()); BytesRef term = termsEnum.term(); while(term != null){ int p = addWordToVocab(term.utf8ToString()); vocab[p].setCn((int)termsEnum.totalTermFreq()); term = termsEnum.next(); } } TokenStream tokenStream = null; CharTermAttribute termAtt = null; String[] values = new String[]{}; int valPos = 0; @Override public void rewind(int numThreads, int id) throws IOException { super.rewind(numThreads, id); tdPos = topDocs.totalHits / numThreads * id; } @Override public String nextWord() throws IOException { while(true){ // check the tokenStream first if(tokenStream != null && tokenStream.incrementToken()){ return new String(termAtt.buffer(), 0, termAtt.length()); } if(tokenStream != null) tokenStream.close(); if(valPos < values.length){ tokenStream = analyzer.tokenStream(field, values[valPos++]); termAtt = tokenStream.getAttribute(CharTermAttribute.class); tokenStream.reset(); eoc = false; return null; } else{ if(tdPos >= topDocs.totalHits){ tokenStream = null; eoc = true; return null; // end of index == end of corpus } Document doc = reader.document(topDocs.scoreDocs[tdPos++].doc); values = doc.getValues(field); // This method returns an empty array when there are no matching fields. // It never returns null. valPos = 0; tokenStream = null; } } } @Override public void close() throws IOException { reader.close(); } }