package org.apache.lucene.analysis.ko; import org.apache.lucene.analysis.TokenStream; import org.openkoreantext.processor.OpenKoreanTextProcessor; import scala.collection.Iterator; import scala.collection.JavaConverters; import scala.collection.Seq; import java.util.Arrays; import static org.openkoreantext.processor.phrase_extractor.KoreanPhraseExtractor.KoreanPhrase; import static org.openkoreantext.processor.tokenizer.KoreanTokenizer.KoreanToken; /** * Phrase Extractor. For extracting phrase, it delegates token to {@link OpenKoreanTextProcessor} */ public class OpenKoreanTextPhraseExtractor extends OpenKoreanTextTokenFilter { public OpenKoreanTextPhraseExtractor(TokenStream input) { super(input); } @Override protected Seq<KoreanToken> perform(Seq<KoreanToken> tokens) { Seq<KoreanPhrase> phrases = OpenKoreanTextProcessor.extractPhrases(tokens, false, true); return convertPhrasesToTokens(phrases); } private Seq<KoreanToken> convertPhrasesToTokens(Seq<KoreanPhrase> phrases) { KoreanToken[] tokens = new KoreanToken[phrases.length()]; Iterator<KoreanPhrase> iterator = phrases.iterator(); int i = 0; while (iterator.hasNext()) { KoreanPhrase phrase = iterator.next(); tokens[i++] = new KoreanToken(phrase.text(), phrase.pos(), phrase.offset(), phrase.length(), scala.Option.apply(null), false); } Arrays.sort(tokens, (o1, o2) -> { if(o1.offset()== o2.offset()) return 0; return o1.offset()< o2.offset()? -1 : 1; }); return JavaConverters.asScalaBuffer(Arrays.asList(tokens)).toSeq(); } }