/**
 * Copyright 2009 Alexander Kuznetsov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.morphology;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.morphology.analyzer.MorphologyAnalyzer;
import org.apache.lucene.morphology.analyzer.MorphologyFilter;
import org.apache.lucene.morphology.english.EnglishAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.morphology.english.EnglishLuceneMorphology;
import org.apache.lucene.morphology.russian.RussianAnalyzer;
import org.apache.lucene.morphology.russian.RussianLuceneMorphology;
import org.junit.Test;

import java.io.*;
import java.util.*;

import static org.hamcrest.Matchers.equalTo;

/**
 * Tests for the English and Russian morphology analyzers and for
 * {@link MorphologyFilter}: produced terms, position increments and
 * keyword handling.
 */
public class AnalyzersTest extends BaseTokenStreamTestCase {

    /**
     * Analyzes the English sample text and compares the set of produced terms
     * with the set listed in the English answer resource.
     */
    @Test
    public void shouldGiveCorrectWordsForEnglish() throws IOException {
        try (Analyzer analyzer = new EnglishAnalyzer()) {
            assertAnalyzerOutput(
                    analyzer,
                    "/english/english-analyzer-answer.txt",
                    "/english/english-analyzer-data.txt");
        }
    }

    /**
     * Analyzes the Russian sample text and compares the set of produced terms
     * with the set listed in the Russian answer resource.
     */
    @Test
    public void shouldGiveCorrectWordsForRussian() throws IOException {
        try (Analyzer analyzer = new RussianAnalyzer()) {
            assertAnalyzerOutput(
                    analyzer,
                    "/russian/russian-analyzer-answer.txt",
                    "/russian/russian-analyzer-data.txt");
        }
    }

    /**
     * Pipes the output of a Russian {@link MorphologyAnalyzer} through a
     * {@link MorphologyFilter} backed by the English morphology and drains it.
     * There are no value assertions: the test only checks that the chain can be
     * consumed to the end without throwing.
     */
    @Test
    public void emptyStringTest() throws IOException {
        LuceneMorphology russianLuceneMorphology = new RussianLuceneMorphology();
        LuceneMorphology englishLuceneMorphology = new EnglishLuceneMorphology();

        // A StringReader hands the characters over directly, so the Cyrillic text
        // does not depend on the platform default charset.
        try (MorphologyAnalyzer russianAnalyzer = new MorphologyAnalyzer(russianLuceneMorphology);
             MorphologyFilter englishFilter = new MorphologyFilter(
                     russianAnalyzer.tokenStream(null, new StringReader("тест пм тест")),
                     englishLuceneMorphology)) {
            englishFilter.reset();
            while (englishFilter.incrementToken()) {
                System.out.println(englishFilter.toString());
            }
            englishFilter.end();
        }
    }

    /**
     * Checks that once one of the listed forms has been emitted, every later
     * token that is also one of the listed forms has a position increment of 0.
     */
    @Test
    public void shouldProvideCorrectIndentForWordWithMelitaForm() throws IOException {
        Set<String> formsOfWine = new HashSet<>();
        formsOfWine.add("вина");
        // NOTE(review): confirm the last letter of this literal is the Cyrillic one;
        // a Latin look-alike would make this entry unmatchable.
        formsOfWine.add("винo");

        try (Analyzer analyzer = new RussianAnalyzer();
             TokenStream tokenStream =
                     analyzer.tokenStream(null, new StringReader("принеси мне вина на новый год"))) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            PositionIncrementAttribute position =
                    tokenStream.getAttribute(PositionIncrementAttribute.class);

            tokenStream.reset();
            boolean wordSeen = false;
            while (tokenStream.incrementToken()) {
                if (formsOfWine.contains(charTerm.toString())) {
                    if (wordSeen) {
                        assertThat(position.getPositionIncrement(), equalTo(0));
                    }
                    wordSeen = true;
                }
            }
            tokenStream.end();
        }
    }

    /**
     * Runs {@code analyzer} over the UTF-8 text resource at {@code testPath} and
     * asserts that the set of emitted terms equals the set read from
     * {@code answerPath}.
     *
     * @param analyzer   analyzer under test; not closed by this method
     * @param answerPath classpath location of the expected-terms resource
     * @param testPath   classpath location of the text to analyze
     * @throws IOException if a resource cannot be read or the token stream fails
     */
    private void assertAnalyzerOutput(Analyzer analyzer, String answerPath, String testPath)
            throws IOException {
        Set<String> expected = readExpectedTerms(answerPath);
        Set<String> actual = new HashSet<>();

        try (InputStream data = openResource(testPath);
             Reader reader = new InputStreamReader(data, "UTF-8");
             TokenStream tokenStream = analyzer.tokenStream(null, reader)) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                actual.add(term.toString());
            }
            tokenStream.end();
        }

        assertThat(actual, equalTo(expected));
    }

    /**
     * Reads the first line of the UTF-8 resource at {@code answerPath} and
     * returns its space-separated words as a set. Runs of spaces count as a
     * single separator and leading/trailing whitespace is ignored.
     */
    private Set<String> readExpectedTerms(String answerPath) throws IOException {
        try (BufferedReader lines =
                     new BufferedReader(new InputStreamReader(openResource(answerPath), "UTF-8"))) {
            String firstLine = lines.readLine();
            assertNotNull("answer resource is empty: " + answerPath, firstLine);
            String[] words = firstLine.replaceAll(" +", " ").trim().split(" ");
            return new HashSet<>(Arrays.asList(words));
        }
    }

    /**
     * Opens a classpath resource relative to this test class, failing the test
     * with a descriptive message when it is missing.
     */
    private InputStream openResource(String path) {
        InputStream stream = getClass().getResourceAsStream(path);
        assertNotNull("test resource not found on classpath: " + path, stream);
        return stream;
    }

    /**
     * Verifies terms, offsets, token types and position increments produced by
     * {@link EnglishAnalyzer} for a short sentence.
     */
    @Test
    public void testPositionIncrement() throws IOException {
        try (EnglishAnalyzer englishAnalyzer = new EnglishAnalyzer()) {
            assertTokenStreamContents(
                    englishAnalyzer.tokenStream("test", "There are tests!"),
                    new String[]{"there", "are", "be", "test"},
                    new int[]{0, 6, 6, 10},
                    new int[]{5, 9, 9, 15},
                    new String[]{"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>"},
                    new int[]{1, 1, 0, 1}
            );
        }
    }

    /**
     * Verifies the terms produced by {@link EnglishKeywordTestAnalyzer}, whose
     * chain marks the exact token "Tests" as a keyword ahead of the morphology
     * filter.
     */
    @Test
    public void testKeywordHandling() throws IOException {
        try (Analyzer analyzer = new EnglishKeywordTestAnalyzer()) {
            assertTokenStreamContents(
                    analyzer.tokenStream("test", "Tests shouldn't be stemmed, but tests should!"),
                    new String[]{"tests", "shouldn't", "be", "stem", "but", "test", "shall"}
            );
        }
    }

    /**
     * Analyzer chain used by {@link #testKeywordHandling()}:
     * StandardTokenizer, then SetKeywordMarkerFilter for "Tests", then
     * LowerCaseFilter, then MorphologyFilter with the English morphology.
     */
    private static class EnglishKeywordTestAnalyzer extends Analyzer {
        @Override
        protected TokenStreamComponents createComponents(String s) {
            StandardTokenizer src = new StandardTokenizer();

            // The set is built with ignoreCase=false and the marker runs before
            // lower-casing, so only the capitalised spelling is flagged.
            CharArraySet dontStem = new CharArraySet(1, false);
            dontStem.add("Tests");

            TokenFilter filter = new SetKeywordMarkerFilter(src, dontStem);
            filter = new LowerCaseFilter(filter);
            try {
                filter = new MorphologyFilter(filter, new EnglishLuceneMorphology());
            } catch (IOException ex) {
                throw new RuntimeException("cannot create EnglishLuceneMorphology", ex);
            }
            return new TokenStreamComponents(src, filter);
        }
    }
}