Java Code Examples for org.apache.lucene.analysis.CharFilter

The following examples show how to use org.apache.lucene.analysis.CharFilter. They are extracted from open source projects; the originating project, source file, and license are noted above each example where available.
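All of the examples share the same basic pattern: a CharFilter wraps a Reader, rewrites the character stream before it reaches a Tokenizer, and exposes correctOffset() so that token offsets can be mapped back to the original, unfiltered text. The short sketch below illustrates that pattern with MappingCharFilter; the class name CharFilterUsageSketch, the sample input, and the buffer size are illustrative only and are not taken from any of the projects below.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;

public class CharFilterUsageSketch {
  public static void main(String[] args) throws IOException {
    // Replace the HTML entity "&uuml;" with the character it stands for.
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("&uuml;", "ü");

    String input = "G&uuml;nther";
    CharFilter filter = new MappingCharFilter(builder.build(), new StringReader(input));

    // A CharFilter is just a Reader, so it can be drained like any other Reader.
    StringBuilder output = new StringBuilder();
    char[] buffer = new char[64];
    for (int read = filter.read(buffer); read != -1; read = filter.read(buffer)) {
      output.append(buffer, 0, read);
    }
    System.out.println(output);                                 // Günther

    // correctOffset() maps an offset in the filtered text back to the raw input;
    // this is how the tokenizer-based examples below get their start/end offsets.
    System.out.println(filter.correctOffset(output.length()));  // 12, the matching end offset in the raw input (cf. Example 6)
  }
}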
Example 1
@Test
public void testNormalizerCharFilter() throws Exception {
    String query = "한국어를 처리하는 예시입니닼ㅋ. 오픈코리안텍스틓ㅎㅎㅎㅎㅎㅎㅎ";
    String expected = "한국어를 처리하는 예시입니다ㅋ. 오픈코리안텍스트ㅎㅎㅎ";

    CharFilter inputReader = new OpenKoreanTextNormalizer(new StringReader(query));

    char[] tempBuff = new char[10];
    StringBuilder actual = new StringBuilder();

    while (true) {
        int length = inputReader.read(tempBuff);
        if (length == -1) break;
        actual.append(tempBuff, 0, length);
    }

    Assert.assertEquals(expected, actual.toString());
}
 
Example 2
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0
public void testNormalization() throws IOException {
  String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
  Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
  String expectedOutput = normalizer.normalize(input);

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer);
  char[] tempBuff = new char[10];
  StringBuilder output = new StringBuilder();
  while (true) {
    int length = reader.read(tempBuff);
    if (length == -1) {
      break;
    }
    output.append(tempBuff, 0, length);
    assertEquals(output.toString(), normalizer.normalize(input.substring(0, reader.correctOffset(output.length()))));
  }

  assertEquals(expectedOutput, output.toString());
}
 
Example 3
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0
public void testTokenStream() throws IOException {
  // '℃', '№', '㈱', '㌘', 'サ'+'<<', 'ソ'+'<<', '㌰'+'<<'
  String input = "℃ № ㈱ ㌘ ザ ゾ ㌰゙";

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenStream.setReader(reader);

  assertTokenStreamContents(tokenStream,
    new String[] {"°C", "No", "(株)", "グラム", "ザ", "ゾ", "ピゴ"},
    new int[] {0, 2, 4, 6, 8, 11, 14},
    new int[] {1, 3, 5, 7, 10, 13, 16},
    input.length());
}
 
Example 4
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0
public void testTokenStream2() throws IOException {
  // '㌰', '<<'゙, '5', '℃', '№', '㈱', '㌘', 'サ', '<<', 'ソ', '<<'
  String input = "㌰゙5℃№㈱㌘ザゾ";

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenStream = new NGramTokenizer(newAttributeFactory(), 1, 1);
  tokenStream.setReader(reader);

  assertTokenStreamContents(tokenStream,
    new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
    new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
    new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
    input.length()
  );
}
 
Example 5
Source Project: lucene-solr   Source File: TestICUNormalizer2CharFilter.java    License: Apache License 2.0
public void testMassiveLigature() throws IOException {
  String input = "\uFDFA";

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenStream.setReader(reader);

  assertTokenStreamContents(tokenStream,
    new String[] {"صلى", "الله", "عليه", "وسلم"},
    new int[]{0, 0, 0, 0},
    new int[]{0, 0, 0, 1},
    input.length()
  );
}
 
Example 6
Source Project: lucene-solr   Source File: TestSimplePatternTokenizer.java    License: Apache License 2.0
public void testOffsetCorrection() throws Exception {
  final String INPUT = "G&uuml;nther G&uuml;nther is here";

  // create MappingCharFilter
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("&uuml;", "ü");
  NormalizeCharMap normMap = builder.build();
  CharFilter charStream = new MappingCharFilter( normMap, new StringReader(INPUT));

  // create SimplePatternTokenizer
  Tokenizer stream = new SimplePatternTokenizer("Günther");
  stream.setReader(charStream);
  assertTokenStreamContents(stream,
      new String[] { "Günther", "Günther" },
      new int[] { 0, 13 },
      new int[] { 12, 25 },
      INPUT.length());
}
 
Example 7
public void testOffsetCorrection() throws Exception {
  final String INPUT = "G&uuml;nther G&uuml;nther is here";

  // create MappingCharFilter
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("&uuml;", "ü");
  NormalizeCharMap normMap = builder.build();
  CharFilter charStream = new MappingCharFilter( normMap, new StringReader(INPUT));

  // create SimplePatternSplitTokenizer
  Tokenizer stream = new SimplePatternSplitTokenizer("Günther");
  stream.setReader(charStream);
  assertTokenStreamContents(stream,
      new String[] { " ", " is here" },
      new int[] { 12, 25 },
      new int[] { 13, 33 },
      INPUT.length());
}
 
Example 8
Source Project: lucene-solr   Source File: TestRandomChains.java    License: Apache License 2.0
private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
  // CharFilterSpec, charfilters, newCharFilterArgs, broken and createComponent are
  // helpers defined elsewhere in TestRandomChains; this method randomly chains up to
  // two CharFilters in front of the given reader.
  CharFilterSpec spec = new CharFilterSpec();
  spec.reader = reader;
  StringBuilder descr = new StringBuilder();
  int numFilters = random.nextInt(3);
  for (int i = 0; i < numFilters; i++) {
    while (true) {
      final Constructor<? extends CharFilter> ctor = charfilters.get(random.nextInt(charfilters.size()));
      final Object args[] = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
      if (broken(ctor, args)) {
        continue;
      }
      reader = createComponent(ctor, args, descr, false);
      if (reader != null) {
        spec.reader = reader;
        break;
      }
    }
  }
  spec.toString = descr.toString();
  return spec;
}
 
Example 9
public void testKanjiOnly() throws IOException {
  // Test kanji only repetition marks
  CharFilter filter = new JapaneseIterationMarkCharFilter(
      new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
      true, // kanji
      false // no kana
  );
  assertCharFilterEquals(filter, "時時、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
}
 
Example 10
public void testKanaOnly() throws IOException {
  // Test kana only repetition marks
  CharFilter filter = new JapaneseIterationMarkCharFilter(
      new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
      false, // no kanji
      true   // kana
  );
  assertCharFilterEquals(filter, "時々、おおのさんと一緒にお寿司が食べたいです。abcところどころ。");
}
 
Example 11
public void testNone() throws IOException {
  // Test no repetition marks
  CharFilter filter = new JapaneseIterationMarkCharFilter(
      new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
      false, // no kanji
      false  // no kana
  );
  assertCharFilterEquals(filter, "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
}
 
Example 12
Source Project: lucene-solr   Source File: TestPatternReplaceCharFilter.java    License: Apache License 2.0
private void checkOutput(String input, String pattern, String replacement,
    String expectedOutput, String expectedIndexMatchedOutput) throws IOException {
  CharFilter cs = new PatternReplaceCharFilter(pattern(pattern), replacement,
      new StringReader(input));

  StringBuilder output = new StringBuilder();
  for (int chr = cs.read(); chr > 0; chr = cs.read()) {
    output.append((char) chr);
  }

  StringBuilder indexMatched = new StringBuilder();
  for (int i = 0; i < output.length(); i++) {
    indexMatched.append((cs.correctOffset(i) < 0 ? "-" : input.charAt(cs.correctOffset(i))));
  }

  boolean outputGood = expectedOutput.equals(output.toString());
  boolean indexMatchedGood = expectedIndexMatchedOutput.equals(indexMatched.toString());

  if (!outputGood || !indexMatchedGood) {
    System.out.println("Pattern : " + pattern);
    System.out.println("Replac. : " + replacement);
    System.out.println("Input   : " + input);
    System.out.println("Output  : " + output);
    System.out.println("Expected: " + expectedOutput);
    System.out.println("Output/i: " + indexMatched);
    System.out.println("Expected: " + expectedIndexMatchedOutput);
    System.out.println();
  }

  assertTrue("Output doesn't match.", outputGood);
  assertTrue("Index-matched output doesn't match.", indexMatchedGood);
}
 
Example 13
Source Project: lucene-solr   Source File: TestPatternReplaceCharFilter.java    License: Apache License 2.0
public void testNothingChange() throws IOException {
  final String BLOCK = "this is test.";
  CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3",
        new StringReader( BLOCK ) );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts,
      new String[] { "this", "is", "test." },
      new int[] { 0, 5, 8 },
      new int[] { 4, 7, 13 }, 
      BLOCK.length());
}
 
Example 14
Source Project: lucene-solr   Source File: TestPatternReplaceCharFilter.java    License: Apache License 2.0
public void testReplaceByEmpty() throws IOException {
  final String BLOCK = "aa bb cc";
  CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "",
        new StringReader( BLOCK ) );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts, new String[] {});
}
 
Example 15
Source Project: lucene-solr   Source File: TestPatternReplaceCharFilter.java    License: Apache License 2.0
public void test1block1matchSameLength() throws IOException {
  final String BLOCK = "aa bb cc";
  CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3",
        new StringReader( BLOCK ) );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts,
      new String[] { "aa#bb#cc" },
      new int[] { 0 },
      new int[] { 8 }, 
      BLOCK.length());
}
 
Example 16
Source Project: lucene-solr   Source File: TestPatternReplaceCharFilter.java    License: Apache License 2.0
public void test1block1matchLonger() throws IOException {
  final String BLOCK = "aa bb cc dd";
  CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3",
        new StringReader( BLOCK ) );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts,
      new String[] { "aa##bb###cc", "dd" },
      new int[] { 0, 9 },
      new int[] { 8, 11 },
      BLOCK.length());
}
 
Example 17
Source Project: lucene-solr   Source File: TestPatternReplaceCharFilter.java    License: Apache License 2.0
public void test1block2matchLonger() throws IOException {
  final String BLOCK = " a  a";
  CharFilter cs = new PatternReplaceCharFilter( pattern("a"), "aa",
        new StringReader( BLOCK ) );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts,
      new String[] { "aa", "aa" },
      new int[] { 1, 4 },
      new int[] { 2, 5 },
      BLOCK.length());
}
 
Example 18
Source Project: lucene-solr   Source File: TestPatternReplaceCharFilter.java    License: Apache License 2.0
public void test1block1matchShorter() throws IOException {
  final String BLOCK = "aa  bb   cc dd";
  CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2",
        new StringReader( BLOCK ) );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts,
      new String[] { "aa#bb", "dd" },
      new int[] { 0, 12 },
      new int[] { 11, 14 },
      BLOCK.length());
}
 
Example 19
Source Project: lucene-solr   Source File: TestPatternReplaceCharFilter.java    License: Apache License 2.0
public void test1blockMultiMatches() throws IOException {
  final String BLOCK = "  aa bb cc --- aa bb aa   bb   cc";
  CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1  $2  $3",
        new StringReader( BLOCK ) );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts,
      new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" },
      new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 },
      new int[] { 4, 8, 10, 14, 17, 20, 23, 27, 33 },
      BLOCK.length());
}
 
Example 20
Source Project: lucene-solr   Source File: TestPatternReplaceCharFilter.java    License: Apache License 2.0
public void test2blocksMultiMatches() throws IOException {
  final String BLOCK = "  aa bb cc --- aa bb aa. bb aa   bb cc";

  CharFilter cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2",
        new StringReader( BLOCK ) );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts,
      new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" },
      new int[] { 2, 8, 11, 15, 21, 25, 28, 36 },
      new int[] { 7, 10, 14, 20, 24, 27, 35, 38 },
      BLOCK.length());
}
 
Example 21
Source Project: lucene-solr   Source File: TestPatternReplaceCharFilter.java    License: Apache License 2.0
public void testChain() throws IOException {
  final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb";
  CharFilter cs = new PatternReplaceCharFilter( pattern("a"), "aa",
      new StringReader( BLOCK ) );
  cs = new PatternReplaceCharFilter( pattern("bb"), "b", cs );
  cs = new PatternReplaceCharFilter( pattern("ccc"), "c", cs );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts,
      new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
      new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },
      new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 },
      BLOCK.length());
}
 
Example 22
Source Project: lucene-solr   Source File: TestMappingCharFilter.java    License: Apache License 2.0
public void testReaderReset() throws Exception {
  CharFilter cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
  char[] buf = new char[10];
  int len = cs.read(buf, 0, 10);
  assertEquals( 1, len );
  assertEquals( 'x', buf[0] );
  len = cs.read(buf, 0, 10);
  assertEquals( -1, len );

  // rewind
  cs.reset();
  len = cs.read(buf, 0, 10);
  assertEquals( 1, len );
  assertEquals( 'x', buf[0] );
}
 
Example 23
Source Project: lucene-solr   Source File: TestMappingCharFilter.java    License: Apache License 2.0
public void testTokenStream() throws Exception {
  String testString = "h i j k ll cccc bbb aa";
  CharFilter cs = new MappingCharFilter( normMap, new StringReader( testString ) );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts,
    new String[]{"i","i","jj","kkk","llll","cc","b","a"},
    new int[]{0,2,4,6,8,11,16,20},
    new int[]{1,3,5,7,10,15,19,22},
    testString.length()
  );
}
 
Example 24
Source Project: lucene-solr   Source File: TestMappingCharFilter.java    License: Apache License 2.0
public void testChained() throws Exception {
  String testString = "aaaa ll h";
  CharFilter cs = new MappingCharFilter( normMap,
      new MappingCharFilter( normMap, new StringReader( testString ) ) );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts,
    new String[]{"a","llllllll","i"},
    new int[]{0,5,8},
    new int[]{4,7,9},
    testString.length()
  );
}
 
Example 25
private void assertCharFilterEquals(CharFilter filter, String expected) throws IOException {
  String actual = readFully(filter);
  assertEquals(expected, actual);
}
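
readFully is a private helper defined elsewhere in the test class and is not part of this extract; the sketch below is an assumption about what such a helper does, not the project's actual code.

private String readFully(java.io.Reader reader) throws IOException {
  // Drain the reader (here, the CharFilter) into a String.
  StringBuilder sb = new StringBuilder();
  char[] buffer = new char[1024];
  for (int read = reader.read(buffer); read != -1; read = reader.read(buffer)) {
    sb.append(buffer, 0, read);
  }
  return sb.toString();
}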
 
Example 26
Source Project: lucene-solr   Source File: TestMappingCharFilter.java    License: Apache License 2.0
public void testNothingChange() throws Exception {
  CharFilter cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts, new String[]{"x"}, new int[]{0}, new int[]{1}, 1);
}
 
Example 27
Source Project: lucene-solr   Source File: TestMappingCharFilter.java    License: Apache License 2.0
public void test1to1() throws Exception {
  CharFilter cs = new MappingCharFilter( normMap, new StringReader( "h" ) );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts, new String[]{"i"}, new int[]{0}, new int[]{1}, 1);
}
 
Example 28
Source Project: lucene-solr   Source File: TestMappingCharFilter.java    License: Apache License 2.0
public void test1to2() throws Exception {
  CharFilter cs = new MappingCharFilter( normMap, new StringReader( "j" ) );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts, new String[]{"jj"}, new int[]{0}, new int[]{1}, 1);
}
 
Example 29
Source Project: lucene-solr   Source File: TestMappingCharFilter.java    License: Apache License 2.0
public void test1to3() throws Exception {
  CharFilter cs = new MappingCharFilter( normMap, new StringReader( "k" ) );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts, new String[]{"kkk"}, new int[]{0}, new int[]{1}, 1);
}
 
Example 30
Source Project: lucene-solr   Source File: TestMappingCharFilter.java    License: Apache License 2.0
public void test2to4() throws Exception {
  CharFilter cs = new MappingCharFilter( normMap, new StringReader( "ll" ) );
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts, new String[]{"llll"}, new int[]{0}, new int[]{2}, 2);
}