org.apache.lucene.analysis.CharFilter Java Examples

The following examples show how to use org.apache.lucene.analysis.CharFilter. You can vote up the examples you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example #1
Source File: OpenKoreanTextNormalizerTest.java    From elasticsearch-analysis-openkoreantext with Apache License 2.0 6 votes vote down vote up
@Test
public void testNormalizerCharFilter() throws Exception {
    // The normalizer collapses trailing jamo and runs of repeated characters.
    String query = "한국어를 처리하는 예시입니닼ㅋ. 오픈코리안텍스틓ㅎㅎㅎㅎㅎㅎㅎ";
    String expected = "한국어를 처리하는 예시입니다ㅋ. 오픈코리안텍스트ㅎㅎㅎ";

    CharFilter inputReader = new OpenKoreanTextNormalizer(new StringReader(query));

    // Drain the filter in small chunks to exercise buffered reads.
    StringBuilder actual = new StringBuilder();
    char[] chunk = new char[10];
    for (int read = inputReader.read(chunk); read != -1; read = inputReader.read(chunk)) {
        actual.append(chunk, 0, read);
    }

    Assert.assertEquals(expected, actual.toString());
}
 
Example #2
Source File: TestRandomChains.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/** Builds a random chain of 0-2 char filters wrapped around {@code reader}. */
private CharFilterSpec newCharFilterChain(Random random, Reader reader) {
  CharFilterSpec spec = new CharFilterSpec();
  spec.reader = reader;
  StringBuilder description = new StringBuilder();
  int chainLength = random.nextInt(3);
  for (int i = 0; i < chainLength; i++) {
    // Keep picking random char-filter constructors until one instantiates.
    Reader wrapped = null;
    while (wrapped == null) {
      final Constructor<? extends CharFilter> ctor = charfilters.get(random.nextInt(charfilters.size()));
      final Object[] args = newCharFilterArgs(random, spec.reader, ctor.getParameterTypes());
      if (broken(ctor, args)) {
        continue; // this ctor/arg combination is known-buggy; pick another
      }
      wrapped = createComponent(ctor, args, description, false);
    }
    spec.reader = wrapped;
  }
  spec.toString = description.toString();
  return spec;
}
 
Example #3
Source File: TestICUNormalizer2CharFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testNormalization() throws IOException {
  String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
  Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
  String expectedOutput = normalizer.normalize(input);

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer);
  StringBuilder output = new StringBuilder();
  char[] buffer = new char[10];
  for (int read = reader.read(buffer); read != -1; read = reader.read(buffer)) {
    output.append(buffer, 0, read);
    // After each chunk, the produced prefix must equal the normalization of
    // the offset-corrected prefix of the raw input.
    assertEquals(output.toString(), normalizer.normalize(input.substring(0, reader.correctOffset(output.length()))));
  }

  assertEquals(expectedOutput, output.toString());
}
 
Example #4
Source File: TestICUNormalizer2CharFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testTokenStream() throws IOException {
  // '℃', '№', '㈱', '㌘', 'サ'+'<<', 'ソ'+'<<', '㌰'+'<<'
  String input = "℃ № ㈱ ㌘ ザ ゾ ㌰゙";

  CharFilter charFilter = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(charFilter);

  // Offsets refer to positions in the raw (un-normalized) input.
  String[] expectedTerms = {"°C", "No", "(株)", "グラム", "ザ", "ゾ", "ピゴ"};
  int[] expectedStarts = {0, 2, 4, 6, 8, 11, 14};
  int[] expectedEnds = {1, 3, 5, 7, 10, 13, 16};
  assertTokenStreamContents(tokenizer, expectedTerms, expectedStarts, expectedEnds, input.length());
}
 
Example #5
Source File: TestICUNormalizer2CharFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testTokenStream2() throws IOException {
  // '㌰', '<<'゙, '5', '℃', '№', '㈱', '㌘', 'サ', '<<', 'ソ', '<<'
  String input = "㌰゙5℃№㈱㌘ザゾ";

  CharFilter charFilter = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

  // Unigram tokenization: one token per normalized character.
  Tokenizer tokenizer = new NGramTokenizer(newAttributeFactory(), 1, 1);
  tokenizer.setReader(charFilter);

  String[] expectedTerms = {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"};
  int[] expectedStarts = {0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9};
  int[] expectedEnds = {1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11};
  assertTokenStreamContents(tokenizer, expectedTerms, expectedStarts, expectedEnds, input.length());
}
 
Example #6
Source File: TestICUNormalizer2CharFilter.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testMassiveLigature() throws IOException {
  // U+FDFA expands to a multi-word phrase under NFKC; all tokens map back to
  // the single original code point, so their corrected offsets collapse.
  String input = "\uFDFA";

  CharFilter charFilter = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(charFilter);

  String[] expectedTerms = {"صلى", "الله", "عليه", "وسلم"};
  int[] expectedStarts = {0, 0, 0, 0};
  int[] expectedEnds = {0, 0, 0, 1};
  assertTokenStreamContents(tokenizer, expectedTerms, expectedStarts, expectedEnds, input.length());
}
 
Example #7
Source File: TestSimplePatternTokenizer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testOffsetCorrection() throws Exception {
  final String INPUT = "G&uuml;nther G&uuml;nther is here";

  // Map the HTML entity "&uuml;" to "ü" before tokenizing, so the tokenizer
  // sees "Günther" but reported offsets must be corrected back into the
  // original (longer) input. The original snippet also built an unused
  // "mappingRules" list here; that dead code has been removed.
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("&uuml;", "ü");
  NormalizeCharMap normMap = builder.build();
  CharFilter charStream = new MappingCharFilter( normMap, new StringReader(INPUT));

  // create SimplePatternTokenizer
  Tokenizer stream = new SimplePatternTokenizer("Günther");
  stream.setReader(charStream);
  assertTokenStreamContents(stream,
      new String[] { "Günther", "Günther" },
      new int[] { 0, 13 },
      new int[] { 12, 25 },
      INPUT.length());
}
 
Example #8
Source File: TestSimplePatternSplitTokenizer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
public void testOffsetCorrection() throws Exception {
  final String INPUT = "G&uuml;nther G&uuml;nther is here";

  // Map the HTML entity "&uuml;" to "ü" before tokenizing; split-token
  // offsets must be corrected back into the original (longer) input. The
  // original snippet also built an unused "mappingRules" list here; that
  // dead code has been removed.
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("&uuml;", "ü");
  NormalizeCharMap normMap = builder.build();
  CharFilter charStream = new MappingCharFilter( normMap, new StringReader(INPUT));

  // create SimplePatternSplitTokenizer
  Tokenizer stream = new SimplePatternSplitTokenizer("Günther");
  stream.setReader(charStream);
  assertTokenStreamContents(stream,
      new String[] { " ", " is here" },
      new int[] { 12, 25 },
      new int[] { 13, 33 },
      INPUT.length());
}
 
Example #9
Source File: TestJapaneseIterationMarkCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKanjiOnly() throws IOException {
  // Kanji-only mode: 々 after a kanji is expanded, kana marks stay as-is.
  String source = "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。";
  String expanded = "時時、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。";
  CharFilter filter = new JapaneseIterationMarkCharFilter(
      new StringReader(source),
      true,   // kanji
      false); // no kana
  assertCharFilterEquals(filter, expanded);
}
 
Example #10
Source File: TestMappingCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testChained() throws Exception {
  // Two stacked MappingCharFilters: the second filter re-maps the output of
  // the first, so offsets must be corrected through both layers.
  String testString = "aaaa ll h";
  CharFilter inner = new MappingCharFilter(normMap, new StringReader(testString));
  CharFilter outer = new MappingCharFilter(normMap, inner);
  TokenStream ts = whitespaceMockTokenizer(outer);
  String[] expectedTerms = {"a", "llllllll", "i"};
  int[] expectedStarts = {0, 5, 8};
  int[] expectedEnds = {4, 7, 9};
  assertTokenStreamContents(ts, expectedTerms, expectedStarts, expectedEnds, testString.length());
}
 
Example #11
Source File: TestMappingCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testTokenStream() throws Exception {
  // Mixed shrinking and growing mappings; offsets refer to the raw input.
  String testString = "h i j k ll cccc bbb aa";
  CharFilter filtered = new MappingCharFilter(normMap, new StringReader(testString));
  TokenStream ts = whitespaceMockTokenizer(filtered);
  String[] expectedTerms = {"i", "i", "jj", "kkk", "llll", "cc", "b", "a"};
  int[] expectedStarts = {0, 2, 4, 6, 8, 11, 16, 20};
  int[] expectedEnds = {1, 3, 5, 7, 10, 15, 19, 22};
  assertTokenStreamContents(ts, expectedTerms, expectedStarts, expectedEnds, testString.length());
}
 
Example #12
Source File: TestMappingCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testReaderReset() throws Exception {
  // After reset() the filter must replay the stream from the beginning.
  CharFilter cs = new MappingCharFilter( normMap, new StringReader( "x" ) );
  char[] buffer = new char[10];

  assertEquals(1, cs.read(buffer, 0, 10));
  assertEquals('x', buffer[0]);
  assertEquals(-1, cs.read(buffer, 0, 10)); // exhausted

  cs.reset(); // rewind

  assertEquals(1, cs.read(buffer, 0, 10));
  assertEquals('x', buffer[0]);
}
 
Example #13
Source File: TestPatternReplaceCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testChain() throws IOException {
  // Three stacked replacements: a->aa (grow), bb->b (shrink), ccc->c (shrink).
  final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb";
  CharFilter filtered = new PatternReplaceCharFilter(pattern("a"), "aa", new StringReader(BLOCK));
  filtered = new PatternReplaceCharFilter(pattern("bb"), "b", filtered);
  filtered = new PatternReplaceCharFilter(pattern("ccc"), "c", filtered);
  TokenStream ts = whitespaceMockTokenizer(filtered);
  String[] expectedTerms = { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" };
  int[] expectedStarts = { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 };
  int[] expectedEnds = { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 };
  assertTokenStreamContents(ts, expectedTerms, expectedStarts, expectedEnds, BLOCK.length());
}
 
Example #14
Source File: TestPatternReplaceCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void test2blocksMultiMatches() throws IOException {
  // Group-referencing replacement applied to multiple matches in one input.
  final String BLOCK = "  aa bb cc --- aa bb aa. bb aa   bb cc";
  CharFilter filtered = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)"), "$1##$2",
      new StringReader(BLOCK));
  TokenStream ts = whitespaceMockTokenizer(filtered);
  String[] expectedTerms = { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" };
  int[] expectedStarts = { 2, 8, 11, 15, 21, 25, 28, 36 };
  int[] expectedEnds = { 7, 10, 14, 20, 24, 27, 35, 38 };
  assertTokenStreamContents(ts, expectedTerms, expectedStarts, expectedEnds, BLOCK.length());
}
 
Example #15
Source File: TestPatternReplaceCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void test1blockMultiMatches() throws IOException {
  // Replacement normalizes whitespace between groups; multiple matches.
  final String BLOCK = "  aa bb cc --- aa bb aa   bb   cc";
  CharFilter filtered = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "$1  $2  $3",
      new StringReader(BLOCK));
  TokenStream ts = whitespaceMockTokenizer(filtered);
  String[] expectedTerms = { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" };
  int[] expectedStarts = { 2, 6, 9, 11, 15, 18, 21, 25, 29 };
  int[] expectedEnds = { 4, 8, 10, 14, 17, 20, 23, 27, 33 };
  assertTokenStreamContents(ts, expectedTerms, expectedStarts, expectedEnds, BLOCK.length());
}
 
Example #16
Source File: TestPatternReplaceCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void test1block1matchShorter() throws IOException {
  // Replacement shorter than the match: offsets of the shrunken token still
  // span the whole original matched region.
  final String BLOCK = "aa  bb   cc dd";
  CharFilter filtered = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2",
      new StringReader(BLOCK));
  TokenStream ts = whitespaceMockTokenizer(filtered);
  String[] expectedTerms = { "aa#bb", "dd" };
  int[] expectedStarts = { 0, 12 };
  int[] expectedEnds = { 11, 14 };
  assertTokenStreamContents(ts, expectedTerms, expectedStarts, expectedEnds, BLOCK.length());
}
 
Example #17
Source File: TestPatternReplaceCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void test1block1matchLonger() throws IOException {
  // Replacement longer than the match: offsets still map into the original.
  final String BLOCK = "aa bb cc dd";
  CharFilter filtered = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3",
      new StringReader(BLOCK));
  TokenStream ts = whitespaceMockTokenizer(filtered);
  String[] expectedTerms = { "aa##bb###cc", "dd" };
  int[] expectedStarts = { 0, 9 };
  int[] expectedEnds = { 8, 11 };
  assertTokenStreamContents(ts, expectedTerms, expectedStarts, expectedEnds, BLOCK.length());
}
 
Example #18
Source File: TestPatternReplaceCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void test1block1matchSameLength() throws IOException {
  // Replacement exactly as long as the match: offsets are unchanged.
  final String BLOCK = "aa bb cc";
  CharFilter filtered = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3",
      new StringReader(BLOCK));
  TokenStream ts = whitespaceMockTokenizer(filtered);
  String[] expectedTerms = { "aa#bb#cc" };
  int[] expectedStarts = { 0 };
  int[] expectedEnds = { 8 };
  assertTokenStreamContents(ts, expectedTerms, expectedStarts, expectedEnds, BLOCK.length());
}
 
Example #19
Source File: TestPatternReplaceCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testReplaceByEmpty() throws IOException {
  // Replacing the entire input with "" must yield zero tokens.
  final String BLOCK = "aa bb cc";
  CharFilter filtered = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "",
      new StringReader(BLOCK));
  TokenStream ts = whitespaceMockTokenizer(filtered);
  assertTokenStreamContents(ts, new String[] {});
}
 
Example #20
Source File: TestPatternReplaceCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testNothingChange() throws IOException {
  // Pattern never matches: text and offsets pass through untouched.
  final String BLOCK = "this is test.";
  CharFilter filtered = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3",
      new StringReader(BLOCK));
  TokenStream ts = whitespaceMockTokenizer(filtered);
  String[] expectedTerms = { "this", "is", "test." };
  int[] expectedStarts = { 0, 5, 8 };
  int[] expectedEnds = { 4, 7, 13 };
  assertTokenStreamContents(ts, expectedTerms, expectedStarts, expectedEnds, BLOCK.length());
}
 
Example #21
Source File: TestPatternReplaceCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Runs a PatternReplaceCharFilter over {@code input} and verifies both the
 * produced text and the offset correction: for each output index,
 * correctOffset must map back to the input character it came from, or a
 * negative value where nothing corresponds (rendered as "-").
 */
private void checkOutput(String input, String pattern, String replacement,
    String expectedOutput, String expectedIndexMatchedOutput) throws IOException {
  CharFilter cs = new PatternReplaceCharFilter(pattern(pattern), replacement,
      new StringReader(input));

  StringBuilder output = new StringBuilder();
  // Use >= 0: read() returns -1 only at EOF, so a NUL character (0) in the
  // stream is legitimate data and must not terminate the loop early.
  for (int chr = cs.read(); chr >= 0; chr = cs.read()) {
    output.append((char) chr);
  }

  StringBuilder indexMatched = new StringBuilder();
  for (int i = 0; i < output.length(); i++) {
    indexMatched.append((cs.correctOffset(i) < 0 ? "-" : input.charAt(cs.correctOffset(i))));
  }

  boolean outputGood = expectedOutput.equals(output.toString());
  boolean indexMatchedGood = expectedIndexMatchedOutput.equals(indexMatched.toString());

  // Print a diagnostic only on mismatch. (The original condition carried a
  // dead "|| false" debug toggle; it has been removed.)
  if (!outputGood || !indexMatchedGood) {
    System.out.println("Pattern : " + pattern);
    System.out.println("Replac. : " + replacement);
    System.out.println("Input   : " + input);
    System.out.println("Output  : " + output);
    System.out.println("Expected: " + expectedOutput);
    System.out.println("Output/i: " + indexMatched);
    System.out.println("Expected: " + expectedIndexMatchedOutput);
    System.out.println();
  }

  assertTrue("Output doesn't match.", outputGood);
  assertTrue("Index-matched output doesn't match.", indexMatchedGood);
}
 
Example #22
Source File: TestPatternReplaceCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void test1block2matchLonger() throws IOException {
  // Two separate matches, each replaced by a longer string.
  final String BLOCK = " a  a";
  CharFilter filtered = new PatternReplaceCharFilter(pattern("a"), "aa",
      new StringReader(BLOCK));
  TokenStream ts = whitespaceMockTokenizer(filtered);
  String[] expectedTerms = { "aa", "aa" };
  int[] expectedStarts = { 1, 4 };
  int[] expectedEnds = { 2, 5 };
  assertTokenStreamContents(ts, expectedTerms, expectedStarts, expectedEnds, BLOCK.length());
}
 
Example #23
Source File: TestJapaneseIterationMarkCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKanaOnly() throws IOException {
  // Kana-only mode: kana iteration marks (ゝ/ゞ) are expanded, 々 stays as-is.
  String source = "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。";
  String expanded = "時々、おおのさんと一緒にお寿司が食べたいです。abcところどころ。";
  CharFilter filter = new JapaneseIterationMarkCharFilter(
      new StringReader(source),
      false,  // no kanji
      true);  // kana
  assertCharFilterEquals(filter, expanded);
}
 
Example #24
Source File: TestJapaneseIterationMarkCharFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testNone() throws IOException {
  // Both modes disabled: the text must pass through completely unchanged.
  String source = "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。";
  CharFilter filter = new JapaneseIterationMarkCharFilter(
      new StringReader(source),
      false,   // no kanji
      false);  // no kana
  assertCharFilterEquals(filter, "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
}
 
Example #25
Source File: TestBugInSomething.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
public void testWrapping() throws Exception {
  // Every Reader method on the wrapper must throw; each exception message
  // names the method that was (incorrectly) invoked.
  CharFilter wrapper = new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(wrappedStream);

  Exception thrown = expectThrows(Exception.class, () -> wrapper.mark(1));
  assertEquals("mark(int)", thrown.getMessage());

  thrown = expectThrows(Exception.class, () -> wrapper.markSupported());
  assertEquals("markSupported()", thrown.getMessage());

  thrown = expectThrows(Exception.class, () -> wrapper.read());
  assertEquals("read()", thrown.getMessage());

  thrown = expectThrows(Exception.class, () -> wrapper.read(new char[0]));
  assertEquals("read(char[])", thrown.getMessage());

  thrown = expectThrows(Exception.class, () -> wrapper.read(CharBuffer.wrap(new char[0])));
  assertEquals("read(CharBuffer)", thrown.getMessage());

  thrown = expectThrows(Exception.class, () -> wrapper.reset());
  assertEquals("reset()", thrown.getMessage());

  thrown = expectThrows(Exception.class, () -> wrapper.skip(1));
  assertEquals("skip(long)", thrown.getMessage());

  thrown = expectThrows(Exception.class, () -> wrapper.correctOffset(1));
  assertEquals("correct(int)", thrown.getMessage());

  thrown = expectThrows(Exception.class, () -> wrapper.close());
  assertEquals("close()", thrown.getMessage());

  thrown = expectThrows(Exception.class, () -> wrapper.read(new char[0], 0, 0));
  assertEquals("read(char[], int, int)", thrown.getMessage());
}
 
Example #26
Source File: TestRandomChains.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
@BeforeClass
public static void beforeClass() throws Exception {
  // Scan the analysis package and bucket every usable concrete component
  // constructor by kind (Tokenizer / TokenFilter / CharFilter); the random
  // chain builders draw from these lists.
  List<Class<?>> analysisClasses = getClassesForPackage("org.apache.lucene.analysis");
  tokenizers = new ArrayList<>();
  tokenfilters = new ArrayList<>();
  charfilters = new ArrayList<>();
  for (final Class<?> c : analysisClasses) {
    final int modifiers = c.getModifiers();
    // Skip anything that cannot be a testable, instantiable analysis
    // component: abstract/non-public, compiler-generated, nested/anonymous,
    // interfaces, @Deprecated, or classes outside the three component kinds.
    if (
      // don't waste time with abstract classes or deprecated known-buggy ones
      Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
      || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
      || c.isAnnotationPresent(Deprecated.class)
      || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharFilter.class.isAssignableFrom(c))
    ) {
      continue;
    }
    
    for (final Constructor<?> ctor : c.getConstructors()) {
      // don't test synthetic or deprecated ctors, they likely have known bugs:
      if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class) || brokenConstructors.get(ctor) == ALWAYS) {
        continue;
      }
      // conditional filters are tested elsewhere
      if (ConditionalTokenFilter.class.isAssignableFrom(c)) {
        continue;
      }
      // Each kept constructor must only take parameter types the random-args
      // factory knows how to fabricate; fail loudly otherwise so new
      // components get wired in explicitly.
      if (Tokenizer.class.isAssignableFrom(c)) {
        assertTrue(ctor.toGenericString() + " has unsupported parameter types",
          allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
        tokenizers.add(castConstructor(Tokenizer.class, ctor));
      } else if (TokenFilter.class.isAssignableFrom(c)) {
        assertTrue(ctor.toGenericString() + " has unsupported parameter types",
          allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
        tokenfilters.add(castConstructor(TokenFilter.class, ctor));
      } else if (CharFilter.class.isAssignableFrom(c)) {
        assertTrue(ctor.toGenericString() + " has unsupported parameter types",
          allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes())));
        charfilters.add(castConstructor(CharFilter.class, ctor));
      } else {
        fail("Cannot get here");
      }
    }
  }
  
  // Sort deterministically so a given random seed always picks the same
  // constructors regardless of classpath scan order.
  final Comparator<Constructor<?>> ctorComp = (arg0, arg1) -> arg0.toGenericString().compareTo(arg1.toGenericString());
  Collections.sort(tokenizers, ctorComp);
  Collections.sort(tokenfilters, ctorComp);
  Collections.sort(charfilters, ctorComp);
  if (VERBOSE) {
    System.out.println("tokenizers = " + tokenizers);
    System.out.println("tokenfilters = " + tokenfilters);
    System.out.println("charfilters = " + charfilters);
  }
}
 
Example #27
Source File: TestJapaneseIterationMarkCharFilter.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/** Drains {@code filter} completely and compares the result to {@code expected}. */
private void assertCharFilterEquals(CharFilter filter, String expected) throws IOException {
  assertEquals(expected, readFully(filter));
}
 
Example #28
Source File: TestMappingCharFilter.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
public void testFullWidthChar() throws Exception {
  // A single full-width '！' (U+FF01) maps to a multi-character replacement,
  // but its offsets still span just the one original character.
  CharFilter filtered = new MappingCharFilter(normMap, new StringReader("\uff01"));
  TokenStream ts = whitespaceMockTokenizer(filtered);
  assertTokenStreamContents(ts, new String[]{"full-width-exclamation"}, new int[]{0}, new int[]{1}, 1);
}
 
Example #29
Source File: TestMappingCharFilter.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
public void testNonBMPChar() throws Exception {
  // A supplementary code point (U+1D122, musical F clef) occupies two UTF-16
  // units, so the mapped token's end offset is 2.
  String surrogatePair = UnicodeUtil.newString(new int[] {0x1D122}, 0, 1);
  CharFilter filtered = new MappingCharFilter(normMap, new StringReader(surrogatePair));
  TokenStream ts = whitespaceMockTokenizer(filtered);
  assertTokenStreamContents(ts, new String[]{"fclef"}, new int[]{0}, new int[]{2}, 2);
}
 
Example #30
Source File: TestMappingCharFilter.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
public void test5to0() throws Exception {
  // "empty" maps to nothing: zero tokens, but the final offset still covers
  // all 5 original characters.
  CharFilter filtered = new MappingCharFilter(normMap, new StringReader("empty"));
  TokenStream ts = whitespaceMockTokenizer(filtered);
  assertTokenStreamContents(ts, new String[0], new int[]{}, new int[]{}, 5);
}