Java Code Examples for org.apache.lucene.util.UnicodeUtil#newString()

The following examples show how to use org.apache.lucene.util.UnicodeUtil#newString(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.
Example 1
Source File: FuzzyAutomatonBuilder.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/**
 * Validates the fuzzy-matching parameters and precomputes the state derived from {@code term}.
 *
 * <p>The term is converted to an {@code int[]} by {@code stringToUTF32}; the first
 * {@code min(prefixLength, array length)} entries are turned back into a string and stored as
 * the prefix, and the remaining entries are handed to a {@link LevenshteinAutomata}.
 *
 * @param term the term to build automata for
 * @param maxEdits edit distance; must lie in
 *     {@code 0..LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE}, inclusive
 * @param prefixLength requested prefix length; must not be negative, and is capped at the
 *     length of the converted term
 * @param transpositions passed through unchanged to {@link LevenshteinAutomata}
 * @throws IllegalArgumentException if {@code maxEdits} is out of range or {@code prefixLength}
 *     is negative
 */
FuzzyAutomatonBuilder(String term, int maxEdits, int prefixLength, boolean transpositions) {
  final boolean editsInRange =
      maxEdits >= 0 && maxEdits <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
  if (!editsInRange) {
    throw new IllegalArgumentException("max edits must be 0.." + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got: " + maxEdits);
  }
  if (prefixLength < 0) {
    throw new IllegalArgumentException("prefixLength cannot be less than 0");
  }

  this.term = term;
  this.maxEdits = maxEdits;

  final int[] termCodePoints = stringToUTF32(term);
  final int totalLength = termCodePoints.length;
  this.termLength = totalLength;

  // A requested prefix longer than the term simply covers the whole term.
  final int cappedPrefixLength = Math.min(prefixLength, totalLength);

  // Everything after the prefix is what the Levenshtein automata operate on.
  final int[] remainder = new int[totalLength - cappedPrefixLength];
  System.arraycopy(termCodePoints, cappedPrefixLength, remainder, 0, remainder.length);

  this.levBuilder = new LevenshteinAutomata(remainder, Character.MAX_CODE_POINT, transpositions);
  this.prefix = UnicodeUtil.newString(termCodePoints, 0, cappedPrefixLength);
}
 
Example 2
Source File: FSTTester.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Renders {@code term} as a printable string.
 *
 * <p>When {@code isValidUnicode} is false, only {@code term.toString()} is returned. Otherwise
 * the term is decoded to text, followed by a space and the term's own string form. Decoding
 * goes through {@code toBytesRef(term).utf8ToString()} when {@code inputMode == 0}, and
 * through {@link UnicodeUtil#newString} on the term's ints for any other mode.
 *
 * @param inputMode {@code 0} selects the UTF-8 path; any other value selects the UTF-32 path
 * @param term the input to render
 * @param isValidUnicode whether {@code term} should be decoded to text
 * @return the printable form described above
 */
static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) {
  if (!isValidUnicode) {
    return term.toString();
  }

  final String decoded;
  if (inputMode == 0) {
    // utf8
    decoded = toBytesRef(term).utf8ToString();
  } else {
    // utf32
    decoded = UnicodeUtil.newString(term.ints, term.offset, term.length);
  }
  return decoded + " " + term;
}
 
Example 3
Source File: TestUTF32ToUTF8.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that running {@code automaton} over a string's chars and over the same string's
 * UTF-8 bytes gives the same accept/reject answer.
 *
 * <p>Runs at least 1000 rounds. Each round picks, by coin flip, either an arbitrary random
 * unicode string or a string generated by {@code RandomAcceptedStrings} for the automaton.
 *
 * @param automaton the automaton under test
 * @throws Exception rethrown if a generated code point sequence cannot be turned into a
 *     string (the offending code points are printed first)
 */
private void assertAutomaton(Automaton automaton) throws Exception {
  final CharacterRunAutomaton charRunner = new CharacterRunAutomaton(automaton);
  final ByteRunAutomaton byteRunner = new ByteRunAutomaton(automaton);
  final AutomatonTestUtil.RandomAcceptedStrings acceptedStrings =
      new AutomatonTestUtil.RandomAcceptedStrings(automaton);

  final int rounds = atLeast(1000);
  for (int round = 0; round < rounds; round++) {
    final String candidate;
    if (!random().nextBoolean()) {
      // will be accepted
      final int[] generated = acceptedStrings.getRandomAcceptedString(random());
      try {
        candidate = UnicodeUtil.newString(generated, 0, generated.length);
      } catch (Exception e) {
        // Dump the sequence that could not be converted before propagating the failure.
        System.out.println(generated.length + " codepoints:");
        for (int codePoint : generated) {
          System.out.println("  " + Integer.toHexString(codePoint));
        }
        throw e;
      }
    } else {
      // likely not accepted
      candidate = TestUtil.randomUnicodeString(random());
    }

    final byte[] utf8 = candidate.getBytes(StandardCharsets.UTF_8);
    final boolean acceptedAsChars = charRunner.run(candidate);
    final boolean acceptedAsBytes = byteRunner.run(utf8, 0, utf8.length);
    assertEquals(acceptedAsChars, acceptedAsBytes);
  }
}
 
Example 4
Source File: TestMappingCharFilter.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
/**
 * Feeds a single code point outside the Basic Multilingual Plane (U+1D122) through a
 * {@code MappingCharFilter} built on {@code normMap} and expects the single token
 * {@code "fclef"}.
 *
 * <p>U+1D122 is above U+FFFF, so it takes two UTF-16 chars in the input string.
 * NOTE(review): the {@code {0}} / {@code {2}} / {@code 2} arguments look like start offset,
 * end offset and final offset against that two-char input; confirm against
 * {@code assertTokenStreamContents}.
 */
public void testNonBMPChar() throws Exception {
  final int[] nonBmpCodePoint = {0x1D122};
  final String input = UnicodeUtil.newString(nonBmpCodePoint, 0, 1);

  final CharFilter filtered = new MappingCharFilter(normMap, new StringReader(input));
  final TokenStream tokens = whitespaceMockTokenizer(filtered);

  assertTokenStreamContents(tokens, new String[] {"fclef"}, new int[] {0}, new int[] {2}, 2);
}