Java Code Examples for org.apache.lucene.util.UnicodeUtil

The following examples show how to use org.apache.lucene.util.UnicodeUtil. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: Elasticsearch   Source File: LowerFunction.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Lower-cases the string value carried by the first argument.
 *
 * @param args function inputs; only {@code args[0]} is read
 * @return the lower-cased value re-encoded as UTF-8, or {@code null} when the input value is null
 */
@Override
public BytesRef evaluate(Input<Object>... args) {
    final Object raw = args[0].value();
    if (raw == null) {
        return null;
    }

    final BytesRef utf8In = BytesRefs.toBytesRef(raw);

    // Decode into a char buffer sized by the UTF-8 byte count.
    final char[] utf16 = new char[utf8In.length];
    final int charCount = UnicodeUtil.UTF8toUTF16(utf8In.bytes, utf8In.offset, utf8In.length, utf16);
    charUtils.toLowerCase(utf16, 0, charCount);

    // Re-encode; the output buffer is sized for the worst case per char.
    final byte[] utf8Out = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * charCount];
    final int byteCount = UnicodeUtil.UTF16toUTF8(utf16, 0, charCount, utf8Out);
    return new BytesRef(utf8Out, 0, byteCount);
}
 
Example 2
Source Project: Elasticsearch   Source File: UpperFunction.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Upper-cases the string value carried by the first argument.
 *
 * @param args function inputs; only {@code args[0]} is read
 * @return the upper-cased value re-encoded as UTF-8, or {@code null} when the input value is null
 */
@Override
public BytesRef evaluate(Input<Object>... args) {
    final Object raw = args[0].value();
    if (raw == null) {
        return null;
    }

    final BytesRef utf8In = BytesRefs.toBytesRef(raw);

    // Decode into a char buffer sized by the UTF-8 byte count.
    final char[] utf16 = new char[utf8In.length];
    final int charCount = UnicodeUtil.UTF8toUTF16(utf8In.bytes, utf8In.offset, utf8In.length, utf16);
    charUtils.toUpperCase(utf16, 0, charCount);

    // Re-encode; the output buffer is sized for the worst case per char.
    final byte[] utf8Out = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * charCount];
    final int byteCount = UnicodeUtil.UTF16toUTF8(utf16, 0, charCount, utf8Out);
    return new BytesRef(utf8Out, 0, byteCount);
}
 
Example 3
Source Project: lucene-solr   Source File: TestJapaneseTokenizer.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Randomized check that every token produced by the analyzer is a valid UTF-16 string,
 * i.e. that supplementary characters are never split.
 */
public void testSurrogates2() throws IOException {
  final int iterations = atLeast(500);
  for (int iter = 0; iter < iterations; iter++) {
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + iter);
    }
    final String text = TestUtil.randomUnicodeString(random(), 100);
    try (TokenStream stream = analyzer.tokenStream("foo", text)) {
      final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(term));
      }
      stream.end();
    }
  }
}
 
Example 4
Source Project: lucene-solr   Source File: TestMappingCharFilter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds the character-normalization map ({@code normMap}) used by the tests in this class.
 */
@Override
public void setUp() throws Exception {
  super.setUp();
  final NormalizeCharMap.Builder mappings = new NormalizeCharMap.Builder();

  // replacements shorter than the match
  mappings.add("aa", "a");
  mappings.add("bbb", "b");
  mappings.add("cccc", "cc");

  // replacements of equal or greater length than the match
  mappings.add("h", "i");
  mappings.add("j", "jj");
  mappings.add("k", "kkk");
  mappings.add("ll", "llll");

  // replacement by the empty string
  mappings.add("empty", "");

  // supplementary (non-BMP) code point U+1D122, built from its code point value
  final String supplementary = UnicodeUtil.newString(new int[] {0x1D122}, 0, 1);
  mappings.add(supplementary, "fclef");

  mappings.add("\uff01", "full-width-exclamation");

  normMap = mappings.build();
}
 
Example 5
Source Project: lucene-solr   Source File: FuzzyAutomatonBuilder.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Prepares a Levenshtein automaton builder for {@code term}.
 *
 * <p>The term is converted to code points; the first {@code prefixLength} code points (clamped to
 * the term length) are kept as a literal prefix and only the remaining suffix is handed to
 * {@link LevenshteinAutomata}.
 *
 * @param term           the term to match fuzzily
 * @param maxEdits       maximum edit distance, 0..{@code LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE}
 * @param prefixLength   number of leading code points that must match exactly; must not be negative
 * @param transpositions passed through to {@link LevenshteinAutomata}
 * @throws IllegalArgumentException if {@code maxEdits} or {@code prefixLength} is out of range
 */
FuzzyAutomatonBuilder(String term, int maxEdits, int prefixLength, boolean transpositions) {
  final boolean editsInRange = maxEdits >= 0 && maxEdits <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
  if (!editsInRange) {
    throw new IllegalArgumentException("max edits must be 0.." + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got: " + maxEdits);
  }
  if (prefixLength < 0) {
    throw new IllegalArgumentException("prefixLength cannot be less than 0");
  }

  this.term = term;
  this.maxEdits = maxEdits;

  final int[] termCodePoints = stringToUTF32(term);
  this.termLength = termCodePoints.length;

  // The exact-match prefix can never be longer than the term itself.
  final int effectivePrefix = Math.min(prefixLength, termCodePoints.length);
  final int suffixLength = termCodePoints.length - effectivePrefix;
  final int[] suffix = new int[suffixLength];
  System.arraycopy(termCodePoints, effectivePrefix, suffix, 0, suffixLength);

  this.levBuilder = new LevenshteinAutomata(suffix, Character.MAX_CODE_POINT, transpositions);
  this.prefix = UnicodeUtil.newString(termCodePoints, 0, effectivePrefix);
}
 
Example 6
Source Project: lucene-solr   Source File: DaciukMihovAutomatonBuilder.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds an automaton from a collection of UTF-8 encoded {@link BytesRef} strings.
 * The input must be binary-sorted; the resulting automaton is minimal and deterministic.
 *
 * @param input the sorted UTF-8 terms
 * @return the finished automaton
 */
public static Automaton build(Collection<BytesRef> input) {
  final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();

  // One reusable decode buffer and one reusable CharsRef view over it.
  final CharsRef scratch = new CharsRef();
  char[] buffer = new char[0];
  for (BytesRef utf8Term : input) {
    buffer = ArrayUtil.grow(buffer, utf8Term.length);
    final int decodedLength = UnicodeUtil.UTF8toUTF16(utf8Term, buffer);
    scratch.chars = buffer;
    scratch.length = decodedLength;
    builder.add(scratch);
  }

  final Automaton.Builder automaton = new Automaton.Builder();
  convert(automaton, builder.complete(), new IdentityHashMap<State,Integer>());
  return automaton.finish();
}
 
Example 7
Source Project: lucene-solr   Source File: ByteBuffersDataOutput.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Writes {@code v} as a VInt byte length followed by its UTF-8 bytes.
 *
 * <p>Strings of at most 1024 chars are encoded in one step through a {@link BytesRef}. Longer
 * strings write their pre-computed UTF-8 length and are then encoded through a fixed-size
 * window buffer, with each encoded chunk handed to {@code writeBytes} by the callback.
 *
 * @throws UncheckedIOException wrapping any {@link IOException} raised while writing
 */
@Override
public void writeString(String v) {
  final int MAX_CHARS_PER_WINDOW = 1024;
  try {
    if (v.length() > MAX_CHARS_PER_WINDOW) {
      writeVInt(UnicodeUtil.calcUTF16toUTF8Length(v, 0, v.length()));
      final byte[] window = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * MAX_CHARS_PER_WINDOW];
      UTF16toUTF8(v, 0, v.length(), window, (len) -> writeBytes(window, 0, len));
    } else {
      final BytesRef encoded = new BytesRef(v);
      writeVInt(encoded.length);
      writeBytes(encoded.bytes, encoded.offset, encoded.length);
    }
  } catch (IOException e) {
    throw new UncheckedIOException(e);
  }
}
 
Example 8
Source Project: lucene-solr   Source File: FacetFieldProcessorByArrayUIF.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Resolves the ordinal window {@code [startTermIndex, endTermIndex)} to facet over and stores
 * its size in {@code nTerms}.
 *
 * <p>Without a prefix (or without a terms enum) the window covers all terms of the un-inverted
 * field. With a prefix, the start is the ord of the first term at or after the prefix, and the
 * end is the ord of the first term at or after the prefix extended with
 * {@code UnicodeUtil.BIG_TERM}; either falls back to the term count when the seek hits END.
 */
@Override
protected void findStartAndEndOrds() throws IOException {
  uif = UnInvertedField.getUnInvertedField(freq.field, fcontext.searcher);
  te = uif.getOrdTermsEnum( fcontext.searcher.getSlowAtomicReader() );    // "te" can be null

  startTermIndex = 0;
  endTermIndex = uif.numTerms();  // one past the end

  if (prefixRef != null && te != null) {
    final boolean startPastEnd = te.seekCeil(prefixRef.get()) == TermsEnum.SeekStatus.END;
    startTermIndex = startPastEnd ? uif.numTerms() : (int) te.ord();

    // Extending the prefix in place turns it into the upper bound of the prefix range.
    prefixRef.append(UnicodeUtil.BIG_TERM);
    final boolean endPastEnd = te.seekCeil(prefixRef.get()) == TermsEnum.SeekStatus.END;
    endTermIndex = endPastEnd ? uif.numTerms() : (int) te.ord();
  }

  nTerms = endTermIndex - startTermIndex;
}
 
Example 9
/**
 * Encodes a string as shifted UTF-8 bytes, wrapping Lucene's
 * {@code UnicodeUtil.UTF16toUTF8} serialization.
 *
 * <p>A null value yields {@code Encoding.EMPTY_BYTE_ARRAY}; an empty string yields the single
 * marker byte {@code 0x01}. Otherwise every UTF-8 byte is increased by 2. When {@code desc} is
 * set, every output byte (including the empty-string marker) is XOR-ed with {@code 0xff}.
 *
 * @param value the string to encode, may be null
 * @param desc  whether to bit-invert the output bytes
 * @return a freshly allocated encoded array (or the shared empty array for null input)
 */
public static byte[] toBytes(String value, boolean desc){
    if(value==null) return Encoding.EMPTY_BYTE_ARRAY;
    if(value.isEmpty()){
        return new byte[]{ desc ? (byte)(0x01^0xff) : (byte)0x01 };
    }

    //convert to UTF-8 encoding
    BytesRef utf8 = new BytesRef();
    UnicodeUtil.UTF16toUTF8(value, 0, value.length(), utf8);

    // XOR with 0xff flips every bit of the byte; XOR with 0 leaves it unchanged.
    final int flip = desc ? 0xff : 0x00;
    final byte[] encoded = new byte[utf8.length];
    for(int pos=0;pos<encoded.length;pos++){
        // NOTE(review): the +2 shift presumably keeps 0x00/0x01 free as markers -- confirm.
        encoded[pos] = (byte)((utf8.bytes[utf8.offset+pos] + 2) ^ flip);
    }
    return encoded;
}
 
Example 10
Source Project: lucene-solr   Source File: LabelledCharArrayMatcher.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Adapts a byte-based {@link ByteRunAutomaton} so that it can be run directly against a
 * {@code char[]} slice, encoding UTF-16 to UTF-8 on the fly.
 *
 * @param label        label passed through to the delegate {@code wrap} overload
 * @param runAutomaton the byte automaton to step
 * @return a matcher that accepts a char slice iff the automaton accepts its UTF-8 encoding
 */
static LabelledCharArrayMatcher wrap(String label, ByteRunAutomaton runAutomaton) {
    return wrap(label, (chars, offset, length) -> {
      final int end = offset + length;
      int state = 0;
      for (int pos = offset; pos < end; pos++) {
        final int unit = chars[pos];

        // One-byte UTF-8 form (inlined from UnicodeUtil.UTF16toUTF8).
        if (unit < 0x80) {
          state = runAutomaton.step(state, unit);
          if (state == -1) return false;
          continue;
        }

        // Two-byte UTF-8 form.
        if (unit < 0x800) {
          state = runAutomaton.step(state, 0xC0 | (unit >> 6));
          if (state == -1) return false;
          state = runAutomaton.step(state, 0x80 | (unit & 0x3F));
          if (state == -1) return false;
          continue;
        }

        // Anything else: encode the whole remainder of the slice in one call, feed it to the
        // automaton, and stop scanning since nothing is left afterwards.
        final int remaining = end - pos;
        final byte[] encoded = new byte[4 * remaining];
        final int encodedLength = UnicodeUtil.UTF16toUTF8(chars, pos, remaining, encoded);
        for (int k = 0; k < encodedLength; k++) {
          state = runAutomaton.step(state, encoded[k] & 0xFF);
          if (state == -1) return false;
        }
        break;
      }
      return runAutomaton.isAccept(state);
    });
}
 
Example 11
Source Project: lucene-solr   Source File: UTF8TaxonomyWriterCache.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Serializes a facet label into the reusable per-thread byte builder: the UTF-8 encoding of
 * each component, with {@code DELIM_CHAR} between consecutive components.
 *
 * @param label the label to encode
 * @return a view of the builder's current content
 */
private BytesRef toBytes(FacetLabel label) {
  final BytesRefBuilder scratch = this.bytes.get();
  scratch.clear();
  for (int idx = 0; idx < label.length; idx++) {
    final String component = label.components[idx];
    if (idx > 0) {
      scratch.append(DELIM_CHAR);
    }
    // Reserve worst-case room, then encode in place right after the existing content.
    final int worstCase = UnicodeUtil.maxUTF8Length(component.length());
    scratch.grow(scratch.length() + worstCase);
    final int newLength =
        UnicodeUtil.UTF16toUTF8(component, 0, component.length(), scratch.bytes(), scratch.length());
    scratch.setLength(newLength);
  }
  return scratch.get();
}
 
Example 12
Source Project: lucene-solr   Source File: TestExtendedMode.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Randomized check that every token produced by the analyzer is a valid UTF-16 string,
 * i.e. that supplementary characters are never split.
 */
public void testSurrogates2() throws IOException {
  final int iterations = atLeast(500);
  for (int iter = 0; iter < iterations; iter++) {
    final String text = TestUtil.randomUnicodeString(random(), 100);
    try (TokenStream stream = analyzer.tokenStream("foo", text)) {
      final CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        assertTrue(UnicodeUtil.validUTF16String(term));
      }
      stream.end();
    }
  }
}
 
Example 13
Source Project: lucene-solr   Source File: AutomatonTestUtil.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns a random regular expression string that is valid UTF-16 and that {@link RegExp}
 * can parse with {@code RegExp.NONE}. Candidates failing either check are discarded and a
 * new one is drawn.
 */
public static String randomRegexp(Random r) {
  for (;;) {
    final String candidate = randomRegexpString(r);
    // the generator may also emit invalid UTF-16; only well-formed strings are tried
    if (UnicodeUtil.validUTF16String(candidate)) {
      try {
        new RegExp(candidate, RegExp.NONE);
        return candidate;
      } catch (Exception ignored) {
        // candidate could not be parsed: fall through and draw another one
      }
    }
  }
}
 
Example 14
Source Project: lucene-solr   Source File: FSTTester.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Renders an FST input term for display.
 *
 * @param inputMode      0 to interpret the ints as UTF-8 bytes, anything else as UTF-32 code points
 * @param term           the term to render
 * @param isValidUnicode when false, only {@code term.toString()} is returned
 * @return the decoded text followed by a space and the raw term, or just the raw term
 */
static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) {
  if (!isValidUnicode) {
    return term.toString();
  }
  final String decoded;
  if (inputMode == 0) {
    // utf8
    decoded = toBytesRef(term).utf8ToString();
  } else {
    // utf32
    decoded = UnicodeUtil.newString(term.ints, term.offset, term.length);
  }
  return decoded + " " + term;
}
 
Example 15
Source Project: lucene-solr   Source File: TestIndexWriterUnicode.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Round-trips every Unicode code point (surrogates excluded) through Lucene's UTF-8 encoding
 * and checks the result against the JDK's own UTF-8 encoder/decoder.
 *
 * <p>The loop bound is inclusive of {@link Character#MAX_CODE_POINT} (U+10FFFF); an exclusive
 * bound would leave the last valid code point untested.
 */
public void testAllUnicodeChars() throws Throwable {

  final CharsRefBuilder utf16 = new CharsRefBuilder();
  final char[] chars = new char[2];
  for (int ch = 0; ch <= Character.MAX_CODE_POINT; ch++) {

    if (ch == 0xd800) {
      // Skip the surrogate range U+D800..U+DFFF: these are not valid code points on their own
      ch = 0xe000;
    }

    // Build the UTF-16 form by hand: one char for the BMP, a surrogate pair above it.
    int len = 0;
    if (ch <= 0xffff) {
      chars[len++] = (char) ch;
    } else {
      chars[len++] = (char) (((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
      chars[len++] = (char) (((ch - 0x0010000) & 0x3FF) + UnicodeUtil.UNI_SUR_LOW_START);
    }

    BytesRef utf8 = new BytesRef(CharBuffer.wrap(chars, 0, len));

    // Lucene-encoded bytes must decode (via the JDK) back to the original string.
    String s1 = new String(chars, 0, len);
    String s2 = new String(utf8.bytes, 0, utf8.length, StandardCharsets.UTF_8);
    assertEquals("codepoint " + ch, s1, s2);

    // Lucene's own decoder must agree as well.
    utf16.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
    assertEquals("codepoint " + ch, s1, utf16.toString());

    // And the bytes must be identical to what the JDK encoder produces.
    byte[] b = s1.getBytes(StandardCharsets.UTF_8);
    assertEquals(utf8.length, b.length);
    for (int j = 0; j < utf8.length; j++) {
      assertEquals(utf8.bytes[j], b[j]);
    }
  }
}
 
Example 16
Source Project: lucene-solr   Source File: TestUTF32ToUTF8.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Checks that the char-based and byte-based run automata built from {@code automaton} agree
 * on a mix of random unicode strings and strings generated to be accepted.
 */
private void assertAutomaton(Automaton automaton) throws Exception {
  final CharacterRunAutomaton charRunner = new CharacterRunAutomaton(automaton);
  final ByteRunAutomaton byteRunner = new ByteRunAutomaton(automaton);
  final AutomatonTestUtil.RandomAcceptedStrings accepted = new AutomatonTestUtil.RandomAcceptedStrings(automaton);

  final int iterations = atLeast(1000);
  for (int iter = 0; iter < iterations; iter++) {
    final String candidate;
    if (random().nextBoolean()) {
      // likely not accepted
      candidate = TestUtil.randomUnicodeString(random());
    } else {
      // will be accepted
      final int[] codepoints = accepted.getRandomAcceptedString(random());
      try {
        candidate = UnicodeUtil.newString(codepoints, 0, codepoints.length);
      } catch (Exception e) {
        // Dump the offending code points before propagating the failure.
        System.out.println(codepoints.length + " codepoints:");
        for (int codepoint : codepoints) {
          System.out.println("  " + Integer.toHexString(codepoint));
        }
        throw e;
      }
    }
    final byte[] utf8 = candidate.getBytes(StandardCharsets.UTF_8);
    assertEquals(charRunner.run(candidate), byteRunner.run(utf8, 0, utf8.length));
  }
}
 
Example 17
Source Project: lucene-solr   Source File: PHPSerializedResponseWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes a string in PHP serialized form: {@code s:<byteCount>:"<value>";}.
 *
 * <p>The value itself is written unescaped; the reported size is its UTF-8 byte count, which
 * is obtained by encoding into the reusable {@code utf8} scratch buffer.
 */
@Override
public void writeStr(String name, String val, boolean needsEscaping) throws IOException {
  // Grow the scratch buffer to the worst-case encoded size, then encode just to count bytes.
  final int worstCase = val.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;
  utf8 = ArrayUtil.grow(utf8, worstCase);
  final int byteCount = UnicodeUtil.UTF16toUTF8(val, 0, val.length(), utf8);

  writer.write("s:");
  writer.write(String.valueOf(byteCount));
  writer.write(":\"");
  writer.write(val);
  writer.write("\";");
}
 
Example 18
Source Project: lucene-solr   Source File: IndexSizeEstimator.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Process a string field.
 *
 * <p>The reported size is the UTF-8 length of the full value; the value itself is trimmed to
 * at most {@code maxLength} chars before being counted. A {@code null} value is counted with
 * size 0 instead of failing with a {@link NullPointerException}.
 *
 * @param fieldInfo the field being visited; only its name is used
 * @param value     the field value, may be null
 */
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
  int len = 0;
  if (value != null) {
    // size is measured before trimming
    len = UnicodeUtil.calcUTF16toUTF8Length(value, 0, value.length());
    // trim the value if needed
    if (value.length() > maxLength) {
      value = value.substring(0, maxLength);
    }
  }
  // NOTE(review): a null value is forwarded unchanged -- confirm countItem accepts null.
  countItem(fieldInfo.name, value, len);
}
 
Example 19
/**
 * Encodes a string as shifted UTF-8 bytes directly into {@code buffer}, starting at
 * {@code offset}. Every UTF-8 byte is increased by 2 and, when {@code desc} is set, XOR-ed
 * with {@code 0xff}. Null and empty strings write nothing.
 *
 * @param value  the string to encode, may be null
 * @param desc   whether to bit-invert the output bytes
 * @param buffer destination array; no capacity check is performed here
 * @param offset index in {@code buffer} of the first byte to write
 * @return 0 for null/empty input, otherwise {@code value.length()}
 */
public static int toBytes(String value, boolean desc, byte[] buffer, int offset){
    if(value==null || value.isEmpty()) return 0;

    //convert to UTF-8 encoding
    BytesRef utf8 = new BytesRef();
    UnicodeUtil.UTF16toUTF8(value, 0, value.length(), utf8);

    // XOR with 0xff flips every bit of the byte; XOR with 0 leaves it unchanged.
    final int flip = desc ? 0xff : 0x00;
    for(int pos=0;pos<utf8.length;pos++){
        buffer[offset+pos] = (byte)((utf8.bytes[utf8.offset+pos] + 2) ^ flip);
    }
    // NOTE(review): this is the char count, while utf8.length bytes were written; the two
    // differ for non-ASCII input -- confirm which one callers expect.
    return value.length();
}
 
Example 20
Source Project: crate   Source File: PartitionName.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads a nullable UTF-8 string kept for backwards compatibility.
 *
 * <p>The stream holds a VInt equal to the byte length plus one, so a stored 0 means
 * {@code null} and a stored 1 means the empty string.
 */
private static String readValueFrom(StreamInput in) throws IOException {
    final int byteLength = in.readVInt() - 1;
    switch (byteLength) {
        case -1:
            return null;
        case 0:
            return "";
        default:
            break;
    }
    final byte[] utf8 = new byte[byteLength];
    in.readBytes(utf8, 0, byteLength);
    final char[] utf16 = new char[byteLength];
    final int charCount = UnicodeUtil.UTF8toUTF16(utf8, 0, byteLength, utf16);
    return new String(utf16, 0, charCount);
}
 
Example 21
Source Project: crate   Source File: LineContext.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Decodes the raw UTF-8 source bytes into a string.
 *
 * @return the decoded source, or {@code null} when no raw source is set
 */
@Nullable
String sourceAsString() {
    if (rawSource == null) {
        return null;
    }
    final char[] utf16 = new char[rawSource.length];
    final int charCount = UnicodeUtil.UTF8toUTF16(rawSource, 0, rawSource.length, utf16);
    return new String(utf16, 0, charCount);
}
 
Example 22
Source Project: Elasticsearch   Source File: RegexMatcher.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Decodes {@code bytes} into {@code charsRef}, replacing its backing array with a new one of
 * {@code bytes.length} chars when the current array is shorter than that.
 */
private static void UTF8toUTF16(BytesRef bytes, CharsRef charsRef) {
    final boolean tooSmall = charsRef.chars.length < bytes.length;
    if (tooSmall) {
        charsRef.chars = new char[bytes.length];
    }
    final int decodedLength = UnicodeUtil.UTF8toUTF16(bytes, charsRef.chars);
    charsRef.length = decodedLength;
}
 
Example 23
Source Project: lucene-solr   Source File: TermGroupFacetCollector.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Switches the collector to a new index segment.
 *
 * <p>Flushes the counts of the previous segment (if any) into {@code segmentResults}, loads the
 * group and facet doc values of the new segment, re-maps the already collected
 * {@code groupedFacetHits} to this segment's ordinals, and finally computes the facet ordinal
 * window {@code [startFacetOrd, endFacetOrd)} implied by {@code facetPrefix}.
 *
 * @param context the leaf reader context of the segment about to be collected
 */
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
  // A non-null counts array means a previous segment was collected: keep its result.
  if (segmentFacetCounts != null) {
    segmentResults.add(createSegmentResult());
  }

  groupFieldTermsIndex = DocValues.getSorted(context.reader(), groupField);
  facetFieldDocTermOrds = DocValues.getSortedSet(context.reader(), facetField);
  facetFieldNumTerms = (int) facetFieldDocTermOrds.getValueCount();
  // No terms enum is created when the segment has no facet terms at all.
  if (facetFieldNumTerms == 0) {
    facetOrdTermsEnum = null;
  } else {
    facetOrdTermsEnum = facetFieldDocTermOrds.termsEnum();
  }
  // [facetFieldNumTerms() + 1] for all possible facet values and docs not containing facet field
  segmentFacetCounts = new int[facetFieldNumTerms + 1];
  segmentTotalCount = 0;

  // Rebuild the per-segment set of already seen (group, facet) combinations.
  segmentGroupedFacetHits.clear();
  for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
    // A null group value is represented by ord -1.
    int groupOrd = groupedFacetHit.groupValue == null ? -1 : groupFieldTermsIndex.lookupTerm(groupedFacetHit.groupValue);
    // Non-null group value for which lookupTerm returned a negative ord: skip the hit.
    if (groupedFacetHit.groupValue != null && groupOrd < 0) {
      continue;
    }

    int facetOrd;
    if (groupedFacetHit.facetValue != null) {
      // Skip hits whose facet value cannot be found exactly in this segment.
      if (facetOrdTermsEnum == null || !facetOrdTermsEnum.seekExact(groupedFacetHit.facetValue)) {
        continue;
      }
      facetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      // A null facet value uses the extra slot after the last real facet ord.
      facetOrd = facetFieldNumTerms;
    }

    // (facetFieldDocTermOrds.numTerms() + 1) for all possible facet values and docs not containing facet field
    int segmentGroupedFacetsIndex = groupOrd * (facetFieldNumTerms + 1) + facetOrd;
    segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
  }

  if (facetPrefix != null) {
    TermsEnum.SeekStatus seekStatus;
    if (facetOrdTermsEnum != null) {
      seekStatus = facetOrdTermsEnum.seekCeil(facetPrefix);
    } else {
      // Without a terms enum there is nothing to seek: treat it like running off the end.
      seekStatus = TermsEnum.SeekStatus.END;
    }

    if (seekStatus != TermsEnum.SeekStatus.END) {
      startFacetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      // No term at or after the prefix: empty ordinal window.
      startFacetOrd = 0;
      endFacetOrd = 0;
      return;
    }

    // Upper bound of the window: seek to the prefix extended with UnicodeUtil.BIG_TERM.
    BytesRefBuilder facetEndPrefix = new BytesRefBuilder();
    facetEndPrefix.append(facetPrefix);
    facetEndPrefix.append(UnicodeUtil.BIG_TERM);
    seekStatus = facetOrdTermsEnum.seekCeil(facetEndPrefix.get());
    if (seekStatus != TermsEnum.SeekStatus.END) {
      endFacetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      endFacetOrd = facetFieldNumTerms; // Don't include null...
    }
  } else {
    // No prefix: every facet ord plus the extra slot for a missing facet value.
    startFacetOrd = 0;
    endFacetOrd = facetFieldNumTerms + 1;
  }
}
 
Example 24
Source Project: lucene-solr   Source File: TokenInfoDictionaryTest.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Enumerates the entire FST/lookup data and just does basic sanity checks: every string
 * attribute returned by the dictionary must be valid UTF-16, required attributes must be
 * non-null, and source ids and word ids must be strictly increasing in enumeration order.
 */
public void testEnumerateAll() throws Exception {
  // just for debugging
  int numTerms = 0;
  int numWords = 0;
  // running maxima used for the "strictly increasing" assertions below
  int lastWordId = -1;
  int lastSourceId = -1;
  TokenInfoDictionary tid = TokenInfoDictionary.getInstance();
  ConnectionCosts matrix = ConnectionCosts.getInstance();
  FST<Long> fst = tid.getFST().getInternalFST();
  IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
  IntsRefFSTEnum.InputOutput<Long> mapping;
  IntsRef scratch = new IntsRef();
  while ((mapping = fstEnum.next()) != null) {
    numTerms++;
    // rebuild the surface form: each FST input int is narrowed to one char
    IntsRef input = mapping.input;
    char[] chars = new char[input.length];
    for (int i = 0; i < chars.length; i++) {
      chars[i] = (char)input.ints[input.offset+i];
    }
    assertTrue(UnicodeUtil.validUTF16String(new String(chars)));

    // the FST output is used as the source id
    Long output = mapping.output;
    int sourceId = output.intValue();
    // we walk in order, terms, sourceIds, and wordIds should always be increasing
    assertTrue(sourceId > lastSourceId);
    lastSourceId = sourceId;
    // fills scratch with the word ids looked up for this source id
    tid.lookupWordIds(sourceId, scratch);
    for (int i = 0; i < scratch.length; i++) {
      numWords++;
      int wordId = scratch.ints[scratch.offset+i];
      assertTrue(wordId > lastWordId);
      lastWordId = wordId;

      // optional attributes: may be null, but must be valid UTF-16 when present
      String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
      assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));

      String inflectionForm = tid.getInflectionForm(wordId);
      assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm));
      if (inflectionForm != null) {
        // check that it's actually an ipadic inflection form
        assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));
      }

      String inflectionType = tid.getInflectionType(wordId);
      assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType));
      if (inflectionType != null) {
        // check that it's actually an ipadic inflection type
        assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType));
      }

      int leftId = tid.getLeftId(wordId);
      int rightId = tid.getRightId(wordId);

      // results are discarded: these calls only need to complete without throwing
      matrix.get(rightId, leftId);

      tid.getWordCost(wordId);

      // required attributes: must be non-null and valid UTF-16
      String pos = tid.getPartOfSpeech(wordId);
      assertNotNull(pos);
      assertTrue(UnicodeUtil.validUTF16String(pos));
      // check that it's actually an ipadic pos tag
      assertNotNull(ToStringUtil.getPOSTranslation(pos));

      String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
      assertNotNull(pronunciation);
      assertTrue(UnicodeUtil.validUTF16String(pronunciation));

      String reading = tid.getReading(wordId, chars, 0, chars.length);
      assertNotNull(reading);
      assertTrue(UnicodeUtil.validUTF16String(reading));
    }
  }
  if (VERBOSE) {
    System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
  }
}
 
Example 25
Source Project: lucene-solr   Source File: TestMappingCharFilter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Feeds the single supplementary code point U+1D122 through the mapping filter and expects
 * the one token "fclef" with offsets 0..2 and final offset 2.
 */
public void testNonBMPChar() throws Exception {
  final String supplementary = UnicodeUtil.newString(new int[] {0x1D122}, 0, 1);
  final CharFilter filtered = new MappingCharFilter(normMap, new StringReader(supplementary));
  final TokenStream tokens = whitespaceMockTokenizer(filtered);
  assertTokenStreamContents(tokens, new String[]{"fclef"}, new int[]{0}, new int[]{2}, 2);
}
 
Example 26
Source Project: lucene-solr   Source File: FuzzyTermsEnum.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Advances to the next matching term and sets its boost from the exact edit distance.
 *
 * <p>A bottom change detected on the previous call is applied first. After fetching the term,
 * the smallest edit distance that still matches is determined; distance 0 gets boost 1.0,
 * otherwise the boost is {@code 1 - ed / min(termCodePoints, termLength)}. If the competitive
 * bottom changed, a copy of the current term is queued so that the change is only applied on
 * the following call.
 *
 * @return the next term, or {@code null} when the underlying enum is exhausted
 */
@Override
public BytesRef next() throws IOException {

  if (queuedBottom != null) {
    bottomChanged(queuedBottom);
    queuedBottom = null;
  }

  final BytesRef term = actualEnum.next();
  if (term == null) {
    // end
    return null;
  }

  // we know the outer DFA always matches.
  // now compute exact edit distance: lower it while the next smaller distance still matches
  int ed = maxEdits;
  while (ed > 0 && matches(term, ed - 1)) {
    ed--;
  }

  if (ed != 0) {
    final int codePointCount = UnicodeUtil.codePointCount(term);
    final int minTermLength = Math.min(codePointCount, termLength);
    final float similarity = 1.0f - (float) ed / (float) minTermLength;
    boostAtt.setBoost(similarity);
  } else {
    // exact match
    boostAtt.setBoost(1.0F);
  }

  final float currentBottom = maxBoostAtt.getMaxNonCompetitiveBoost();
  final BytesRef currentBottomTerm = maxBoostAtt.getCompetitiveTerm();
  final boolean unchanged = currentBottom == this.bottom && currentBottomTerm == this.bottomTerm;
  if (!unchanged) {
    this.bottom = currentBottom;
    this.bottomTerm = currentBottomTerm;
    // clone the term before potentially doing something with it
    // this is a rare but wonderful occurrence anyway

    // We must delay bottomChanged until the next next() call otherwise we mess up docFreq(), etc., for the current term:
    queuedBottom = BytesRef.deepCopyOf(term);
  }

  return term;
}
 
Example 27
Source Project: lucene-solr   Source File: TestIndexWriterUnicode.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Indexes random one-char terms (below or above the surrogate range) and random surrogate-pair
 * terms, committing periodically, then hands every leaf reader, the top-level reader, and the
 * reader obtained after {@code forceMerge(1)} to {@code checkTermsOrder}.
 */
public void testTermUTF16SortOrder() throws Throwable {
  Random rnd = random();
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(rnd, dir);
  Document d = new Document();
  // One document with one string field, reused for every term via setStringValue
  Field f = newStringField("f", "", Field.Store.NO);
  d.add(f);
  // scratch buffer: one char for a BMP term, two for a surrogate pair
  char[] chars = new char[2];
  final Set<String> allTerms = new HashSet<>();

  int num = atLeast(200);
  for (int i = 0; i < num; i++) {

    final String s;
    if (rnd.nextBoolean()) {
      // Single char
      if (rnd.nextBoolean()) {
        // Above surrogates
        chars[0] = (char) getInt(rnd, 1+UnicodeUtil.UNI_SUR_LOW_END, 0xffff);
      } else {
        // Below surrogates
        chars[0] = (char) getInt(rnd, 0, UnicodeUtil.UNI_SUR_HIGH_START-1);
      }
      s = new String(chars, 0, 1);
    } else {
      // Surrogate pair
      chars[0] = (char) getInt(rnd, UnicodeUtil.UNI_SUR_HIGH_START, UnicodeUtil.UNI_SUR_HIGH_END);
      // sanity check on the generated high surrogate
      assertTrue(((int) chars[0]) >= UnicodeUtil.UNI_SUR_HIGH_START && ((int) chars[0]) <= UnicodeUtil.UNI_SUR_HIGH_END);
      chars[1] = (char) getInt(rnd, UnicodeUtil.UNI_SUR_LOW_START, UnicodeUtil.UNI_SUR_LOW_END);
      s = new String(chars, 0, 2);
    }
    allTerms.add(s);
    f.setStringValue(s);

    writer.addDocument(d);

    // commit after every 42nd document
    if ((1+i) % 42 == 0) {
      writer.commit();
    }
  }

  IndexReader r = writer.getReader();

  // Test each sub-segment
  for (LeafReaderContext ctx : r.leaves()) {
    checkTermsOrder(ctx.reader(), allTerms, false);
  }
  // Test the top-level reader over all segments
  checkTermsOrder(r, allTerms, true);

  r.close();

  writer.forceMerge(1);

  // Test single segment
  r = writer.getReader();
  checkTermsOrder(r, allTerms, true);
  r.close();

  writer.close();
  dir.close();
}
 
Example 28
Source Project: lucene-solr   Source File: TestUTF32ToUTF8.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Tells whether the byte automaton accepts the UTF-8 encoding of a single code point.
 *
 * @param a    the automaton to run
 * @param code the code point to encode and test
 */
private boolean matches(ByteRunAutomaton a, int code) {
  final char[] utf16 = Character.toChars(code);
  final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(utf16.length)];
  final int byteCount = UnicodeUtil.UTF16toUTF8(utf16, 0, utf16.length, utf8);
  return a.run(utf8, 0, byteCount);
}
 
Example 29
Source Project: lucene-solr   Source File: TestUTF32ToUTF8.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Tells whether {@code code} lies in the inclusive range from
 * {@code UnicodeUtil.UNI_SUR_HIGH_START} to {@code UnicodeUtil.UNI_SUR_LOW_END}.
 */
private static boolean isSurrogate(int code) {
  final boolean belowRange = code < UnicodeUtil.UNI_SUR_HIGH_START;
  final boolean aboveRange = code > UnicodeUtil.UNI_SUR_LOW_END;
  return !(belowRange || aboveRange);
}
 
Example 30
Source Project: incubator-retired-blur   Source File: SerializerUtil.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Encodes {@code s} as UTF-8 and writes it to {@code out} through {@code writeBytesRef}.
 *
 * @param s   the string to write; must not be null since its length is read
 * @param out the destination
 */
public static void writeString(String s, DataOutput out) throws IOException {
  final BytesRef utf8 = new BytesRef();
  final int charCount = s.length();
  UnicodeUtil.UTF16toUTF8(s, 0, charCount, utf8);
  writeBytesRef(utf8, out);
}