org.apache.lucene.util.UnicodeUtil Java Examples

The following examples show how to use org.apache.lucene.util.UnicodeUtil. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: UpperFunction.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Upper-cases the string value of the first argument.
 *
 * <p>The value is decoded from UTF-8 to UTF-16, upper-cased in place through
 * {@code charUtils}, and re-encoded as UTF-8.
 *
 * @param args the function arguments; only {@code args[0]} is read
 * @return the upper-cased value as a new {@link BytesRef}, or {@code null} when the
 *         argument's value is {@code null}
 */
@Override
public BytesRef evaluate(Input<Object>... args) {
    final Object raw = args[0].value();
    if (raw == null) {
        return null;
    }

    final BytesRef utf8In = BytesRefs.toBytesRef(raw);

    // One char per input byte is always enough room for the UTF-16 form.
    final char[] utf16 = new char[utf8In.length];
    final int charCount = UnicodeUtil.UTF8toUTF16(utf8In.bytes, utf8In.offset, utf8In.length, utf16);
    charUtils.toUpperCase(utf16, 0, charCount);

    // Size the output for the worst case before encoding back to UTF-8.
    final byte[] utf8Out = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * charCount];
    final int byteCount = UnicodeUtil.UTF16toUTF8(utf16, 0, charCount, utf8Out);
    return new BytesRef(utf8Out, 0, byteCount);
}

Example #2

Source File: StringEncoding.java From spliceengine with GNU Affero General Public License v3.0

6 votes

/**
 * Serializes a string using Lucene's {@code UnicodeUtil.UTF16toUTF8}, shifting every
 * encoded byte up by 2 and, for descending order, inverting all of its bits.
 *
 * @param value the string to encode; {@code null} yields {@code Encoding.EMPTY_BYTE_ARRAY}
 *              and the empty string yields the single marker byte {@code 0x01}
 *              (bit-inverted when {@code desc} is set)
 * @param desc  whether to invert the bits of each output byte
 * @return a freshly allocated array holding the encoded bytes
 */
public static byte[] toBytes(String value, boolean desc){
    if(value==null) return Encoding.EMPTY_BYTE_ARRAY;
    if(value.isEmpty()){
        return new byte[]{desc ? (byte)(0x01^0xff) : 0x01};
    }

    // UTF-16 -> UTF-8
    BytesRef utf8 = new BytesRef();
    UnicodeUtil.UTF16toUTF8(value, 0, value.length(), utf8);

    // XOR with 0xff flips every bit of the byte; XOR with 0 leaves it untouched.
    final int mask = desc ? 0xff : 0x00;
    final byte[] encoded = new byte[utf8.length];
    for(int src=utf8.offset, dst=0; dst<encoded.length; src++, dst++){
        encoded[dst] = (byte)((utf8.bytes[src] + 2) ^ mask);
    }
    return encoded;
}

Example #3

Source File: FacetFieldProcessorByArrayUIF.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Computes {@code startTermIndex}, {@code endTermIndex} (exclusive) and {@code nTerms}
 * for the un-inverted field, narrowing the range to the configured prefix when both a
 * prefix and a terms enum are available.
 */
@Override
protected void findStartAndEndOrds() throws IOException {
  uif = UnInvertedField.getUnInvertedField(freq.field, fcontext.searcher);
  te = uif.getOrdTermsEnum(fcontext.searcher.getSlowAtomicReader());  // may be null

  // Without a usable prefix the range spans every term.
  startTermIndex = 0;
  endTermIndex = uif.numTerms();  // exclusive upper bound

  final boolean restrictToPrefix = prefixRef != null && te != null;
  if (restrictToPrefix) {
    // Lower bound: ordinal the enum lands on for the prefix, or numTerms() if it runs off the end.
    startTermIndex = te.seekCeil(prefixRef.get()) == TermsEnum.SeekStatus.END
        ? uif.numTerms()
        : (int) te.ord();

    // Upper bound: same lookup after extending the prefix with BIG_TERM.
    prefixRef.append(UnicodeUtil.BIG_TERM);
    endTermIndex = te.seekCeil(prefixRef.get()) == TermsEnum.SeekStatus.END
        ? uif.numTerms()
        : (int) te.ord();
  }

  nTerms = endTermIndex - startTermIndex;
}

Example #4

Source File: ByteBuffersDataOutput.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Writes a string as a VInt byte length followed by its UTF-8 bytes.
 *
 * <p>Strings of at most 1024 chars are encoded in one go through a {@link BytesRef};
 * longer strings are encoded through a fixed-size buffer that is flushed by the callback
 * passed to {@code UTF16toUTF8}.
 *
 * @param v the string to write
 * @throws UncheckedIOException wrapping any {@link IOException} raised while writing
 */
@Override
public void writeString(String v) {
  final int MAX_CHARS_PER_WINDOW = 1024;
  try {
    if (v.length() > MAX_CHARS_PER_WINDOW) {
      // Long string: write the total byte length up front, then encode window by window.
      writeVInt(UnicodeUtil.calcUTF16toUTF8Length(v, 0, v.length()));
      final byte[] window = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * MAX_CHARS_PER_WINDOW];
      UTF16toUTF8(v, 0, v.length(), window, chunkLength -> writeBytes(window, 0, chunkLength));
    } else {
      // Short string: encode everything at once.
      final BytesRef encoded = new BytesRef(v);
      writeVInt(encoded.length);
      writeBytes(encoded.bytes, encoded.offset, encoded.length);
    }
  } catch (IOException e) {
    throw new UncheckedIOException(e);
  }
}

Example #5

Source File: DaciukMihovAutomatonBuilder.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Builds a minimal, deterministic automaton from a sorted collection of {@link BytesRef}
 * holding UTF-8 encoded strings. The input must be in binary sort order.
 *
 * @param input the binary-sorted UTF-8 terms
 * @return the finished automaton
 */
public static Automaton build(Collection<BytesRef> input) {
  final DaciukMihovAutomatonBuilder termBuilder = new DaciukMihovAutomatonBuilder();

  // A single scratch buffer and CharsRef are reused for every term.
  final CharsRef scratch = new CharsRef();
  char[] utf16 = new char[0];
  for (BytesRef term : input) {
    utf16 = ArrayUtil.grow(utf16, term.length);
    scratch.chars = utf16;
    scratch.length = UnicodeUtil.UTF8toUTF16(term, utf16);
    termBuilder.add(scratch);
  }

  final Automaton.Builder automatonBuilder = new Automaton.Builder();
  convert(automatonBuilder, termBuilder.complete(), new IdentityHashMap<State,Integer>());
  return automatonBuilder.finish();
}

Example #6

Source File: FuzzyAutomatonBuilder.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Prepares the Levenshtein automaton builder for the part of {@code term} that follows
 * the fixed prefix.
 *
 * @param term           the term to match approximately
 * @param maxEdits       maximum edit distance, from 0 to
 *                       {@code LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE} inclusive
 * @param prefixLength   number of leading code points kept as an exact prefix; values
 *                       larger than the term are clamped to the term's code point count
 * @param transpositions passed through to {@link LevenshteinAutomata}
 * @throws IllegalArgumentException if {@code maxEdits} is out of range or
 *                                  {@code prefixLength} is negative
 */
FuzzyAutomatonBuilder(String term, int maxEdits, int prefixLength, boolean transpositions) {
  final boolean editsInRange = maxEdits >= 0 && maxEdits <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
  if (!editsInRange) {
    throw new IllegalArgumentException("max edits must be 0.." + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got: " + maxEdits);
  }
  if (prefixLength < 0) {
    throw new IllegalArgumentException("prefixLength cannot be less than 0");
  }

  this.term = term;
  this.maxEdits = maxEdits;

  final int[] termCodePoints = stringToUTF32(term);
  this.termLength = termCodePoints.length;

  // The prefix can never be longer than the term itself.
  final int effectivePrefix = Math.min(prefixLength, termCodePoints.length);
  this.prefix = UnicodeUtil.newString(termCodePoints, 0, effectivePrefix);

  // Everything after the prefix is handed to the Levenshtein automata.
  final int[] remainder = new int[termCodePoints.length - effectivePrefix];
  System.arraycopy(termCodePoints, effectivePrefix, remainder, 0, remainder.length);
  this.levBuilder = new LevenshteinAutomata(remainder, Character.MAX_CODE_POINT, transpositions);
}

Example #7

Source File: TestMappingCharFilter.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Populates the {@code normMap} field with a fixed set of character mappings after
 * running the superclass set-up.
 */
@Override
public void setUp() throws Exception {
  super.setUp();
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();

  // Replacements that are shorter than the matched text:
  builder.add( "aa", "a" );
  builder.add( "bbb", "b" );
  builder.add( "cccc", "cc" );

  // Replacements that are the same length as, or longer than, the matched text:
  builder.add( "h", "i" );
  builder.add( "j", "jj" );
  builder.add( "k", "kkk" );
  builder.add( "ll", "llll" );

  // Replacement by the empty string:
  builder.add( "empty", "" );

  // Non-BMP code point (a surrogate pair in UTF-16):
  builder.add(UnicodeUtil.newString(new int[] {0x1D122}, 0, 1), "fclef");

  // Single BMP char outside ASCII (U+FF01):
  builder.add("\uff01", "full-width-exclamation");

  normMap = builder.build();
}

Example #8

Source File: TestJapaneseTokenizer.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Randomized check that no token emitted by the analyzer is an invalid UTF-16 string,
 * i.e. that supplementary characters are never split.
 */
public void testSurrogates2() throws IOException {
  final int iterations = atLeast(500);
  for (int iter = 0; iter < iterations; iter++) {
    if (VERBOSE) {
      System.out.println("\nTEST: iter=" + iter);
    }
    final String text = TestUtil.randomUnicodeString(random(), 100);
    try (TokenStream stream = analyzer.tokenStream("foo", text)) {
      final CharTermAttribute token = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      boolean hasToken = stream.incrementToken();
      while (hasToken) {
        assertTrue(UnicodeUtil.validUTF16String(token));
        hasToken = stream.incrementToken();
      }
      stream.end();
    }
  }
}

Example #9

Source File: LowerFunction.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Lower-cases the string value of the first argument.
 *
 * <p>The value is decoded from UTF-8 to UTF-16, lower-cased in place through
 * {@code charUtils}, and re-encoded as UTF-8.
 *
 * @param args the function arguments; only {@code args[0]} is read
 * @return the lower-cased value as a new {@link BytesRef}, or {@code null} when the
 *         argument's value is {@code null}
 */
@Override
public BytesRef evaluate(Input<Object>... args) {
    final Object raw = args[0].value();
    if (raw == null) {
        return null;
    }

    final BytesRef utf8In = BytesRefs.toBytesRef(raw);

    // One char per input byte is always enough room for the UTF-16 form.
    final char[] utf16 = new char[utf8In.length];
    final int charCount = UnicodeUtil.UTF8toUTF16(utf8In.bytes, utf8In.offset, utf8In.length, utf16);
    charUtils.toLowerCase(utf16, 0, charCount);

    // Size the output for the worst case before encoding back to UTF-8.
    final byte[] utf8Out = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * charCount];
    final int byteCount = UnicodeUtil.UTF16toUTF8(utf16, 0, charCount, utf8Out);
    return new BytesRef(utf8Out, 0, byteCount);
}

Example #10

Source File: LineContext.java From crate with Apache License 2.0

5 votes

/**
 * Decodes {@code rawSource} from UTF-8 into a {@link String}.
 *
 * @return the decoded text, or {@code null} when {@code rawSource} is {@code null}
 */
@Nullable
String sourceAsString() {
    if (rawSource == null) {
        return null;
    }
    // One char per byte is always sufficient for the decoded form.
    final char[] utf16 = new char[rawSource.length];
    final int charCount = UnicodeUtil.UTF8toUTF16(rawSource, 0, rawSource.length, utf16);
    return new String(utf16, 0, charCount);
}

Example #11

Source File: PartitionName.java From crate with Apache License 2.0

5 votes

/**
 * Reads a UTF-8 encoded value for backwards compatibility. The stream holds the byte
 * length plus one as a VInt, so a stored {@code 0} marks {@code null} and a stored
 * {@code 1} marks the empty string.
 *
 * @param in the stream to read from
 * @return the decoded string, {@code ""}, or {@code null}
 */
private static String readValueFrom(StreamInput in) throws IOException {
    final int stored = in.readVInt();
    if (stored == 0) {
        return null;
    }
    if (stored == 1) {
        return "";
    }
    final int byteCount = stored - 1;
    final byte[] utf8 = new byte[byteCount];
    in.readBytes(utf8, 0, byteCount);
    final char[] utf16 = new char[byteCount];
    final int charCount = UnicodeUtil.UTF8toUTF16(utf8, 0, byteCount, utf16);
    return new String(utf16, 0, charCount);
}

Example #12

Source File: AutomatonTestUtil.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Returns a random regular expression string, drawn from the full unicode range, that is
 * both valid UTF-16 and accepted by the {@link RegExp} parser. Candidates failing either
 * check are discarded and a new one is generated.
 */
public static String randomRegexp(Random r) {
  for (;;) {
    final String candidate = randomRegexpString(r);
    // the generator also produces strings that are not valid UTF-16; skip those
    if (UnicodeUtil.validUTF16String(candidate)) {
      try {
        new RegExp(candidate, RegExp.NONE);
        return candidate;
      } catch (Exception ignored) {
        // not parseable as a regexp: try another candidate
      }
    }
  }
}

Example #13

Source File: FSTTester.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Renders a term for display.
 *
 * @param inputMode      {@code 0} to treat the ints as UTF-8 bytes, anything else to treat
 *                       them as UTF-32 code points
 * @param term           the term to render
 * @param isValidUnicode when {@code false} only {@code term.toString()} is returned
 * @return the decoded text followed by a space and the term itself, or just the term's
 *         string form when it is not valid unicode
 */
static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) {
  if (!isValidUnicode) {
    return term.toString();
  }
  final String decoded = inputMode == 0
      ? toBytesRef(term).utf8ToString()                                 // utf8
      : UnicodeUtil.newString(term.ints, term.offset, term.length);     // utf32
  return decoded + " " + term;
}

Example #14

Source File: UTF8TaxonomyWriterCache.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Encodes a facet label into the shared {@code BytesRefBuilder}: each component is
 * appended as UTF-8, with {@code DELIM_CHAR} between consecutive components.
 *
 * @param label the label whose first {@code label.length} components are encoded
 * @return the builder's current {@link BytesRef}
 */
private BytesRef toBytes(FacetLabel label) {
  final BytesRefBuilder scratch = this.bytes.get();
  scratch.clear();
  for (int idx = 0; idx < label.length; idx++) {
    final String component = label.components[idx];
    if (idx > 0) {
      scratch.append(DELIM_CHAR);
    }
    // Make room for the worst-case encoding, then encode directly into the backing array.
    scratch.grow(scratch.length() + UnicodeUtil.maxUTF8Length(component.length()));
    final int newLength =
        UnicodeUtil.UTF16toUTF8(component, 0, component.length(), scratch.bytes(), scratch.length());
    scratch.setLength(newLength);
  }
  return scratch.get();
}

Example #15

Source File: StringEncoding.java From spliceengine with GNU Affero General Public License v3.0

5 votes

/**
 * Encodes {@code value} into {@code buffer}, starting at {@code offset}.
 *
 * <p>The string is converted to UTF-8, every byte is shifted up by 2 and, when
 * {@code desc} is set, all of its bits are inverted.
 *
 * @param value  the string to encode; nothing is written for {@code null} or empty input
 * @param desc   whether to invert the bits of each output byte
 * @param buffer destination array; must have room for the encoded bytes from {@code offset}
 * @param offset index in {@code buffer} of the first byte to write
 * @return the number of bytes written to {@code buffer}
 */
public static int toBytes(String value, boolean desc, byte[] buffer, int offset){
    if(value==null || value.isEmpty()) return 0;

    //convert to UTF-8 encoding
    BytesRef result = new BytesRef();
    UnicodeUtil.UTF16toUTF8(value, 0, value.length(), result);
    for(int i=0;i<result.length;i++){
        byte newD = (byte)(result.bytes[i+result.offset] + 2);
        if(desc)
            newD ^= 0xff; //flip every bit of the byte for the descending encoding
        buffer[offset+i] = newD;
    }
    // Report bytes written rather than UTF-16 chars: the two differ for any
    // character that needs more than one UTF-8 byte.
    return result.length;
}

Example #16

Source File: LabelledCharArrayMatcher.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Returns a representation of the automaton that matches char[] instead of byte[].
 *
 * <p>The returned matcher walks the given char range, converts it to UTF-8 on the fly and
 * feeds the resulting bytes to {@code runAutomaton} one step at a time. It reports a match
 * when the automaton is in an accept state after the last byte, and fails early as soon
 * as a step returns {@code -1}.
 *
 * @param label        label passed through to the delegate {@code wrap} overload
 * @param runAutomaton the byte-based automaton to drive
 */
static LabelledCharArrayMatcher wrap(String label, ByteRunAutomaton runAutomaton) {
    return wrap(label, (chars, offset, length) -> {
      int state = 0;
      final int maxIdx = offset + length;
      for (int i = offset; i < maxIdx; i++) {
        final int code = chars[i];
        int b;
        // UTF16 to UTF8   (inlined logic from UnicodeUtil.UTF16toUTF8 )
        if (code < 0x80) {
          // single-byte encoding: the char value is the byte
          state = runAutomaton.step(state, code);
          if (state == -1) return false;
        } else if (code < 0x800) {
          // two-byte encoding: lead byte carries the high bits, continuation byte the low 6 bits
          b = (0xC0 | (code >> 6));
          state = runAutomaton.step(state, b);
          if (state == -1) return false;
          b = (0x80 | (code & 0x3F));
          state = runAutomaton.step(state, b);
          if (state == -1) return false;
        } else {
          // more complex: hand everything from this char to the end of the range to
          // UnicodeUtil, sized at 4 bytes per remaining char
          byte[] utf8Bytes = new byte[4 * (maxIdx - i)];
          int utf8Len = UnicodeUtil.UTF16toUTF8(chars, i, maxIdx - i, utf8Bytes);
          for (int utfIdx = 0; utfIdx < utf8Len; utfIdx++) {
            // mask to feed the byte as an unsigned value
            state = runAutomaton.step(state, utf8Bytes[utfIdx] & 0xFF);
            if (state == -1) return false;
          }
          // the rest of the range has been consumed above, so stop iterating
          break;
        }
      }
      return runAutomaton.isAccept(state);
    });
}

Example #17

Source File: TestIndexWriterUnicode.java From lucene-solr with Apache License 2.0

5 votes

/**
 * For every code point outside the surrogate range, checks that the UTF-8 bytes produced
 * by {@link BytesRef} agree with the JDK's UTF-8 encoder, and that decoding them back
 * (through the JDK and through {@code CharsRefBuilder.copyUTF8Bytes}) yields the original
 * UTF-16 string.
 */
public void testAllUnicodeChars() throws Throwable {

  CharsRefBuilder utf16 = new CharsRefBuilder();
  char[] chars = new char[2];
  // Inclusive bound so that the last code point, U+10FFFF, is exercised as well.
  for (int ch = 0; ch <= Character.MAX_CODE_POINT; ch++) {

    if (ch == 0xd800) {
      // Skip invalid code points (the surrogate range 0xd800..0xdfff)
      ch = 0xe000;
    }

    // Build the UTF-16 form: one char for the BMP, a surrogate pair above it.
    int len = 0;
    if (ch <= 0xffff) {
      chars[len++] = (char) ch;
    } else {
      chars[len++] = (char) (((ch - 0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);
      chars[len++] = (char) (((ch - 0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);
    }

    BytesRef utf8 = new BytesRef(CharBuffer.wrap(chars, 0, len));

    // BytesRef's UTF-8 must decode to the same string with the JDK decoder.
    String s1 = new String(chars, 0, len);
    String s2 = new String(utf8.bytes, 0, utf8.length, StandardCharsets.UTF_8);
    assertEquals("codepoint " + ch, s1, s2);

    // ... and with CharsRefBuilder.
    utf16.copyUTF8Bytes(utf8.bytes, 0, utf8.length);
    assertEquals("codepoint " + ch, s1, utf16.toString());

    // The bytes themselves must match the JDK encoder's output.
    byte[] b = s1.getBytes(StandardCharsets.UTF_8);
    assertEquals(utf8.length, b.length);
    for (int j = 0; j < utf8.length; j++) {
      assertEquals(utf8.bytes[j], b[j]);
    }
  }
}

Example #18

Source File: TestExtendedMode.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Randomized check that no token emitted by the analyzer is an invalid UTF-16 string,
 * i.e. that supplementary characters are never split.
 */
public void testSurrogates2() throws IOException {
  final int iterations = atLeast(500);
  for (int iter = 0; iter < iterations; iter++) {
    final String text = TestUtil.randomUnicodeString(random(), 100);
    try (TokenStream stream = analyzer.tokenStream("foo", text)) {
      final CharTermAttribute token = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      boolean hasToken = stream.incrementToken();
      while (hasToken) {
        assertTrue(UnicodeUtil.validUTF16String(token));
        hasToken = stream.incrementToken();
      }
      stream.end();
    }
  }
}

Example #19

Source File: IndexSizeEstimator.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Process a string field.
 *
 * <p>The UTF-8 length of the complete value is computed first; the value is then trimmed
 * to at most {@code maxLength} chars before the field name, the (possibly trimmed) value
 * and the untrimmed length are passed to {@code countItem}.
 *
 * @param fieldInfo the field being processed; its name is passed to {@code countItem}
 * @param value     the field value; a {@code null} value is reported with length 0
 */
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
  int len = 0;
  if (value != null) {
    len = UnicodeUtil.calcUTF16toUTF8Length(value, 0, value.length());
    // trim the value if needed
    if (value.length() > maxLength) {
      value = value.substring(0, maxLength);
    }
  }
  // NOTE(review): a null value is forwarded as-is; confirm countItem accepts null.
  countItem(fieldInfo.name, value, len);
}

Example #20

Source File: TestUTF32ToUTF8.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Checks that the char-based and byte-based run automata built from {@code automaton}
 * agree on a mix of random unicode strings and strings produced by
 * {@code RandomAcceptedStrings}.
 */
private void assertAutomaton(Automaton automaton) throws Exception {
  final CharacterRunAutomaton charRunner = new CharacterRunAutomaton(automaton);
  final ByteRunAutomaton byteRunner = new ByteRunAutomaton(automaton);
  final AutomatonTestUtil.RandomAcceptedStrings acceptedStrings = new AutomatonTestUtil.RandomAcceptedStrings(automaton);

  final int rounds = atLeast(1000);
  for (int round = 0; round < rounds; round++) {
    final String candidate;
    if (random().nextBoolean()) {
      // likely not accepted
      candidate = TestUtil.randomUnicodeString(random());
    } else {
      // will be accepted
      final int[] codepoints = acceptedStrings.getRandomAcceptedString(random());
      try {
        candidate = UnicodeUtil.newString(codepoints, 0, codepoints.length);
      } catch (Exception e) {
        // dump the offending code points before propagating the failure
        System.out.println(codepoints.length + " codepoints:");
        for (int codepoint : codepoints) {
          System.out.println("  " + Integer.toHexString(codepoint));
        }
        throw e;
      }
    }
    final byte[] utf8 = candidate.getBytes(StandardCharsets.UTF_8);
    final boolean charVerdict = charRunner.run(candidate);
    final boolean byteVerdict = byteRunner.run(utf8, 0, utf8.length);
    assertEquals(charVerdict, byteVerdict);
  }
}

Example #21

Source File: PHPSerializedResponseWriter.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Writes a string in PHP serialized form: {@code s:<byte count>:"<value>";}.
 *
 * <p>No escaping is applied, but the reported size is the number of UTF-8 bytes rather
 * than the number of chars, so the value is encoded into the reusable {@code utf8} buffer
 * to obtain that count.
 */
@Override
public void writeStr(String name, String val, boolean needsEscaping) throws IOException {
  final int charCount = val.length();
  // grow the scratch buffer to the worst-case encoded size before encoding
  utf8 = ArrayUtil.grow(utf8, charCount * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
  final int byteCount = UnicodeUtil.UTF16toUTF8(val, 0, charCount, utf8);
  final String sizeField = Integer.toString(byteCount);

  writer.write("s:");
  writer.write(sizeField);
  writer.write(":\"");
  writer.write(val);
  writer.write("\";");
}

Example #22

Source File: TestIndexWriterUnicode.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Indexes random one-char strings (from either side of the surrogate range) and random
 * surrogate pairs as terms of field "f", committing every 42 documents, and then runs
 * {@code checkTermsOrder} against each leaf reader, against the whole reader, and again
 * after a force-merge to one segment.
 */
public void testTermUTF16SortOrder() throws Throwable {
  Random rnd = random();
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(rnd, dir);
  Document d = new Document();
  // One string field; its value is replaced for every document added below
  Field f = newStringField("f", "", Field.Store.NO);
  d.add(f);
  char[] chars = new char[2];
  // Every term that was indexed, handed to checkTermsOrder afterwards
  final Set<String> allTerms = new HashSet<>();

  int num = atLeast(200);
  for (int i = 0; i < num; i++) {

    final String s;
    if (rnd.nextBoolean()) {
      // Single char
      if (rnd.nextBoolean()) {
        // Above surrogates
        chars[0] = (char) getInt(rnd, 1+UnicodeUtil.UNI_SUR_LOW_END, 0xffff);
      } else {
        // Below surrogates
        chars[0] = (char) getInt(rnd, 0, UnicodeUtil.UNI_SUR_HIGH_START-1);
      }
      s = new String(chars, 0, 1);
    } else {
      // Surrogate pair
      chars[0] = (char) getInt(rnd, UnicodeUtil.UNI_SUR_HIGH_START, UnicodeUtil.UNI_SUR_HIGH_END);
      // Sanity check that the generated char really is a high surrogate
      assertTrue(((int) chars[0]) >= UnicodeUtil.UNI_SUR_HIGH_START && ((int) chars[0]) <= UnicodeUtil.UNI_SUR_HIGH_END);
      chars[1] = (char) getInt(rnd, UnicodeUtil.UNI_SUR_LOW_START, UnicodeUtil.UNI_SUR_LOW_END);
      s = new String(chars, 0, 2);
    }
    allTerms.add(s);
    f.setStringValue(s);

    writer.addDocument(d);

    // Commit every 42 documents
    if ((1+i) % 42 == 0) {
      writer.commit();
    }
  }

  IndexReader r = writer.getReader();

  // Test each sub-segment
  for (LeafReaderContext ctx : r.leaves()) {
    checkTermsOrder(ctx.reader(), allTerms, false);
  }
  // Test the reader as a whole, before merging
  checkTermsOrder(r, allTerms, true);

  // Done with the pre-merge reader
  r.close();

  writer.forceMerge(1);

  // Test single segment
  r = writer.getReader();
  checkTermsOrder(r, allTerms, true);
  r.close();

  writer.close();
  dir.close();
}

Example #23

Source File: SerializerUtil.java From incubator-retired-blur with Apache License 2.0

4 votes

/**
 * Encodes {@code s} as UTF-8 and writes it to {@code out} via {@code writeBytesRef}.
 *
 * @param s   the string to write
 * @param out the destination
 */
public static void writeString(String s, DataOutput out) throws IOException {
  final BytesRef utf8 = new BytesRef();
  final int charCount = s.length();
  UnicodeUtil.UTF16toUTF8(s, 0, charCount, utf8);
  writeBytesRef(utf8, out);
}

Example #24

Source File: RegexMatcher.java From Elasticsearch with Apache License 2.0

4 votes

/**
 * Decodes the UTF-8 content of {@code bytes} into {@code charsRef}, replacing its backing
 * array with a larger one when it holds fewer chars than there are bytes to decode.
 *
 * @param bytes    the UTF-8 source
 * @param charsRef the destination; its {@code chars} and {@code length} are updated
 */
private static void UTF8toUTF16(BytesRef bytes, CharsRef charsRef) {
    final int required = bytes.length;
    if (required > charsRef.chars.length) {
        charsRef.chars = new char[required];
    }
    final int decoded = UnicodeUtil.UTF8toUTF16(bytes, charsRef.chars);
    charsRef.length = decoded;
}

Example #25

Source File: TestUTF32ToUTF8.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Tells whether {@code code} lies in the inclusive range from
 * {@code UnicodeUtil.UNI_SUR_HIGH_START} to {@code UnicodeUtil.UNI_SUR_LOW_END}.
 */
private static boolean isSurrogate(int code) {
  final boolean belowRange = code < UnicodeUtil.UNI_SUR_HIGH_START;
  final boolean aboveRange = code > UnicodeUtil.UNI_SUR_LOW_END;
  return !(belowRange || aboveRange);
}

Example #26

Source File: TestUTF32ToUTF8.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Runs the automaton over the UTF-8 encoding of a single code point.
 *
 * @param a    the byte-based automaton
 * @param code the code point to encode and test
 * @return the result of {@code a.run} on the encoded bytes
 */
private boolean matches(ByteRunAutomaton a, int code) {
  final char[] utf16 = Character.toChars(code);
  final int utf16Length = utf16.length;
  final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(utf16Length)];
  final int utf8Length = UnicodeUtil.UTF16toUTF8(utf16, 0, utf16Length, utf8);
  return a.run(utf8, 0, utf8Length);
}

Example #27

Source File: TermGroupFacetCollector.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Switches the collector to a new segment.
 *
 * <p>If counts exist from a previous segment they are first stored as a segment result.
 * The method then loads the group and facet doc values for the new reader, resets the
 * per-segment counters, re-maps the already collected group/facet hits to this segment's
 * ordinals, and finally computes {@code startFacetOrd} / {@code endFacetOrd}, restricted
 * to {@code facetPrefix} when one is set.
 */
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
  // Save the counts gathered for the previous segment, if any
  if (segmentFacetCounts != null) {
    segmentResults.add(createSegmentResult());
  }

  groupFieldTermsIndex = DocValues.getSorted(context.reader(), groupField);
  facetFieldDocTermOrds = DocValues.getSortedSet(context.reader(), facetField);
  facetFieldNumTerms = (int) facetFieldDocTermOrds.getValueCount();
  // No terms enum is obtained when the segment has no facet values
  if (facetFieldNumTerms == 0) {
    facetOrdTermsEnum = null;
  } else {
    facetOrdTermsEnum = facetFieldDocTermOrds.termsEnum();
  }
  // [facetFieldNumTerms() + 1] for all possible facet values and docs not containing facet field
  segmentFacetCounts = new int[facetFieldNumTerms + 1];
  segmentTotalCount = 0;

  // Rebuild the per-segment hit set from the hits collected so far
  segmentGroupedFacetHits.clear();
  for (GroupedFacetHit groupedFacetHit : groupedFacetHits) {
    // A null group value is represented by ordinal -1
    int groupOrd = groupedFacetHit.groupValue == null ? -1 : groupFieldTermsIndex.lookupTerm(groupedFacetHit.groupValue);
    // Skip hits whose group value is not found in this segment
    if (groupedFacetHit.groupValue != null && groupOrd < 0) {
      continue;
    }

    int facetOrd;
    if (groupedFacetHit.facetValue != null) {
      // Skip hits whose facet value is not found in this segment
      if (facetOrdTermsEnum == null || !facetOrdTermsEnum.seekExact(groupedFacetHit.facetValue)) {
        continue;
      }
      facetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      // A null facet value uses the extra slot after all real ordinals
      facetOrd = facetFieldNumTerms;
    }

    // (facetFieldDocTermOrds.numTerms() + 1) for all possible facet values and docs not containing facet field
    int segmentGroupedFacetsIndex = groupOrd * (facetFieldNumTerms + 1) + facetOrd;
    segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);
  }

  if (facetPrefix != null) {
    TermsEnum.SeekStatus seekStatus;
    if (facetOrdTermsEnum != null) {
      seekStatus = facetOrdTermsEnum.seekCeil(facetPrefix);
    } else {
      // No facet terms in this segment: treat as if the seek ran off the end
      seekStatus = TermsEnum.SeekStatus.END;
    }

    if (seekStatus != TermsEnum.SeekStatus.END) {
      startFacetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      // Seeking to the prefix hit the end: use an empty ordinal range
      startFacetOrd = 0;
      endFacetOrd = 0;
      return;
    }

    // Seek to the prefix followed by BIG_TERM to find the end of the prefix range
    BytesRefBuilder facetEndPrefix = new BytesRefBuilder();
    facetEndPrefix.append(facetPrefix);
    facetEndPrefix.append(UnicodeUtil.BIG_TERM);
    seekStatus = facetOrdTermsEnum.seekCeil(facetEndPrefix.get());
    if (seekStatus != TermsEnum.SeekStatus.END) {
      endFacetOrd = (int) facetOrdTermsEnum.ord();
    } else {
      endFacetOrd = facetFieldNumTerms; // Don't include null...
    }
  } else {
    // No prefix: cover every facet ordinal plus the extra slot for missing values
    startFacetOrd = 0;
    endFacetOrd = facetFieldNumTerms + 1;
  }
}

Example #28

Source File: FuzzyTermsEnum.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Advances to the next term of the underlying enum and sets its boost.
 *
 * <p>The smallest edit distance at which the term still matches is determined by
 * repeatedly calling {@code matches} with a decreasing distance. A distance of 0 gets a
 * boost of 1.0; otherwise the boost is {@code 1 - ed / min(term code points, termLength)}.
 * A change of the competitive bottom reported by {@code maxBoostAtt} is recorded but only
 * applied at the start of the following call.
 *
 * @return the next term, or {@code null} when the underlying enum is exhausted
 */
@Override
public BytesRef next() throws IOException {

  // Apply a bottom change that was deferred by the previous call
  if (queuedBottom != null) {
    bottomChanged(queuedBottom);
    queuedBottom = null;
  }
  

  BytesRef term;

  term = actualEnum.next();
  if (term == null) {
    // end
    return null;
  }

  int ed = maxEdits;
    
  // we know the outer DFA always matches.
  // now compute exact edit distance
  while (ed > 0) {
    if (matches(term, ed - 1)) {
      ed--;
    } else {
      break;
    }
  }
    
  if (ed == 0) { // exact match
    boostAtt.setBoost(1.0F);
  } else {
    // scale the edit distance by the shorter of the two terms, measured in code points
    final int codePointCount = UnicodeUtil.codePointCount(term);
    int minTermLength = Math.min(codePointCount, termLength);

    float similarity = 1.0f - (float) ed / (float) minTermLength;
    boostAtt.setBoost(similarity);
  }
    
  final float bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
  final BytesRef bottomTerm = maxBoostAtt.getCompetitiveTerm();
  // bottomTerm is compared by reference, not by content
  if (bottom != this.bottom || bottomTerm != this.bottomTerm) {
    this.bottom = bottom;
    this.bottomTerm = bottomTerm;
    // clone the term before potentially doing something with it
    // this is a rare but wonderful occurrence anyway

    // We must delay bottomChanged until the next next() call otherwise we mess up docFreq(), etc., for the current term:
    queuedBottom = BytesRef.deepCopyOf(term);
  }
  
  return term;
}

Example #29

Source File: TestMappingCharFilter.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Feeds the single code point 0x1D122 through a {@link MappingCharFilter} built on
 * {@code normMap} and expects the one token "fclef" with start offset 0, end offset 2
 * and final offset 2.
 */
public void testNonBMPChar() throws Exception {
  final String input = UnicodeUtil.newString(new int[] {0x1D122}, 0, 1);
  final CharFilter filtered = new MappingCharFilter(normMap, new StringReader(input));
  final TokenStream tokens = whitespaceMockTokenizer(filtered);
  assertTokenStreamContents(tokens, new String[]{"fclef"}, new int[]{0}, new int[]{2}, 2);
}

Example #30

Source File: TokenInfoDictionaryTest.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Enumerates the entire FST/lookup data and just does basic sanity checks.
 *
 * <p>For every FST entry the input is checked to be valid UTF-16 and the source id to be
 * strictly increasing. For every word id attached to an entry, the word id is checked to
 * be strictly increasing and each string attribute returned by the dictionary is checked
 * to be valid UTF-16 (and non-null where the assertions below require it).
 */
public void testEnumerateAll() throws Exception {
  // just for debugging
  int numTerms = 0;
  int numWords = 0;
  int lastWordId = -1;
  int lastSourceId = -1;
  TokenInfoDictionary tid = TokenInfoDictionary.getInstance();
  ConnectionCosts matrix = ConnectionCosts.getInstance();
  FST<Long> fst = tid.getFST().getInternalFST();
  IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<>(fst);
  IntsRefFSTEnum.InputOutput<Long> mapping;
  // reused to receive the word ids of each source id
  IntsRef scratch = new IntsRef();
  while ((mapping = fstEnum.next()) != null) {
    numTerms++;
    IntsRef input = mapping.input;
    // each int of the FST input is narrowed to one UTF-16 char
    char[] chars = new char[input.length];
    for (int i = 0; i < chars.length; i++) {
      chars[i] = (char)input.ints[input.offset+i];
    }
    assertTrue(UnicodeUtil.validUTF16String(new String(chars)));

    Long output = mapping.output;
    int sourceId = output.intValue();
    // we walk in order, terms, sourceIds, and wordIds should always be increasing
    assertTrue(sourceId > lastSourceId);
    lastSourceId = sourceId;
    tid.lookupWordIds(sourceId, scratch);
    for (int i = 0; i < scratch.length; i++) {
      numWords++;
      int wordId = scratch.ints[scratch.offset+i];
      assertTrue(wordId > lastWordId);
      lastWordId = wordId;

      // base form may be absent
      String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
      assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));

      // inflection form may be absent
      String inflectionForm = tid.getInflectionForm(wordId);
      assertTrue(inflectionForm == null || UnicodeUtil.validUTF16String(inflectionForm));
      if (inflectionForm != null) {
        // check that it's actually an ipadic inflection form
        assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));
      }

      // inflection type may be absent
      String inflectionType = tid.getInflectionType(wordId);
      assertTrue(inflectionType == null || UnicodeUtil.validUTF16String(inflectionType));
      if (inflectionType != null) {
        // check that it's actually an ipadic inflection type
        assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType));
      }

      int leftId = tid.getLeftId(wordId);
      int rightId = tid.getRightId(wordId);

      // the results of the next two calls are discarded; they are only invoked
      matrix.get(rightId, leftId);

      tid.getWordCost(wordId);

      // part of speech, pronunciation and reading must all be present
      String pos = tid.getPartOfSpeech(wordId);
      assertNotNull(pos);
      assertTrue(UnicodeUtil.validUTF16String(pos));
      // check that it's actually an ipadic pos tag
      assertNotNull(ToStringUtil.getPOSTranslation(pos));

      String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
      assertNotNull(pronunciation);
      assertTrue(UnicodeUtil.validUTF16String(pronunciation));

      String reading = tid.getReading(wordId, chars, 0, chars.length);
      assertNotNull(reading);
      assertTrue(UnicodeUtil.validUTF16String(reading));
    }
  }
  if (VERBOSE) {
    System.out.println("checked " + numTerms + " terms, " + numWords + " words.");
  }
}