Java Code Examples for org.apache.lucene.util.IntsRefBuilder#grow()

The following examples show how to use org.apache.lucene.util.IntsRefBuilder#grow(). The originating project and license are noted above each example.
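Before the examples, a minimal sketch of the method itself: grow(int) only ensures that the builder's backing int[] has at least the requested capacity; it does not change the builder's current length. The values below are illustrative.

import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;

IntsRefBuilder b = new IntsRefBuilder();
b.grow(8);               // ensure capacity for at least 8 ints; length stays 0
b.setLength(3);          // make indices 0..2 addressable
for (int i = 0; i < 3; i++) {
  b.setIntAt(i, i * 10); // write directly into the backing array
}
IntsRef ref = b.get();   // view over {0, 10, 20}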
Example 1
Source File: FSTTester.java    From lucene-solr with Apache License 2.0
static IntsRef toIntsRefUTF32(String s, IntsRefBuilder ir) {
  final int charLength = s.length();
  int charIdx = 0;
  int intIdx = 0;
  ir.clear();
  while (charIdx < charLength) {
    ir.grow(intIdx + 1);
    // read one code point; a surrogate pair spans two chars but yields one int
    final int utf32 = s.codePointAt(charIdx);
    ir.append(utf32);
    charIdx += Character.charCount(utf32);
    intIdx++;
  }
  return ir.get();
}
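For example, a supplementary character such as U+1F600 occupies two Java chars but collapses to a single slot in the resulting IntsRef (the input string is illustrative):

IntsRefBuilder scratch = new IntsRefBuilder();
IntsRef ints = toIntsRefUTF32("a\uD83D\uDE00b", scratch); // 'a', U+1F600, 'b'
// ints.length == 3: the surrogate pair becomes one code point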
 
Example 2
Source File: FSTTester.java    From lucene-solr with Apache License 2.0
static IntsRef toIntsRef(BytesRef br, IntsRefBuilder ir) {
  ir.grow(br.length);  // reserve full capacity up front; clear() keeps it
  ir.clear();
  for (int i = 0; i < br.length; i++) {
    // mask to widen each signed byte to an unsigned int in 0..255
    ir.append(br.bytes[br.offset + i] & 0xFF);
  }
  return ir.get();
}
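Note the ordering: grow(br.length) reserves the full capacity before the loop so append never has to reallocate, and the subsequent clear() resets only the length, not the capacity. A hypothetical call:

IntsRefBuilder scratch = new IntsRefBuilder();
IntsRef ints = toIntsRef(new BytesRef("ab"), scratch);
// ints.length == 2: each byte is widened to an unsigned int (0x61, 0x62)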
 
Example 3
Source File: Util.java    From lucene-solr with Apache License 2.0
/** Maps each UTF-16 code unit (char) directly to an int in an
 *  IntsRef, without decoding surrogate pairs. */
public static IntsRef toUTF16(CharSequence s, IntsRefBuilder scratch) {
  final int charLimit = s.length();
  scratch.setLength(charLimit);
  scratch.grow(charLimit);
  for (int idx = 0; idx < charLimit; idx++) {
    scratch.setIntAt(idx, (int) s.charAt(idx));
  }
  return scratch.get();
}
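Since every UTF-16 code unit maps to exactly one int, the output length is known in advance; the method sizes the builder once and writes by index rather than appending. A hypothetical call:

IntsRefBuilder scratch = new IntsRefBuilder();
IntsRef ints = Util.toUTF16("hi", scratch);
// ints.length == 2: {'h', 'i'}; a surrogate pair would remain two separate units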
 
Example 4
Source File: Util.java    From lucene-solr with Apache License 2.0
/** Decodes the Unicode codepoints from the provided
 *  CharSequence and places them in the provided scratch
 *  IntsRef, which must not be null, returning it. */
public static IntsRef toUTF32(CharSequence s, IntsRefBuilder scratch) {
  int charIdx = 0;
  int intIdx = 0;
  final int charLimit = s.length();
  while (charIdx < charLimit) {
    scratch.grow(intIdx + 1);
    final int utf32 = Character.codePointAt(s, charIdx);
    scratch.setIntAt(intIdx, utf32);
    charIdx += Character.charCount(utf32);
    intIdx++;
  }
  scratch.setLength(intIdx);
  return scratch.get();
}
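Unlike toUTF16 above, the number of code points is not known until the scan finishes, so the builder is grown one slot at a time and the final length is set at the end. On the same illustrative input as before:

IntsRefBuilder scratch = new IntsRefBuilder();
IntsRef ints = Util.toUTF32("a\uD83D\uDE00b", scratch);
// ints.length == 3 here, whereas toUTF16 would produce length 4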
 
Example 5
Source File: Util.java    From lucene-solr with Apache License 2.0
/** Decodes the Unicode codepoints from the provided
 *  char[] and places them in the provided scratch
 *  IntsRef, which must not be null, returning it. */
public static IntsRef toUTF32(char[] s, int offset, int length, IntsRefBuilder scratch) {
  int charIdx = offset;
  int intIdx = 0;
  final int charLimit = offset + length;
  while (charIdx < charLimit) {
    scratch.grow(intIdx + 1);
    final int utf32 = Character.codePointAt(s, charIdx, charLimit);
    scratch.setIntAt(intIdx, utf32);
    charIdx += Character.charCount(utf32);
    intIdx++;
  }
  scratch.setLength(intIdx);
  return scratch.get();
}
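The char[] overload behaves the same way but lets callers decode a slice of a buffer without allocating a String. A hypothetical call:

char[] buf = {'x', 'a', '\uD83D', '\uDE00', 'y'};
IntsRefBuilder scratch = new IntsRefBuilder();
IntsRef ints = Util.toUTF32(buf, 1, 3, scratch); // decode the slice 'a' + U+1F600
// ints.length == 2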
 
Example 6
Source File: FstDecompounder.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
/**
 * Splits the input sequence of characters into separate words if this sequence is
 * potentially a compound word.
 *
 * @param word The word to be split.
 * @return Returns <code>null</code> if this word is not recognized at all. Returns a
 * character sequence with '.'-delimited compound chunks (if ambiguous
 * interpretations are possible, they are separated by a ',' character). The
 * returned buffer will change with each call to <code>split</code> so copy the
 * content if needed.
 */
public CharSequence split(CharSequence word) {
    try {
        StringBuilder builder = new StringBuilder();
        builder.append(word);
        builder.reverse();
        for (int i = builder.length(); --i > 0; ) {
            // see https://issues.apache.org/jira/browse/COLLECTIONS-294
            builder.setCharAt(i, Character.toLowerCase(Character.toUpperCase(builder.charAt(i))));
        }
        IntsRefBuilder utf32Builder = new IntsRefBuilder();
        IntsRef utf32 = fromUTF16ToUTF32(builder, utf32Builder).get();
        /*
         * This array stores the minimum number of decomposition words during traversals to
         * avoid splitting a larger word into smaller chunks.
         */
        IntsRefBuilder maxPathsBuilder = new IntsRefBuilder();
        maxPathsBuilder.grow(utf32.length + 1);
        Arrays.fill(maxPathsBuilder.ints(), 0, utf32.length + 1, Integer.MAX_VALUE);
        builder.setLength(0);
        Deque<Chunk> chunks = new LinkedList<>();
        matchWord(utf32, utf32.offset, builder, maxPathsBuilder, chunks);
        return builder.length() == 0 ? null : builder;
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }
}
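A usage sketch, assuming an already-constructed FstDecompounder instance (building one requires a dictionary FST, which this example does not show; the input word and output are illustrative):

CharSequence result = decompounder.split("fussballpumpe"); // hypothetical input
if (result != null) {
  // copy immediately: the returned buffer is reused by the next call to split()
  String chunks = result.toString(); // e.g. "fussball.pumpe"
}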
 
Example 7
Source File: UserDictionary.java    From lucene-solr with Apache License 2.0
private UserDictionary(List<String[]> featureEntries) throws IOException {

    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    // TODO: should we allow multiple segmentations per input 'phrase'?
    // the old treemap didn't support this either, and i'm not sure if it's needed/useful?

    Collections.sort(featureEntries, new Comparator<String[]>() {
      @Override
      public int compare(String[] left, String[] right) {
        return left[0].compareTo(right[0]);
      }
    });
    
    List<String> data = new ArrayList<>(featureEntries.size());
    List<int[]> segmentations = new ArrayList<>(featureEntries.size());
    
    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, fstOutput);
    IntsRefBuilder scratch = new IntsRefBuilder();
    long ord = 0;
    
    for (String[] values : featureEntries) {
      String surface = values[0].replaceAll("\\s", "");
      String concatenatedSegment = values[1].replaceAll("\\s", "");
      String[] segmentation = values[1].replaceAll("  *", " ").split(" ");
      String[] readings = values[2].replaceAll("  *", " ").split(" ");
      String pos = values[3];
      
      if (segmentation.length != readings.length) {
        throw new RuntimeException("Illegal user dictionary entry " + values[0] +
                                   " - the number of segmentations (" + segmentation.length + ")" +
                                   " does not the match number of readings (" + readings.length + ")");
      }

      if (!surface.equals(concatenatedSegment)) {
        throw new RuntimeException("Illegal user dictionary entry " + values[0] +
                                   " - the concatenated segmentation (" + concatenatedSegment + ")" +
                                   " does not match the surface form (" + surface + ")");
      }
      
      int[] wordIdAndLength = new int[segmentation.length + 1]; // wordId offset, length, length....
      wordIdAndLength[0] = wordId;
      for (int i = 0; i < segmentation.length; i++) {
        wordIdAndLength[i + 1] = segmentation[i].length();
        data.add(readings[i] + INTERNAL_SEPARATOR + pos);
        wordId++;
      }
      // add mapping to FST
      String token = values[0];
      scratch.grow(token.length());
      scratch.setLength(token.length());
      for (int i = 0; i < token.length(); i++) {
        scratch.setIntAt(i, (int) token.charAt(i));
      }
      fstCompiler.add(scratch.get(), ord);
      segmentations.add(wordIdAndLength);
      ord++;
    }
    this.fst = new TokenInfoFST(fstCompiler.compile(), false);
    this.data = data.toArray(new String[data.size()]);
    this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
  }
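The initial sort matters: FSTCompiler.add requires inputs in ascending order, which is why featureEntries is sorted by surface form before the loop. The grow/setLength/setIntAt scratch-reuse pattern in the loop can be isolated into a minimal sketch (buildFst is a hypothetical helper; keys and output values are illustrative):

static FST<Long> buildFst(String[] sortedKeys) throws IOException {
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  FSTCompiler<Long> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
  IntsRefBuilder scratch = new IntsRefBuilder();
  long ord = 0;
  for (String key : sortedKeys) {           // keys must already be in ascending order
    scratch.grow(key.length());             // ensure capacity once per key
    scratch.setLength(key.length());
    for (int i = 0; i < key.length(); i++) {
      scratch.setIntAt(i, key.charAt(i));   // one int per UTF-16 unit (BYTE2 input)
    }
    fstCompiler.add(scratch.get(), ord++);
  }
  return fstCompiler.compile();
}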