Java Code Examples for org.apache.lucene.util.BytesRefBuilder#grow()

The following examples show how to use org.apache.lucene.util.BytesRefBuilder#grow(). The examples are taken from open source projects; the source file and project are listed above each example so you can view it in its original context.
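
A note on the pattern these examples share: grow(int) only ensures the builder's backing array has at least the requested capacity; it never changes the builder's length. Code therefore grows first, writes bytes directly through setByteAt() or bytes(), and finishes with setLength(). A minimal sketch of that contract:

import org.apache.lucene.util.BytesRefBuilder;

BytesRefBuilder b = new BytesRefBuilder();
b.grow(3);                  // capacity hint only; length() is still 0
b.setByteAt(0, (byte) 'f');
b.setByteAt(1, (byte) 'o');
b.setByteAt(2, (byte) 'o');
b.setLength(3);             // expose the three bytes just written
assert b.get().utf8ToString().equals("foo");
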
Example 1
Source File: SimpleTextUtil.java    From lucene-solr with Apache License 2.0
public static void readLine(DataInput in, BytesRefBuilder scratch) throws IOException {
  int upto = 0;
  while(true) {
    byte b = in.readByte();
    scratch.grow(1+upto);
    if (b == ESCAPE) {
      scratch.setByteAt(upto++, in.readByte());
    } else {
      if (b == NEWLINE) {
        break;
      } else {
        scratch.setByteAt(upto++, b);
      }
    }
  }
  scratch.setLength(upto);
}
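
The grow(1+upto) inside the loop reserves one byte ahead of each write, so setByteAt() can never run past the array; only the final setLength() makes the bytes visible. A hedged usage sketch, assuming readLine() above is in scope (readLineDemo is an illustrative name, not from the original source):

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRefBuilder;

static void readLineDemo() throws IOException {
  BytesRefBuilder scratch = new BytesRefBuilder();
  ByteArrayDataInput in = new ByteArrayDataInput("foo\n".getBytes(StandardCharsets.UTF_8));
  readLine(in, scratch); // consumes bytes up to and including the newline
  assert scratch.get().utf8ToString().equals("foo");
}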
 
Example 2
Source File: LegacyNumericUtils.java    From lucene-solr with Apache License 2.0
/**
 * Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
 * This method is used by {@link org.apache.solr.legacy.LegacyNumericTokenStream}.
 * After encoding, {@code bytes.offset} will always be 0. 
 * @param val the numeric value
 * @param shift how many bits to strip from the right
 * @param bytes will contain the encoded value
 */
public static void longToPrefixCoded(final long val, final int shift, final BytesRefBuilder bytes) {
  // ensure shift is 0..63
  if ((shift & ~0x3f) != 0) {
    throw new IllegalArgumentException("Illegal shift value, must be 0..63; got shift=" + shift);
  }
  int nChars = (((63-shift)*37)>>8) + 1;    // i/7 is the same as (i*37)>>8 for i in 0..63
  bytes.setLength(nChars+1);   // one extra for the byte that contains the shift info
  bytes.grow(BUF_SIZE_LONG);
  bytes.setByteAt(0, (byte)(SHIFT_START_LONG + shift));
  long sortableBits = val ^ 0x8000000000000000L;
  sortableBits >>>= shift;
  while (nChars > 0) {
    // Store 7 bits per byte for compatibility
    // with UTF-8 encoding of terms
    bytes.setByteAt(nChars--, (byte)(sortableBits & 0x7f));
    sortableBits >>>= 7;
  }
}
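
Two details worth noting above: setLength() records the target size but allocates nothing, which is why the grow(BUF_SIZE_LONG) that follows it is required before setByteAt() touches the array; and the (i*37)>>8 in nChars is a division-free i/7. A quick standalone check of that identity (not part of the original source):

// Sanity check: (i * 37) >> 8 equals i / 7 for every i in 0..63.
for (int i = 0; i <= 63; i++) {
  assert ((i * 37) >> 8) == i / 7 : "identity fails at i=" + i;
}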
 
Example 3
Source File: LegacyNumericUtils.java    From lucene-solr with Apache License 2.0
/**
 * Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
 * This method is used by {@link org.apache.solr.legacy.LegacyNumericTokenStream}.
 * After encoding, {@code bytes.offset} will always be 0.
 * @param val the numeric value
 * @param shift how many bits to strip from the right
 * @param bytes will contain the encoded value
 */
public static void intToPrefixCoded(final int val, final int shift, final BytesRefBuilder bytes) {
  // ensure shift is 0..31
  if ((shift & ~0x1f) != 0) {
    throw new IllegalArgumentException("Illegal shift value, must be 0..31; got shift=" + shift);
  }
  int nChars = (((31-shift)*37)>>8) + 1;    // i/7 is the same as (i*37)>>8 for i in 0..63
  bytes.setLength(nChars+1);   // one extra for the byte that contains the shift info
  bytes.grow(LegacyNumericUtils.BUF_SIZE_LONG);  // use the max
  bytes.setByteAt(0, (byte)(SHIFT_START_INT + shift));
  int sortableBits = val ^ 0x80000000;
  sortableBits >>>= shift;
  while (nChars > 0) {
    // Store 7 bits per byte for compatibility
    // with UTF-8 encoding of terms
    bytes.setByteAt(nChars--, (byte)(sortableBits & 0x7f));
    sortableBits >>>= 7;
  }
}
 
Example 4
Source File: Correction.java    From Elasticsearch with Apache License 2.0
public BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef preTag, BytesRef postTag) {
    BytesRef[] toJoin = new BytesRef[this.candidates.length];
    int len = separator.length * this.candidates.length - 1; // capacity estimate for the separators; slightly over-allocates vs. separator.length * (candidates.length - 1)
    for (int i = 0; i < toJoin.length; i++) {
        Candidate candidate = candidates[i];
        if (preTag == null || candidate.userInput) {
            toJoin[i] = candidate.term;
        } else {
            final int maxLen = preTag.length + postTag.length + candidate.term.length;
            final BytesRefBuilder highlighted = new BytesRefBuilder();// just allocate once
            highlighted.grow(maxLen);
            if (i == 0 || candidates[i-1].userInput) {
                highlighted.append(preTag);
            }
            highlighted.append(candidate.term);
            if (toJoin.length == i + 1 || candidates[i+1].userInput) {
                highlighted.append(postTag);
            }
            toJoin[i] = highlighted.get();
        }
        len += toJoin[i].length;
    }
    result.grow(len);
    return SuggestUtils.join(separator, result, toJoin);
}
 
Example 5
Source File: CompositeBytesReference.java    From crate with Apache License 2.0
@Override
public BytesRef toBytesRef() {
    BytesRefBuilder builder = new BytesRefBuilder();
    builder.grow(length());
    BytesRef spare;
    BytesRefIterator iterator = iterator();
    try {
        while ((spare = iterator.next()) != null) {
            builder.append(spare);
        }
    } catch (IOException ex) {
        throw new AssertionError("won't happen", ex); // really an error: our BytesReference implementations never do actual IO
    }
    return builder.toBytesRef();
}
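
Here grow(length()) is purely a pre-sizing optimization: the appends would grow the builder on demand anyway, but reserving the full length up front makes them copy-only. The same pattern in isolation:

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

BytesRefBuilder builder = new BytesRefBuilder();
BytesRef a = new BytesRef("foo");
BytesRef b = new BytesRef("bar");
builder.grow(a.length + b.length); // one allocation up front
builder.append(a);                 // append() advances length(); no resizing needed now
builder.append(b);
assert builder.get().utf8ToString().equals("foobar");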
 
Example 6
Source File: UTF8TaxonomyWriterCache.java    From lucene-solr with Apache License 2.0
private BytesRef toBytes(FacetLabel label) {
  BytesRefBuilder bytes = this.bytes.get();
  bytes.clear();
  for (int i = 0; i < label.length; i++) {
    String part = label.components[i];
    if (i > 0) {
      bytes.append(DELIM_CHAR);
    }
    bytes.grow(bytes.length() + UnicodeUtil.maxUTF8Length(part.length()));
    bytes.setLength(UnicodeUtil.UTF16toUTF8(part, 0, part.length(), bytes.bytes(), bytes.length()));
  }
  return bytes.get();
}
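
Before each part is encoded, the builder grows by UnicodeUtil.maxUTF8Length(part.length()), the UTF-8 worst case for that many UTF-16 units, and UTF16toUTF8() returns the new end offset, which becomes the new length. A minimal sketch of the same encode step for a single string (toUtf8 is an illustrative helper, not from the original source; for one string, BytesRefBuilder#copyChars does the equivalent in a single call):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.UnicodeUtil;

static BytesRef toUtf8(String s, BytesRefBuilder scratch) {
  scratch.clear();
  scratch.grow(UnicodeUtil.maxUTF8Length(s.length())); // reserve the worst case for s
  scratch.setLength(UnicodeUtil.UTF16toUTF8(s, 0, s.length(), scratch.bytes(), 0));
  return scratch.get();
}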
 
Example 7
Source File: Dictionary.java    From lucene-solr with Apache License 2.0
static void encodeFlags(BytesRefBuilder b, char flags[]) {
  int len = flags.length << 1;
  b.grow(len);
  b.clear();
  for (int i = 0; i < flags.length; i++) {
    int flag = flags[i];
    b.append((byte) ((flag >> 8) & 0xff));
    b.append((byte) (flag & 0xff));
  }
}
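
encodeFlags() packs each 16-bit flag big-endian into two bytes; grow(len) reserves the full encoding before clear() resets the length (capacity survives a clear). A hedged sketch of the inverse, for illustration only (the real Dictionary has its own decoder):

import org.apache.lucene.util.BytesRef;

static char[] decodeFlagsSketch(BytesRef b) {
  char[] flags = new char[b.length >>> 1];
  for (int i = 0; i < flags.length; i++) {
    int hi = b.bytes[b.offset + (i << 1)] & 0xff;     // high byte first
    int lo = b.bytes[b.offset + (i << 1) + 1] & 0xff; // then low byte
    flags[i] = (char) ((hi << 8) | lo);
  }
  return flags;
}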
 
Example 8
Source File: EnumFieldType.java    From lucene-solr with Apache License 2.0
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  final String s = val.toString();
  if (s == null)
    return;

  result.grow(Integer.BYTES);
  result.setLength(Integer.BYTES);
  final Integer intValue = enumMapping.stringValueToIntValue(s);
  NumericUtils.intToSortableBytes(intValue, result.bytes(), 0);
}
 
Example 9
Source File: EnumFieldType.java    From lucene-solr with Apache License 2.0
@Override
public String storedToIndexed(IndexableField f) {
  final Number val = f.numericValue();
  if (val == null)
    return null;
  final BytesRefBuilder bytes = new BytesRefBuilder();
  bytes.grow(Integer.BYTES);
  bytes.setLength(Integer.BYTES);
  NumericUtils.intToSortableBytes(val.intValue(), bytes.bytes(), 0);
  return bytes.get().utf8ToString();
}
 
Example 10
Source File: DatePointField.java    From lucene-solr with Apache License 2.0
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  Date date = (Date) toNativeType(val.toString());
  result.grow(Long.BYTES);
  result.setLength(Long.BYTES);
  LongPoint.encodeDimension(date.getTime(), result.bytes(), 0);
}
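
Examples 8–10 (and the point-field examples further down) all follow one fixed-width recipe: grow(N) reserves the encoded width, setLength(N) exposes it, and the encoder writes in place at offset 0. A hedged round trip under the same assumptions:

import org.apache.lucene.document.IntPoint;
import org.apache.lucene.util.BytesRefBuilder;

BytesRefBuilder result = new BytesRefBuilder();
result.grow(Integer.BYTES);      // reserve the 4 encoded bytes
result.setLength(Integer.BYTES); // grow() never touches length, so set it explicitly
IntPoint.encodeDimension(42, result.bytes(), 0);
assert IntPoint.decodeDimension(result.bytes(), 0) == 42; // sortable big-endian form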
 
Example 11
Source File: SimpleMLTQParser.java    From lucene-solr with Apache License 2.0
private Term createNumericTerm(String field, String uniqueValue) {
  BytesRefBuilder bytesRefBuilder = new BytesRefBuilder();
  bytesRefBuilder.grow(LegacyNumericUtils.BUF_SIZE_INT);
  LegacyNumericUtils.intToPrefixCoded(Integer.parseInt(uniqueValue), 0, bytesRefBuilder);
  return new Term(field, bytesRefBuilder);
}
 
Example 12
Source File: DoublePointField.java    From lucene-solr with Apache License 2.0
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  result.grow(Double.BYTES);
  result.setLength(Double.BYTES);
  DoublePoint.encodeDimension(parseDoubleFromUser(null, val.toString()), result.bytes(), 0);
}
 
Example 13
Source File: LongPointField.java    From lucene-solr with Apache License 2.0
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  result.grow(Long.BYTES);
  result.setLength(Long.BYTES);
  LongPoint.encodeDimension(parseLongFromUser(null, val.toString()), result.bytes(), 0);
}
 
Example 14
Source File: IntPointField.java    From lucene-solr with Apache License 2.0
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  result.grow(Integer.BYTES);
  result.setLength(Integer.BYTES);
  IntPoint.encodeDimension(parseIntFromUser(null, val.toString()), result.bytes(), 0);
}
 
Example 15
Source File: FloatPointField.java    From lucene-solr with Apache License 2.0
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  result.grow(Float.BYTES);
  result.setLength(Float.BYTES);
  FloatPoint.encodeDimension(parseFloatFromUser(null, val.toString()), result.bytes(), 0);
}
 
Example 16
Source File: CloudMLTQParser.java    From lucene-solr with Apache License 2.0
private Term createNumericTerm(String field, String uniqueValue) {
  BytesRefBuilder bytesRefBuilder = new BytesRefBuilder();
  bytesRefBuilder.grow(LegacyNumericUtils.BUF_SIZE_INT);
  LegacyNumericUtils.intToPrefixCoded(Integer.parseInt(uniqueValue), 0, bytesRefBuilder);
  return new Term(field, bytesRefBuilder.toBytesRef());
}
 
Example 17
Source File: TestLucene80DocValuesFormat.java    From lucene-solr with Apache License 2.0
@Nightly
public void testSortedSetAroundBlockSize() throws IOException {
  final int frontier = 1 << Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
  for (int maxDoc = frontier - 1; maxDoc <= frontier + 1; ++maxDoc) {
    final Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    Document doc = new Document();
    SortedSetDocValuesField field1 = new SortedSetDocValuesField("sset", new BytesRef());
    doc.add(field1);
    SortedSetDocValuesField field2 = new SortedSetDocValuesField("sset", new BytesRef());
    doc.add(field2);
    for (int i = 0; i < maxDoc; ++i) {
      BytesRef s1 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
      BytesRef s2 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
      field1.setBytesValue(s1);
      field2.setBytesValue(s2);
      w.addDocument(doc);
      Set<BytesRef> set = new TreeSet<>(Arrays.asList(s1, s2));
      out.writeVInt(set.size());
      for (BytesRef ref : set) {
        out.writeVInt(ref.length);
        out.writeBytes(ref.bytes, ref.offset, ref.length);
      }
    }

    w.forceMerge(1);
    DirectoryReader r = DirectoryReader.open(w);
    w.close();
    LeafReader sr = getOnlyLeafReader(r);
    assertEquals(maxDoc, sr.maxDoc());
    SortedSetDocValues values = sr.getSortedSetDocValues("sset");
    assertNotNull(values);
    ByteBuffersDataInput in = out.toDataInput();
    BytesRefBuilder b = new BytesRefBuilder();
    for (int i = 0; i < maxDoc; ++i) {
      assertEquals(i, values.nextDoc());
      final int numValues = in.readVInt();

      for (int j = 0; j < numValues; ++j) {
        b.setLength(in.readVInt());
        b.grow(b.length());
        in.readBytes(b.bytes(), 0, b.length());
        assertEquals(b.get(), values.lookupOrd(values.nextOrd()));
      }

      assertEquals(SortedSetDocValues.NO_MORE_ORDS, values.nextOrd());
    }
    r.close();
    dir.close();
  }
}
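
The read loop above leans on the same division of labor: setLength(in.readVInt()) records the size without allocating, so the grow(b.length()) that follows is what actually guarantees bytes() can hold the value before readBytes() fills it. A minimal standalone round trip (illustrative, not from the test):

import java.io.IOException;
import org.apache.lucene.store.ByteBuffersDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

static void lengthPrefixedRoundTrip() throws IOException {
  ByteBuffersDataOutput out = new ByteBuffersDataOutput();
  BytesRef value = new BytesRef("hello");
  out.writeVInt(value.length);
  out.writeBytes(value.bytes, value.offset, value.length);

  ByteBuffersDataInput in = out.toDataInput();
  BytesRefBuilder b = new BytesRefBuilder();
  b.setLength(in.readVInt()); // size only; no allocation yet
  b.grow(b.length());         // now make sure the backing array fits
  in.readBytes(b.bytes(), 0, b.length());
  assert b.get().equals(value);
}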
 
Example 18
Source File: SynonymMap.java    From lucene-solr with Apache License 2.0
/**
 * Builds a {@link SynonymMap} and returns it.
 */
public SynonymMap build() throws IOException {
  ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
  // TODO: are we using the best sharing options?
  FSTCompiler<BytesRef> fstCompiler =
    new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
  
  BytesRefBuilder scratch = new BytesRefBuilder();
  ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

  final Set<Integer> dedupSet;

  if (dedup) {
    dedupSet = new HashSet<>();
  } else {
    dedupSet = null;
  }

  final byte[] spare = new byte[5];
  
  Set<CharsRef> keys = workingSet.keySet();
  CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
  Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());

  final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
  
  //System.out.println("fmap.build");
  for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
    CharsRef input = sortedKeys[keyIdx];
    MapEntry output = workingSet.get(input);

    int numEntries = output.ords.size();
    // output size, assume the worst case
    int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
    
    scratch.grow(estimatedSize);
    scratchOutput.reset(scratch.bytes());

    // now write our output data:
    int count = 0;
    for (int i = 0; i < numEntries; i++) {
      if (dedupSet != null) {
        // box once
        final Integer ent = output.ords.get(i);
        if (dedupSet.contains(ent)) {
          continue;
        }
        dedupSet.add(ent);
      }
      scratchOutput.writeVInt(output.ords.get(i));   
      count++;
    }

    final int pos = scratchOutput.getPosition();
    scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
    final int pos2 = scratchOutput.getPosition();
    final int vIntLen = pos2-pos;

    // Move the count + includeOrig to the front of the byte[]:
    System.arraycopy(scratch.bytes(), pos, spare, 0, vIntLen);
    System.arraycopy(scratch.bytes(), 0, scratch.bytes(), vIntLen, pos);
    System.arraycopy(spare, 0, scratch.bytes(), 0, vIntLen);

    if (dedupSet != null) {
      dedupSet.clear();
    }
    
    scratch.setLength(scratchOutput.getPosition());
    //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
    fstCompiler.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
  }
  
  FST<BytesRef> fst = fstCompiler.compile();
  return new SynonymMap(fst, words, maxHorizontalContext);
}
 
Example 19
Source File: SimpleTextFieldsReader.java    From lucene-solr with Apache License 2.0
private void loadTerms() throws IOException {
  PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton();
  final FSTCompiler<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstCompiler;
  final PairOutputs<Long,Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
  final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
      outputsInner);
  fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
  IndexInput in = SimpleTextFieldsReader.this.in.clone();
  in.seek(termsStart);
  final BytesRefBuilder lastTerm = new BytesRefBuilder();
  long lastDocsStart = -1;
  int docFreq = 0;
  long totalTermFreq = 0;
  FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
  final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
  while(true) {
    SimpleTextUtil.readLine(in, scratch);
    if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
      if (lastDocsStart != -1) {
        fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
            outputs.newPair(lastDocsStart,
                outputsInner.newPair((long) docFreq, totalTermFreq)));
        sumTotalTermFreq += totalTermFreq;
      }
      break;
    } else if (StringHelper.startsWith(scratch.get(), DOC)) {
      docFreq++;
      sumDocFreq++;
      totalTermFreq++;
      scratchUTF16.copyUTF8Bytes(scratch.bytes(), DOC.length, scratch.length()-DOC.length);
      int docID = ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length());
      visitedDocs.set(docID);
    } else if (StringHelper.startsWith(scratch.get(), FREQ)) {
      scratchUTF16.copyUTF8Bytes(scratch.bytes(), FREQ.length, scratch.length()-FREQ.length);
      totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1;
    } else if (StringHelper.startsWith(scratch.get(), TERM)) {
      if (lastDocsStart != -1) {
        fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
            outputsInner.newPair((long) docFreq, totalTermFreq)));
      }
      lastDocsStart = in.getFilePointer();
      final int len = scratch.length() - TERM.length;
      lastTerm.grow(len);
      System.arraycopy(scratch.bytes(), TERM.length, lastTerm.bytes(), 0, len);
      lastTerm.setLength(len);
      docFreq = 0;
      sumTotalTermFreq += totalTermFreq;
      totalTermFreq = 0;
      termCount++;
    }
  }
  docCount = visitedDocs.cardinality();
  fst = fstCompiler.compile();
  /*
  PrintStream ps = new PrintStream("out.dot");
  fst.toDot(ps);
  ps.close();
  System.out.println("SAVED out.dot");
  */
  //System.out.println("FST " + fst.sizeInBytes());
}
 
Example 20
Source File: TermQueryPrefixTreeStrategy.java    From lucene-solr with Apache License 2.0
@Override
public Query makeQuery(SpatialArgs args) {
  final SpatialOperation op = args.getOperation();
  if (op != SpatialOperation.Intersects)
    throw new UnsupportedSpatialOperation(op);

  Shape shape = args.getShape();
  int detailLevel = grid.getLevelForDistance(args.resolveDistErr(ctx, distErrPct));

  //--get a List of BytesRef for each term we want (no parents, no leaf bytes)
  final int GUESS_NUM_TERMS;
  if (shape instanceof Point)
    GUESS_NUM_TERMS = detailLevel;//perfect guess
  else
    GUESS_NUM_TERMS = 4096;//should this be a method on SpatialPrefixTree?

  BytesRefBuilder masterBytes = new BytesRefBuilder();//shared byte array for all terms
  List<BytesRef> terms = new ArrayList<>(GUESS_NUM_TERMS);

  CellIterator cells = grid.getTreeCellIterator(shape, detailLevel);
  while (cells.hasNext()) {
    Cell cell = cells.next();
    if (!cell.isLeaf())
      continue;
    BytesRef term = cell.getTokenBytesNoLeaf(null);//null because we want a new BytesRef
    //We copy out the bytes because it may be re-used across the iteration. This also gives us the opportunity
    // to use one contiguous block of memory for the bytes of all terms we need.
    masterBytes.grow(masterBytes.length() + term.length);
    masterBytes.append(term);
    term.bytes = null;//don't need; will reset later
    term.offset = masterBytes.length() - term.length;
    terms.add(term);
  }
  //do this only now: an earlier grow() call could have replaced the underlying byte array
  for (BytesRef byteRef : terms) {
    byteRef.bytes = masterBytes.bytes();
  }
  //unfortunately TermsQuery will needlessly sort & dedupe
  //TODO an automatonQuery might be faster?
  return new TermInSetQuery(getFieldName(), terms);
}