Java Code Examples for org.apache.lucene.util.BytesRefBuilder#grow()

The following examples show how to use org.apache.lucene.util.BytesRefBuilder#grow(). The examples are taken from open source projects; the source file and project are listed above each example so you can view it in its original context.
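
A note on the pattern these examples share: grow(int) only ensures the builder's backing array has at least the requested capacity; it never changes the builder's length. Code therefore grows first, writes bytes directly through setByteAt() or bytes(), and finishes with setLength(). A minimal sketch of that contract:

import org.apache.lucene.util.BytesRefBuilder;

BytesRefBuilder b = new BytesRefBuilder();
b.grow(3);                  // capacity hint only; length() is still 0
b.setByteAt(0, (byte) 'f');
b.setByteAt(1, (byte) 'o');
b.setByteAt(2, (byte) 'o');
b.setLength(3);             // expose the three bytes just written
assert b.get().utf8ToString().equals("foo");
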
Example 1
Source File: SimpleTextUtil.java    From lucene-solr with Apache License 2.0
public static void readLine(DataInput in, BytesRefBuilder scratch) throws IOException {
  int upto = 0;
  while(true) {
    byte b = in.readByte();
    scratch.grow(1+upto);
    if (b == ESCAPE) {
      scratch.setByteAt(upto++, in.readByte());
    } else {
      if (b == NEWLINE) {
        break;
      } else {
        scratch.setByteAt(upto++, b);
      }
    }
  }
  scratch.setLength(upto);
}
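
The grow(1+upto) inside the loop reserves one byte ahead of each write, so setByteAt() can never run past the array; only the final setLength() makes the bytes visible. A hedged usage sketch, assuming readLine() above is in scope (readLineDemo is an illustrative name, not from the original source):

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRefBuilder;

static void readLineDemo() throws IOException {
  BytesRefBuilder scratch = new BytesRefBuilder();
  ByteArrayDataInput in = new ByteArrayDataInput("foo\n".getBytes(StandardCharsets.UTF_8));
  readLine(in, scratch); // consumes bytes up to and including the newline
  assert scratch.get().utf8ToString().equals("foo");
}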
 
Example 2
Source File: LegacyNumericUtils.java    From lucene-solr with Apache License 2.0
/**
 * Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
 * This method is used by {@link org.apache.solr.legacy.LegacyNumericTokenStream}.
 * After encoding, {@code bytes.offset} will always be 0. 
 * @param val the numeric value
 * @param shift how many bits to strip from the right
 * @param bytes will contain the encoded value
 */
public static void longToPrefixCoded(final long val, final int shift, final BytesRefBuilder bytes) {
  // ensure shift is 0..63
  if ((shift & ~0x3f) != 0) {
    throw new IllegalArgumentException("Illegal shift value, must be 0..63; got shift=" + shift);
  }
  int nChars = (((63-shift)*37)>>8) + 1;    // i/7 is the same as (i*37)>>8 for i in 0..63
  bytes.setLength(nChars+1);   // one extra for the byte that contains the shift info
  bytes.grow(BUF_SIZE_LONG);
  bytes.setByteAt(0, (byte)(SHIFT_START_LONG + shift));
  long sortableBits = val ^ 0x8000000000000000L;
  sortableBits >>>= shift;
  while (nChars > 0) {
    // Store 7 bits per byte for compatibility
    // with UTF-8 encoding of terms
    bytes.setByteAt(nChars--, (byte)(sortableBits & 0x7f));
    sortableBits >>>= 7;
  }
}
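
Two details worth noting above: setLength() records the target size but allocates nothing, which is why the grow(BUF_SIZE_LONG) that follows it is required before setByteAt() touches the array; and the (i*37)>>8 in nChars is a division-free i/7. A quick standalone check of that identity (not part of the original source):

// Sanity check: (i * 37) >> 8 equals i / 7 for every i in 0..63.
for (int i = 0; i <= 63; i++) {
  assert ((i * 37) >> 8) == i / 7 : "identity fails at i=" + i;
}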
 
Example 3
Source File: LegacyNumericUtils.java    From lucene-solr with Apache License 2.0
/**
 * Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
 * This method is used by {@link org.apache.solr.legacy.LegacyNumericTokenStream}.
 * After encoding, {@code bytes.offset} will always be 0.
 * @param val the numeric value
 * @param shift how many bits to strip from the right
 * @param bytes will contain the encoded value
 */
public static void intToPrefixCoded(final int val, final int shift, final BytesRefBuilder bytes) {
  // ensure shift is 0..31
  if ((shift & ~0x1f) != 0) {
    throw new IllegalArgumentException("Illegal shift value, must be 0..31; got shift=" + shift);
  }
  int nChars = (((31-shift)*37)>>8) + 1;    // i/7 is the same as (i*37)>>8 for i in 0..63
  bytes.setLength(nChars+1);   // one extra for the byte that contains the shift info
  bytes.grow(LegacyNumericUtils.BUF_SIZE_LONG);  // use the max
  bytes.setByteAt(0, (byte)(SHIFT_START_INT + shift));
  int sortableBits = val ^ 0x80000000;
  sortableBits >>>= shift;
  while (nChars > 0) {
    // Store 7 bits per byte for compatibility
    // with UTF-8 encoding of terms
    bytes.setByteAt(nChars--, (byte)(sortableBits & 0x7f));
    sortableBits >>>= 7;
  }
}
 
Example 4
Source File: Correction.java    From Elasticsearch with Apache License 2.0
public BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef preTag, BytesRef postTag) {
    BytesRef[] toJoin = new BytesRef[this.candidates.length];
    int len = separator.length * this.candidates.length - 1; // capacity estimate for the separators; slightly over-allocates vs. separator.length * (candidates.length - 1)
    for (int i = 0; i < toJoin.length; i++) {
        Candidate candidate = candidates[i];
        if (preTag == null || candidate.userInput) {
            toJoin[i] = candidate.term;
        } else {
            final int maxLen = preTag.length + postTag.length + candidate.term.length;
            final BytesRefBuilder highlighted = new BytesRefBuilder();// just allocate once
            highlighted.grow(maxLen);
            if (i == 0 || candidates[i-1].userInput) {
                highlighted.append(preTag);
            }
            highlighted.append(candidate.term);
            if (toJoin.length == i + 1 || candidates[i+1].userInput) {
                highlighted.append(postTag);
            }
            toJoin[i] = highlighted.get();
        }
        len += toJoin[i].length;
    }
    result.grow(len);
    return SuggestUtils.join(separator, result, toJoin);
}
 
Example 5
Source File: CompositeBytesReference.java    From crate with Apache License 2.0
@Override
public BytesRef toBytesRef() {
    BytesRefBuilder builder = new BytesRefBuilder();
    builder.grow(length());
    BytesRef spare;
    BytesRefIterator iterator = iterator();
    try {
        while ((spare = iterator.next()) != null) {
            builder.append(spare);
        }
    } catch (IOException ex) {
        throw new AssertionError("won't happen", ex); // really an error: our BytesReference implementations never do actual IO
    }
    return builder.toBytesRef();
}
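
Here grow(length()) is purely a pre-sizing optimization: the appends would grow the builder on demand anyway, but reserving the full length up front makes them copy-only. The same pattern in isolation:

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

BytesRefBuilder builder = new BytesRefBuilder();
BytesRef a = new BytesRef("foo");
BytesRef b = new BytesRef("bar");
builder.grow(a.length + b.length); // one allocation up front
builder.append(a);                 // append() advances length(); no resizing needed now
builder.append(b);
assert builder.get().utf8ToString().equals("foobar");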
 
Example 6
Source File: UTF8TaxonomyWriterCache.java    From lucene-solr with Apache License 2.0
private BytesRef toBytes(FacetLabel label) {
  BytesRefBuilder bytes = this.bytes.get();
  bytes.clear();
  for (int i = 0; i < label.length; i++) {
    String part = label.components[i];
    if (i > 0) {
      bytes.append(DELIM_CHAR);
    }
    bytes.grow(bytes.length() + UnicodeUtil.maxUTF8Length(part.length()));
    bytes.setLength(UnicodeUtil.UTF16toUTF8(part, 0, part.length(), bytes.bytes(), bytes.length()));
  }
  return bytes.get();
}
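
Before each part is encoded, the builder grows by UnicodeUtil.maxUTF8Length(part.length()), the UTF-8 worst case for that many UTF-16 units, and UTF16toUTF8() returns the new end offset, which becomes the new length. A minimal sketch of the same encode step for a single string (toUtf8 is an illustrative helper, not from the original source; for one string, BytesRefBuilder#copyChars does the equivalent in a single call):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.UnicodeUtil;

static BytesRef toUtf8(String s, BytesRefBuilder scratch) {
  scratch.clear();
  scratch.grow(UnicodeUtil.maxUTF8Length(s.length())); // reserve the worst case for s
  scratch.setLength(UnicodeUtil.UTF16toUTF8(s, 0, s.length(), scratch.bytes(), 0));
  return scratch.get();
}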
 
Example 7
Source File: Dictionary.java    From lucene-solr with Apache License 2.0
static void encodeFlags(BytesRefBuilder b, char flags[]) {
  int len = flags.length << 1;
  b.grow(len);
  b.clear();
  for (int i = 0; i < flags.length; i++) {
    int flag = flags[i];
    b.append((byte) ((flag >> 8) & 0xff));
    b.append((byte) (flag & 0xff));
  }
}
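
encodeFlags() packs each 16-bit flag big-endian into two bytes; grow(len) reserves the full encoding before clear() resets the length (capacity survives a clear). A hedged sketch of the inverse, for illustration only (the real Dictionary has its own decoder):

import org.apache.lucene.util.BytesRef;

static char[] decodeFlagsSketch(BytesRef b) {
  char[] flags = new char[b.length >>> 1];
  for (int i = 0; i < flags.length; i++) {
    int hi = b.bytes[b.offset + (i << 1)] & 0xff;     // high byte first
    int lo = b.bytes[b.offset + (i << 1) + 1] & 0xff; // then low byte
    flags[i] = (char) ((hi << 8) | lo);
  }
  return flags;
}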
 
Example 8
Source File: EnumFieldType.java    From lucene-solr with Apache License 2.0
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  final String s = val.toString();
  if (s == null)
    return;

  result.grow(Integer.BYTES);
  result.setLength(Integer.BYTES);
  final Integer intValue = enumMapping.stringValueToIntValue(s);
  NumericUtils.intToSortableBytes(intValue, result.bytes(), 0);
}
 
Example 9
Source File: EnumFieldType.java    From lucene-solr with Apache License 2.0
@Override
public String storedToIndexed(IndexableField f) {
  final Number val = f.numericValue();
  if (val == null)
    return null;
  final BytesRefBuilder bytes = new BytesRefBuilder();
  bytes.grow(Integer.BYTES);
  bytes.setLength(Integer.BYTES);
  NumericUtils.intToSortableBytes(val.intValue(), bytes.bytes(), 0);
  return bytes.get().utf8ToString();
}
 
Example 10
Source File: DatePointField.java    From lucene-solr with Apache License 2.0
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  Date date = (Date) toNativeType(val.toString());
  result.grow(Long.BYTES);
  result.setLength(Long.BYTES);
  LongPoint.encodeDimension(date.getTime(), result.bytes(), 0);
}
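
Examples 8–10 (and the point-field examples further down) all follow one fixed-width recipe: grow(N) reserves the encoded width, setLength(N) exposes it, and the encoder writes in place at offset 0. A hedged round trip under the same assumptions:

import org.apache.lucene.document.IntPoint;
import org.apache.lucene.util.BytesRefBuilder;

BytesRefBuilder result = new BytesRefBuilder();
result.grow(Integer.BYTES);      // reserve the 4 encoded bytes
result.setLength(Integer.BYTES); // grow() never touches length, so set it explicitly
IntPoint.encodeDimension(42, result.bytes(), 0);
assert IntPoint.decodeDimension(result.bytes(), 0) == 42; // sortable big-endian form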
 
Example 11
Source File: SimpleMLTQParser.java    From lucene-solr with Apache License 2.0
private Term createNumericTerm(String field, String uniqueValue) {
  BytesRefBuilder bytesRefBuilder = new BytesRefBuilder();
  bytesRefBuilder.grow(LegacyNumericUtils.BUF_SIZE_INT);
  LegacyNumericUtils.intToPrefixCoded(Integer.parseInt(uniqueValue), 0, bytesRefBuilder);
  return new Term(field, bytesRefBuilder);
}
 
Example 12
Source File: DoublePointField.java    From lucene-solr with Apache License 2.0
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  result.grow(Double.BYTES);
  result.setLength(Double.BYTES);
  DoublePoint.encodeDimension(parseDoubleFromUser(null, val.toString()), result.bytes(), 0);
}
 
Example 13
Source File: LongPointField.java    From lucene-solr with Apache License 2.0
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  result.grow(Long.BYTES);
  result.setLength(Long.BYTES);
  LongPoint.encodeDimension(parseLongFromUser(null, val.toString()), result.bytes(), 0);
}
 
Example 14
Source File: IntPointField.java    From lucene-solr with Apache License 2.0
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  result.grow(Integer.BYTES);
  result.setLength(Integer.BYTES);
  IntPoint.encodeDimension(parseIntFromUser(null, val.toString()), result.bytes(), 0);
}
 
Example 15
Source File: FloatPointField.java    From lucene-solr with Apache License 2.0
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  result.grow(Float.BYTES);
  result.setLength(Float.BYTES);
  FloatPoint.encodeDimension(parseFloatFromUser(null, val.toString()), result.bytes(), 0);
}
 
Example 16
Source File: CloudMLTQParser.java    From lucene-solr with Apache License 2.0
private Term createNumericTerm(String field, String uniqueValue) {
  BytesRefBuilder bytesRefBuilder = new BytesRefBuilder();
  bytesRefBuilder.grow(LegacyNumericUtils.BUF_SIZE_INT);
  LegacyNumericUtils.intToPrefixCoded(Integer.parseInt(uniqueValue), 0, bytesRefBuilder);
  return new Term(field, bytesRefBuilder.toBytesRef());
}
 
Example 17
Source File: TestLucene80DocValuesFormat.java    From lucene-solr with Apache License 2.0
@Nightly
public void testSortedSetAroundBlockSize() throws IOException {
  final int frontier = 1 << Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
  for (int maxDoc = frontier - 1; maxDoc <= frontier + 1; ++maxDoc) {
    final Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    Document doc = new Document();
    SortedSetDocValuesField field1 = new SortedSetDocValuesField("sset", new BytesRef());
    doc.add(field1);
    SortedSetDocValuesField field2 = new SortedSetDocValuesField("sset", new BytesRef());
    doc.add(field2);
    for (int i = 0; i < maxDoc; ++i) {
      BytesRef s1 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
      BytesRef s2 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
      field1.setBytesValue(s1);
      field2.setBytesValue(s2);
      w.addDocument(doc);
      Set<BytesRef> set = new TreeSet<>(Arrays.asList(s1, s2));
      out.writeVInt(set.size());
      for (BytesRef ref : set) {
        out.writeVInt(ref.length);
        out.writeBytes(ref.bytes, ref.offset, ref.length);
      }
    }

    w.forceMerge(1);
    DirectoryReader r = DirectoryReader.open(w);
    w.close();
    LeafReader sr = getOnlyLeafReader(r);
    assertEquals(maxDoc, sr.maxDoc());
    SortedSetDocValues values = sr.getSortedSetDocValues("sset");
    assertNotNull(values);
    ByteBuffersDataInput in = out.toDataInput();
    BytesRefBuilder b = new BytesRefBuilder();
    for (int i = 0; i < maxDoc; ++i) {
      assertEquals(i, values.nextDoc());
      final int numValues = in.readVInt();

      for (int j = 0; j < numValues; ++j) {
        b.setLength(in.readVInt());
        b.grow(b.length());
        in.readBytes(b.bytes(), 0, b.length());
        assertEquals(b.get(), values.lookupOrd(values.nextOrd()));
      }

      assertEquals(SortedSetDocValues.NO_MORE_ORDS, values.nextOrd());
    }
    r.close();
    dir.close();
  }
}
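
The read loop above leans on the same division of labor: setLength(in.readVInt()) records the size without allocating, so the grow(b.length()) that follows is what actually guarantees bytes() can hold the value before readBytes() fills it. A minimal standalone round trip (illustrative, not from the test):

import java.io.IOException;
import org.apache.lucene.store.ByteBuffersDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

static void lengthPrefixedRoundTrip() throws IOException {
  ByteBuffersDataOutput out = new ByteBuffersDataOutput();
  BytesRef value = new BytesRef("hello");
  out.writeVInt(value.length);
  out.writeBytes(value.bytes, value.offset, value.length);

  ByteBuffersDataInput in = out.toDataInput();
  BytesRefBuilder b = new BytesRefBuilder();
  b.setLength(in.readVInt()); // size only; no allocation yet
  b.grow(b.length());         // now make sure the backing array fits
  in.readBytes(b.bytes(), 0, b.length());
  assert b.get().equals(value);
}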
 
Example 18
Source File: SynonymMap.java    From lucene-solr with Apache License 2.0
/**
 * Builds a {@link SynonymMap} and returns it.
 */
public SynonymMap build() throws IOException {
  ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
  // TODO: are we using the best sharing options?
  FSTCompiler<BytesRef> fstCompiler =
    new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
  
  BytesRefBuilder scratch = new BytesRefBuilder();
  ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

  final Set<Integer> dedupSet;

  if (dedup) {
    dedupSet = new HashSet<>();
  } else {
    dedupSet = null;
  }

  final byte[] spare = new byte[5];
  
  Set<CharsRef> keys = workingSet.keySet();
  CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
  Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());

  final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
  
  //System.out.println("fmap.build");
  for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
    CharsRef input = sortedKeys[keyIdx];
    MapEntry output = workingSet.get(input);

    int numEntries = output.ords.size();
    // output size, assume the worst case
    int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
    
    scratch.grow(estimatedSize);
    scratchOutput.reset(scratch.bytes());

    // now write our output data:
    int count = 0;
    for (int i = 0; i < numEntries; i++) {
      if (dedupSet != null) {
        // box once
        final Integer ent = output.ords.get(i);
        if (dedupSet.contains(ent)) {
          continue;
        }
        dedupSet.add(ent);
      }
      scratchOutput.writeVInt(output.ords.get(i));   
      count++;
    }

    final int pos = scratchOutput.getPosition();
    scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
    final int pos2 = scratchOutput.getPosition();
    final int vIntLen = pos2-pos;

    // Move the count + includeOrig to the front of the byte[]:
    System.arraycopy(scratch.bytes(), pos, spare, 0, vIntLen);
    System.arraycopy(scratch.bytes(), 0, scratch.bytes(), vIntLen, pos);
    System.arraycopy(spare, 0, scratch.bytes(), 0, vIntLen);

    if (dedupSet != null) {
      dedupSet.clear();
    }
    
    scratch.setLength(scratchOutput.getPosition());
    //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
    fstCompiler.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
  }
  
  FST<BytesRef> fst = fstCompiler.compile();
  return new SynonymMap(fst, words, maxHorizontalContext);
}
 
Example 19
Source File: SimpleTextFieldsReader.java    From lucene-solr with Apache License 2.0
private void loadTerms() throws IOException {
  PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton();
  final FSTCompiler<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstCompiler;
  final PairOutputs<Long,Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
  final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
      outputsInner);
  fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
  IndexInput in = SimpleTextFieldsReader.this.in.clone();
  in.seek(termsStart);
  final BytesRefBuilder lastTerm = new BytesRefBuilder();
  long lastDocsStart = -1;
  int docFreq = 0;
  long totalTermFreq = 0;
  FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
  final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
  while(true) {
    SimpleTextUtil.readLine(in, scratch);
    if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
      if (lastDocsStart != -1) {
        fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
            outputs.newPair(lastDocsStart,
                outputsInner.newPair((long) docFreq, totalTermFreq)));
        sumTotalTermFreq += totalTermFreq;
      }
      break;
    } else if (StringHelper.startsWith(scratch.get(), DOC)) {
      docFreq++;
      sumDocFreq++;
      totalTermFreq++;
      scratchUTF16.copyUTF8Bytes(scratch.bytes(), DOC.length, scratch.length()-DOC.length);
      int docID = ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length());
      visitedDocs.set(docID);
    } else if (StringHelper.startsWith(scratch.get(), FREQ)) {
      scratchUTF16.copyUTF8Bytes(scratch.bytes(), FREQ.length, scratch.length()-FREQ.length);
      totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1;
    } else if (StringHelper.startsWith(scratch.get(), TERM)) {
      if (lastDocsStart != -1) {
        fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
            outputsInner.newPair((long) docFreq, totalTermFreq)));
      }
      lastDocsStart = in.getFilePointer();
      final int len = scratch.length() - TERM.length;
      lastTerm.grow(len);
      System.arraycopy(scratch.bytes(), TERM.length, lastTerm.bytes(), 0, len);
      lastTerm.setLength(len);
      docFreq = 0;
      sumTotalTermFreq += totalTermFreq;
      totalTermFreq = 0;
      termCount++;
    }
  }
  docCount = visitedDocs.cardinality();
  fst = fstCompiler.compile();
  /*
  PrintStream ps = new PrintStream("out.dot");
  fst.toDot(ps);
  ps.close();
  System.out.println("SAVED out.dot");
  */
  //System.out.println("FST " + fst.sizeInBytes());
}
 
Example 20
Source File: TermQueryPrefixTreeStrategy.java    From lucene-solr with Apache License 2.0
@Override
public Query makeQuery(SpatialArgs args) {
  final SpatialOperation op = args.getOperation();
  if (op != SpatialOperation.Intersects)
    throw new UnsupportedSpatialOperation(op);

  Shape shape = args.getShape();
  int detailLevel = grid.getLevelForDistance(args.resolveDistErr(ctx, distErrPct));

  //--get a List of BytesRef for each term we want (no parents, no leaf bytes)
  final int GUESS_NUM_TERMS;
  if (shape instanceof Point)
    GUESS_NUM_TERMS = detailLevel;//perfect guess
  else
    GUESS_NUM_TERMS = 4096;//should this be a method on SpatialPrefixTree?

  BytesRefBuilder masterBytes = new BytesRefBuilder();//shared byte array for all terms
  List<BytesRef> terms = new ArrayList<>(GUESS_NUM_TERMS);

  CellIterator cells = grid.getTreeCellIterator(shape, detailLevel);
  while (cells.hasNext()) {
    Cell cell = cells.next();
    if (!cell.isLeaf())
      continue;
    BytesRef term = cell.getTokenBytesNoLeaf(null);//null because we want a new BytesRef
    //We copy out the bytes because it may be re-used across the iteration. This also gives us the opportunity
    // to use one contiguous block of memory for the bytes of all terms we need.
    masterBytes.grow(masterBytes.length() + term.length);
    masterBytes.append(term);
    term.bytes = null;//don't need; will reset later
    term.offset = masterBytes.length() - term.length;
    terms.add(term);
  }
  //do this only now: an earlier grow() call could have replaced the underlying byte array
  for (BytesRef byteRef : terms) {
    byteRef.bytes = masterBytes.bytes();
  }
  //unfortunately TermsQuery will needlessly sort & dedupe
  //TODO an automatonQuery might be faster?
  return new TermInSetQuery(getFieldName(), terms);
}