Java Code Examples for org.apache.lucene.util.BytesRefBuilder#setLength()

The following examples show how to use org.apache.lucene.util.BytesRefBuilder#setLength(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: SimpleTextUtil.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Copies bytes from {@code in} into {@code scratch} until an unescaped
 * {@code NEWLINE} byte is read. The byte that follows an {@code ESCAPE} byte is
 * stored verbatim (the {@code ESCAPE} byte itself is dropped), and the
 * terminating {@code NEWLINE} is not stored.
 *
 * @param in source of bytes
 * @param scratch receives the line; its length is set to the number of bytes stored
 * @throws IOException if reading from {@code in} fails
 */
public static void readLine(DataInput in, BytesRefBuilder scratch) throws IOException {
  int written = 0;
  while (true) {
    final byte current = in.readByte();
    // make room for one more byte before deciding what to do with it
    scratch.grow(written + 1);
    if (current == ESCAPE) {
      // the escaped byte is taken literally, even if it equals NEWLINE
      scratch.setByteAt(written++, in.readByte());
      continue;
    }
    if (current == NEWLINE) {
      break;
    }
    scratch.setByteAt(written++, current);
  }
  scratch.setLength(written);
}

Example 2

Source File: LegacyNumericUtils.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Writes the prefix-coded form of {@code val} into {@code bytes} after dropping
 * its lowest {@code shift} bits.
 * This method is used by {@link org.apache.solr.legacy.LegacyNumericTokenStream}.
 * Byte 0 holds {@code SHIFT_START_LONG + shift}; the following bytes each hold
 * 7 bits of the sign-flipped, shifted value, most significant group first.
 * After encoding, {@code bytes.offset} will always be 0.
 * @param val the numeric value
 * @param shift how many bits to strip from the right
 * @param bytes will contain the encoded value
 * @throws IllegalArgumentException if {@code shift} is not in 0..63
 */
public static void longToPrefixCoded(final long val, final int shift, final BytesRefBuilder bytes) {
  // any bit above the low six means shift is outside 0..63
  if ((shift & ~0x3f) != 0) {
    throw new IllegalArgumentException("Illegal shift value, must be 0..63; got shift=" + shift);
  }
  // (63-shift)/7 + 1 payload bytes; i/7 is the same as (i*37)>>8 for i in 0..63
  final int payloadLen = (((63 - shift) * 37) >> 8) + 1;
  // one extra for the byte that contains the shift info
  bytes.setLength(payloadLen + 1);
  bytes.grow(BUF_SIZE_LONG);
  bytes.setByteAt(0, (byte) (SHIFT_START_LONG + shift));
  // flip the sign bit, then drop the low bits
  long remaining = (val ^ 0x8000000000000000L) >>> shift;
  // Fill from the last position backwards, 7 bits per byte for compatibility
  // with UTF-8 encoding of terms
  for (int pos = payloadLen; pos > 0; pos--) {
    bytes.setByteAt(pos, (byte) (remaining & 0x7f));
    remaining >>>= 7;
  }
}

Example 3

Source File: LegacyNumericUtils.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Writes the prefix-coded form of {@code val} into {@code bytes} after dropping
 * its lowest {@code shift} bits.
 * This method is used by {@link org.apache.solr.legacy.LegacyNumericTokenStream}.
 * Byte 0 holds {@code SHIFT_START_INT + shift}; the following bytes each hold
 * 7 bits of the sign-flipped, shifted value, most significant group first.
 * After encoding, {@code bytes.offset} will always be 0.
 * @param val the numeric value
 * @param shift how many bits to strip from the right
 * @param bytes will contain the encoded value
 * @throws IllegalArgumentException if {@code shift} is not in 0..31
 */
public static void intToPrefixCoded(final int val, final int shift, final BytesRefBuilder bytes) {
  // any bit above the low five means shift is outside 0..31
  if ((shift & ~0x1f) != 0) {
    throw new IllegalArgumentException("Illegal shift value, must be 0..31; got shift=" + shift);
  }
  // (31-shift)/7 + 1 payload bytes; i/7 is the same as (i*37)>>8 for i in 0..63
  final int payloadLen = (((31 - shift) * 37) >> 8) + 1;
  // one extra for the byte that contains the shift info
  bytes.setLength(payloadLen + 1);
  bytes.grow(LegacyNumericUtils.BUF_SIZE_LONG);  // use the max
  bytes.setByteAt(0, (byte) (SHIFT_START_INT + shift));
  // flip the sign bit, then drop the low bits
  int remaining = (val ^ 0x80000000) >>> shift;
  // Fill from the last position backwards, 7 bits per byte for compatibility
  // with UTF-8 encoding of terms
  for (int pos = payloadLen; pos > 0; pos--) {
    bytes.setByteAt(pos, (byte) (remaining & 0x7f));
    remaining >>>= 7;
  }
}

Example 4

Source File: UTF8TaxonomyWriterCache.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Encodes {@code label} as UTF-8 into the builder obtained from {@code this.bytes},
 * placing {@code DELIM_CHAR} between consecutive components.
 *
 * @param label the label whose first {@code label.length} components are encoded
 * @return the builder's current {@link BytesRef}
 */
private BytesRef toBytes(FacetLabel label) {
  final BytesRefBuilder builder = this.bytes.get();
  builder.clear();
  for (int idx = 0; idx < label.length; idx++) {
    final String component = label.components[idx];
    if (idx != 0) {
      builder.append(DELIM_CHAR);
    }
    // reserve the worst-case UTF-8 size before encoding straight into the backing array
    final int worstCase = UnicodeUtil.maxUTF8Length(component.length());
    builder.grow(builder.length() + worstCase);
    // UTF16toUTF8 writes at the current end; its return value is used as the new length
    final int newLength =
        UnicodeUtil.UTF16toUTF8(component, 0, component.length(), builder.bytes(), builder.length());
    builder.setLength(newLength);
  }
  return builder.get();
}

Example 5

Source File: EnumFieldType.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Looks up the int value for the string form of {@code val} via {@code enumMapping}
 * and writes it into {@code result} as {@link Integer#BYTES} sortable bytes.
 * Does nothing when {@code val.toString()} yields {@code null}.
 */
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  final String text = val.toString();
  if (text != null) {
    final int width = Integer.BYTES;
    result.grow(width);
    result.setLength(width);
    final Integer mapped = enumMapping.stringValueToIntValue(text);
    NumericUtils.intToSortableBytes(mapped, result.bytes(), 0);
  }
}

Example 6

Source File: EnumFieldType.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Converts the numeric value of {@code f} into its indexed string form: the int
 * value is written as {@link Integer#BYTES} sortable bytes and those bytes are
 * returned via {@code utf8ToString()}.
 *
 * @return the indexed form, or {@code null} when {@code f} has no numeric value
 */
@Override
public String storedToIndexed(IndexableField f) {
  final Number number = f.numericValue();
  if (number != null) {
    final BytesRefBuilder encoded = new BytesRefBuilder();
    encoded.grow(Integer.BYTES);
    encoded.setLength(Integer.BYTES);
    final int intValue = number.intValue();
    NumericUtils.intToSortableBytes(intValue, encoded.bytes(), 0);
    return encoded.get().utf8ToString();
  }
  return null;
}

Example 7

Source File: DatePointField.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Converts {@code val} to a {@link Date} through {@code toNativeType} and writes
 * its {@code getTime()} value into {@code result} as a {@link Long#BYTES}-wide
 * {@code LongPoint} dimension.
 */
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  final Date parsed = (Date) toNativeType(val.toString());
  final int width = Long.BYTES;
  result.grow(width);
  result.setLength(width);
  final long timestamp = parsed.getTime();
  LongPoint.encodeDimension(timestamp, result.bytes(), 0);
}

Example 8

Source File: SimpleTextFieldsReader.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Reads lines from a clone of {@code in}, starting at {@code termsStart}, and
 * builds {@code fst}: one entry per TERM line, mapping the term bytes to
 * (file pointer right after the TERM line, (docFreq, totalTermFreq)).
 * While scanning it also updates {@code sumDocFreq}, {@code sumTotalTermFreq},
 * {@code termCount} and finally {@code docCount}.
 * Scanning stops at the first line equal to END or starting with FIELD.
 *
 * @throws IOException if reading a line or adding to / compiling the FST fails
 */
private void loadTerms() throws IOException {
  PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton();
  final FSTCompiler<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstCompiler;
  // inner pair carries (docFreq, totalTermFreq); outer pair prepends lastDocsStart
  final PairOutputs<Long,Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
  final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
      outputsInner);
  fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
  // work on a clone so the seek below does not move the outer reader's input
  IndexInput in = SimpleTextFieldsReader.this.in.clone();
  in.seek(termsStart);
  // state of the term currently being accumulated; -1 means no TERM line seen yet
  final BytesRefBuilder lastTerm = new BytesRefBuilder();
  long lastDocsStart = -1;
  int docFreq = 0;
  long totalTermFreq = 0;
  FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
  final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
  while(true) {
    SimpleTextUtil.readLine(in, scratch);
    if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
      // end of this terms section: flush the pending term, if any, then stop
      if (lastDocsStart != -1) {
        fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
            outputs.newPair(lastDocsStart,
                outputsInner.newPair((long) docFreq, totalTermFreq)));
        sumTotalTermFreq += totalTermFreq;
      }
      break;
    } else if (StringHelper.startsWith(scratch.get(), DOC)) {
      // DOC line: counts one document and one occurrence for the pending term
      docFreq++;
      sumDocFreq++;
      totalTermFreq++;
      // the text after the DOC prefix is parsed as the doc id
      scratchUTF16.copyUTF8Bytes(scratch.bytes(), DOC.length, scratch.length()-DOC.length);
      int docID = ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length());
      visitedDocs.set(docID);
    } else if (StringHelper.startsWith(scratch.get(), FREQ)) {
      // FREQ line: add (freq - 1); presumably the -1 offsets the increment already
      // made on the DOC line -- confirm against the file format
      scratchUTF16.copyUTF8Bytes(scratch.bytes(), FREQ.length, scratch.length()-FREQ.length);
      totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1;
    } else if (StringHelper.startsWith(scratch.get(), TERM)) {
      // TERM line: flush the previous term (if any) before starting the new one
      if (lastDocsStart != -1) {
        fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
            outputsInner.newPair((long) docFreq, totalTermFreq)));
      }
      // remember where the input stands right after the TERM line
      lastDocsStart = in.getFilePointer();
      // copy the bytes after the TERM prefix into lastTerm
      final int len = scratch.length() - TERM.length;
      lastTerm.grow(len);
      System.arraycopy(scratch.bytes(), TERM.length, lastTerm.bytes(), 0, len);
      lastTerm.setLength(len);
      // reset per-term counters; totalTermFreq is still 0 here if this is the first term
      docFreq = 0;
      sumTotalTermFreq += totalTermFreq;
      totalTermFreq = 0;
      termCount++;
    }
  }
  // number of distinct doc ids seen on DOC lines
  docCount = visitedDocs.cardinality();
  fst = fstCompiler.compile();
  /*
  PrintStream ps = new PrintStream("out.dot");
  fst.toDot(ps);
  ps.close();
  System.out.println("SAVED out.dot");
  */
  //System.out.println("FST " + fst.sizeInBytes());
}

Example 9

Source File: SynonymMap.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Builds a {@link SynonymMap} from the entries in {@code workingSet} and returns it.
 * Keys are added to the FST in the order given by
 * {@code CharsRef.getUTF16SortedAsUTF8Comparator()}. Each key's output is a vInt
 * header ({@code count << 1}, with the low bit set when the entry's
 * {@code includeOrig} is false) followed by one vInt per ord.
 *
 * @return the compiled synonym map
 * @throws IOException if adding to or compiling the FST fails
 */
public SynonymMap build() throws IOException {
  final ByteSequenceOutputs fstOutputs = ByteSequenceOutputs.getSingleton();
  // TODO: are we using the best sharing options?
  final FSTCompiler<BytesRef> compiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, fstOutputs);

  final BytesRefBuilder buffer = new BytesRefBuilder();
  final ByteArrayDataOutput writer = new ByteArrayDataOutput();

  // only allocated when de-duplication of ords is requested
  final Set<Integer> seenOrds = dedup ? new HashSet<>() : null;

  // temporary home for the header vInt while the ords are shifted right
  final byte[] headerTmp = new byte[5];

  final Set<CharsRef> keySet = workingSet.keySet();
  final CharsRef[] orderedKeys = keySet.toArray(new CharsRef[keySet.size()]);
  Arrays.sort(orderedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());

  final IntsRefBuilder intsScratch = new IntsRefBuilder();

  for (final CharsRef input : orderedKeys) {
    final MapEntry entry = workingSet.get(input);
    final int ordCount = entry.ords.size();

    // worst case: 5 bytes for the header plus 5 bytes for each ord
    buffer.grow(5 + ordCount * 5);
    writer.reset(buffer.bytes());

    // write the ords first; the header is written after them and moved to the front
    int written = 0;
    for (int i = 0; i < ordCount; i++) {
      if (seenOrds != null) {
        // box once
        final Integer ord = entry.ords.get(i);
        if (!seenOrds.add(ord)) {
          // already emitted for this key
          continue;
        }
      }
      writer.writeVInt(entry.ords.get(i));
      written++;
    }

    final int ordsEnd = writer.getPosition();
    final int dropOrigBit = entry.includeOrig ? 0 : 1;
    writer.writeVInt((written << 1) | dropOrigBit);
    final int totalEnd = writer.getPosition();
    final int headerLen = totalEnd - ordsEnd;

    // Move the count + includeOrig to the front of the byte[]:
    final byte[] raw = buffer.bytes();
    System.arraycopy(raw, ordsEnd, headerTmp, 0, headerLen);
    System.arraycopy(raw, 0, raw, headerLen, ordsEnd);
    System.arraycopy(headerTmp, 0, raw, 0, headerLen);

    if (seenOrds != null) {
      seenOrds.clear();
    }

    buffer.setLength(totalEnd);
    compiler.add(Util.toUTF32(input, intsScratch), buffer.toBytesRef());
  }

  final FST<BytesRef> compiled = compiler.compile();
  return new SynonymMap(compiled, words, maxHorizontalContext);
}

Example 10

Source File: TestLucene80DocValuesFormat.java From lucene-solr with Apache License 2.0

4 votes

/**
 * For document counts of frontier-1, frontier and frontier+1, where frontier is
 * {@code 1 << Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT}, indexes two
 * random "sset" values per document, force-merges to one segment and checks that
 * the sorted-set doc values read back match the values recorded while indexing.
 */
@Nightly
public void testSortedSetAroundBlockSize() throws IOException {
  final int frontier = 1 << Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
  for (int maxDoc = frontier - 1; maxDoc <= frontier + 1; ++maxDoc) {
    final Directory directory = newDirectory();
    final IndexWriter writer =
        new IndexWriter(directory, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
    // expected values are serialized here as: count, then (length, bytes) per value
    final ByteBuffersDataOutput expectedOut = new ByteBuffersDataOutput();

    // a single document instance is reused; only the field values change per iteration
    final Document doc = new Document();
    final SortedSetDocValuesField firstField = new SortedSetDocValuesField("sset", new BytesRef());
    doc.add(firstField);
    final SortedSetDocValuesField secondField = new SortedSetDocValuesField("sset", new BytesRef());
    doc.add(secondField);

    for (int docId = 0; docId < maxDoc; ++docId) {
      final BytesRef first = new BytesRef(TestUtil.randomSimpleString(random(), 2));
      final BytesRef second = new BytesRef(TestUtil.randomSimpleString(random(), 2));
      firstField.setBytesValue(first);
      secondField.setBytesValue(second);
      writer.addDocument(doc);

      // sorted and de-duplicated view of the two values
      final Set<BytesRef> expected = new TreeSet<>();
      expected.add(first);
      expected.add(second);
      expectedOut.writeVInt(expected.size());
      for (final BytesRef value : expected) {
        expectedOut.writeVInt(value.length);
        expectedOut.writeBytes(value.bytes, value.offset, value.length);
      }
    }

    writer.forceMerge(1);
    final DirectoryReader reader = DirectoryReader.open(writer);
    writer.close();
    final LeafReader leaf = getOnlyLeafReader(reader);
    assertEquals(maxDoc, leaf.maxDoc());
    final SortedSetDocValues values = leaf.getSortedSetDocValues("sset");
    assertNotNull(values);

    // replay the recorded expectations against the doc values
    final ByteBuffersDataInput expectedIn = expectedOut.toDataInput();
    final BytesRefBuilder scratch = new BytesRefBuilder();
    for (int docId = 0; docId < maxDoc; ++docId) {
      assertEquals(docId, values.nextDoc());
      final int valueCount = expectedIn.readVInt();

      for (int ord = 0; ord < valueCount; ++ord) {
        scratch.setLength(expectedIn.readVInt());
        scratch.grow(scratch.length());
        expectedIn.readBytes(scratch.bytes(), 0, scratch.length());
        assertEquals(scratch.get(), values.lookupOrd(values.nextOrd()));
      }

      assertEquals(SortedSetDocValues.NO_MORE_ORDS, values.nextOrd());
    }
    reader.close();
    directory.close();
  }
}

Example 11

Source File: FloatPointField.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Sizes {@code result} to {@link Float#BYTES} and fills it with the
 * {@code FloatPoint} dimension encoding of the value that
 * {@code parseFloatFromUser} returns for {@code val}.
 */
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  final int width = Float.BYTES;
  result.grow(width);
  result.setLength(width);
  final float parsed = parseFloatFromUser(null, val.toString());
  FloatPoint.encodeDimension(parsed, result.bytes(), 0);
}

Example 12

Source File: IntPointField.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Sizes {@code result} to {@link Integer#BYTES} and fills it with the
 * {@code IntPoint} dimension encoding of the value that
 * {@code parseIntFromUser} returns for {@code val}.
 */
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  final int width = Integer.BYTES;
  result.grow(width);
  result.setLength(width);
  final int parsed = parseIntFromUser(null, val.toString());
  IntPoint.encodeDimension(parsed, result.bytes(), 0);
}

Example 13

Source File: LongPointField.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Sizes {@code result} to {@link Long#BYTES} and fills it with the
 * {@code LongPoint} dimension encoding of the value that
 * {@code parseLongFromUser} returns for {@code val}.
 */
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  final int width = Long.BYTES;
  result.grow(width);
  result.setLength(width);
  final long parsed = parseLongFromUser(null, val.toString());
  LongPoint.encodeDimension(parsed, result.bytes(), 0);
}

Example 14

Source File: DoublePointField.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Sizes {@code result} to {@link Double#BYTES} and fills it with the
 * {@code DoublePoint} dimension encoding of the value that
 * {@code parseDoubleFromUser} returns for {@code val}.
 */
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  final int width = Double.BYTES;
  result.grow(width);
  result.setLength(width);
  final double parsed = parseDoubleFromUser(null, val.toString());
  DoublePoint.encodeDimension(parsed, result.bytes(), 0);
}