org.apache.lucene.store.ByteArrayDataOutput Java Examples

The following examples show how to use org.apache.lucene.store.ByteArrayDataOutput. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: BinaryFieldMapper.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Packs every collected byte array into a single binary value.
 *
 * <p>The list is sorted and de-duplicated first. The output starts with the number of
 * values as a vInt, followed by each value as a vInt length plus its raw bytes.
 *
 * @return a {@link BytesRef} over the filled prefix of the scratch array
 * @throws ElasticsearchException wrapping any {@link IOException} raised while writing
 */
@Override
public BytesRef binaryValue() {
    try {
        CollectionUtils.sortAndDedup(bytesList);
        final int count = bytesList.size();
        // Reserve 5 bytes for each of the (count + 1) vInts on top of the payload bytes.
        final byte[] encoded = new byte[totalSize + (count + 1) * 5];
        final ByteArrayDataOutput writer = new ByteArrayDataOutput(encoded);
        writer.writeVInt(count);
        int index = 0;
        while (index < count) {
            final byte[] entry = bytesList.get(index);
            writer.writeVInt(entry.length);
            writer.writeBytes(entry, 0, entry.length);
            index++;
        }
        return new BytesRef(encoded, 0, writer.getPosition());
    } catch (IOException e) {
        throw new ElasticsearchException("Failed to get binary value", e);
    }
}

Example #2

Source File: NumberFieldMapper.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Encodes the sorted, de-duplicated values as a compact delta sequence.
 *
 * <p>The first value is zig-zag encoded and written as a vLong; every following value is
 * written as the vLong of its difference from the previous value.
 *
 * @return a {@link BytesRef} over the bytes actually written
 */
@Override
public BytesRef binaryValue() {
    CollectionUtils.sortAndDedup(values);

    final int count = values.size();
    final byte[] encoded = new byte[count * ByteUtils.MAX_BYTES_VLONG];
    final ByteArrayDataOutput sink = new ByteArrayDataOutput(encoded);

    // Zig-zag the head so that a negative first value still yields a short vLong.
    long previous = values.get(0);
    ByteUtils.writeVLong(sink, ByteUtils.zigZagEncode(previous));

    // Everything after the head is stored as the gap to its predecessor.
    for (int i = 1; i < count; ++i) {
        final long current = values.get(i);
        ByteUtils.writeVLong(sink, current - previous);
        previous = current;
    }
    return new BytesRef(encoded, 0, sink.getPosition());
}

Example #3

Source File: SortedInputIterator.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Writes every entry of {@code source} to a temporary file, sorts that file offline and
 * opens a reader over the sorted result.
 *
 * <p>Side effects: assigns {@code tempInput} and {@code tempSortedFileName}.
 *
 * @return a reader over the sorted temporary file
 * @throws IOException if writing, sorting or opening the temporary files fails
 */
private ByteSequencesReader sort() throws IOException {
  final OfflineSorter offlineSorter = new OfflineSorter(tempDir, tempFileNamePrefix, tieBreakByCostComparator);
  tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);

  try (OfflineSorter.ByteSequencesWriter sequenceWriter = new OfflineSorter.ByteSequencesWriter(tempInput)) {
    // This array is never reassigned here, so each encode(...) call receives the same
    // zero-length array.
    final byte[] scratch = new byte[0];
    final ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(scratch);

    for (BytesRef term = source.next(); term != null; term = source.next()) {
      encode(sequenceWriter, scratchOutput, scratch, term, source.payload(), source.contexts(), source.weight());
    }
    CodecUtil.writeFooter(tempInput);
  }

  tempSortedFileName = offlineSorter.sort(tempInput.getName());
  return new OfflineSorter.ByteSequencesReader(
      tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE),
      tempSortedFileName);
}

Example #4

Source File: TestIndexInput.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Fills the shared fixtures: {@code COUNT} random ints and longs, plus a byte array that
 * holds each int as vInt + int and each long as vLong + long, in that order.
 *
 * @throws IOException if writing into the in-memory buffer fails
 */
@BeforeClass
public static void beforeClass() throws IOException {
  final Random rnd = random();
  INTS = new int[COUNT];
  LONGS = new long[COUNT];
  // Per entry the buffer reserves 5 + 4 bytes for the int pair and 9 + 8 for the long pair.
  RANDOM_TEST_BYTES = new byte[COUNT * (5 + 4 + 9 + 8)];
  final ByteArrayDataOutput sink = new ByteArrayDataOutput(RANDOM_TEST_BYTES);

  for (int idx = 0; idx < COUNT; idx++) {
    final int intValue = rnd.nextInt();
    INTS[idx] = intValue;
    sink.writeVInt(intValue);
    sink.writeInt(intValue);

    // Occasionally produce a long whose low 32 bits are all zero.
    final long longValue = rarely()
        ? TestUtil.nextLong(rnd, 0, Integer.MAX_VALUE) << 32
        : TestUtil.nextLong(rnd, 0, Long.MAX_VALUE);
    LONGS[idx] = longValue;
    sink.writeVLong(longValue);
    sink.writeLong(longValue);
  }
}

Example #5

Source File: MinHashFieldMapper.java From elasticsearch-minhash with Apache License 2.0

6 votes

/**
 * Builds the binary representation of all collected values.
 *
 * <p>Layout: a vInt holding the number of values, then for each value (after sorting and
 * de-duplication) a vInt length followed by the value's bytes.
 *
 * @return a {@link BytesRef} covering only the bytes that were written
 * @throws ElasticsearchException wrapping any {@link IOException} raised while writing
 */
@Override
public BytesRef binaryValue() {
    try {
        CollectionUtils.sortAndDedup(bytesList);
        final int valueCount = bytesList.size();
        // Payload bytes plus 5 bytes of headroom for each of the (valueCount + 1) vInts.
        final byte[] packed = new byte[totalSize + (valueCount + 1) * 5];
        final ByteArrayDataOutput sink = new ByteArrayDataOutput(packed);
        sink.writeVInt(valueCount);
        for (int idx = 0; idx != valueCount; ++idx) {
            final byte[] item = bytesList.get(idx);
            final int itemLength = item.length;
            sink.writeVInt(itemLength);
            sink.writeBytes(item, 0, itemLength);
        }
        final int written = sink.getPosition();
        return new BytesRef(packed, 0, written);
    } catch (final IOException e) {
        throw new ElasticsearchException("Failed to get MinHash value", e);
    }
}

Example #6

Source File: WFSTCompletionLookup.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Writes one entry to {@code writer} as the surface bytes followed by the encoded weight
 * as a 4-byte int.
 *
 * <p>{@code payload} and {@code contexts} are not used by this implementation.
 *
 * @param writer   destination for the encoded record
 * @param output   reusable output, re-pointed at the working array on every call
 * @param buffer   caller-supplied scratch array; a larger array is used when it is too small
 * @param spare    the surface form to encode
 * @param payload  unused
 * @param contexts unused
 * @param weight   the entry weight, converted via {@code encodeWeight}
 * @throws IOException if writing fails
 */
@Override
protected void encode(ByteSequencesWriter writer, ByteArrayDataOutput output, byte[] buffer, BytesRef spare, BytesRef payload, Set<BytesRef> contexts, long weight) throws IOException {
  final int required = spare.length + 4;
  // Any larger array obtained here is local to this call; the caller's array is untouched.
  final byte[] target = required >= buffer.length ? ArrayUtil.grow(buffer, required) : buffer;

  output.reset(target);
  output.writeBytes(spare.bytes, spare.offset, spare.length);
  output.writeInt(encodeWeight(weight));

  final int written = output.getPosition();
  writer.write(target, 0, written);
}

Example #7

Source File: NRTSuggester.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Builds a payload consisting of the surface bytes, then the separator byte, then the
 * document id as a vInt.
 *
 * @param surface    the surface form; only the slice {@code [offset, offset + length)} of
 *                   its backing array is copied
 * @param docID      the document id, written as a vInt
 * @param payloadSep the separator, narrowed to a single byte
 * @return a {@link BytesRef} over the bytes written into a freshly allocated array
 * @throws IOException if writing into the in-memory buffer fails
 */
static BytesRef make(final BytesRef surface, int docID, int payloadSep) throws IOException {
  int len = surface.length + MAX_DOC_ID_LEN_WITH_SEP;
  byte[] buffer = new byte[len];
  ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
  // Copy exactly the referenced slice. The previous two-argument call copied from index 0
  // of the backing array and dropped `offset` bytes from the end, which corrupted the
  // surface form for any BytesRef with a non-zero offset.
  output.writeBytes(surface.bytes, surface.offset, surface.length);
  output.writeByte((byte) payloadSep);
  output.writeVInt(docID);
  return new BytesRef(buffer, 0, output.getPosition());
}

Example #8

Source File: AbstractTestCompressionMode.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Compresses a slice of {@code decompressed} and returns exactly the compressed bytes.
 *
 * @param compressor   the compressor under test
 * @param decompressed source array
 * @param off          start of the slice to compress
 * @param len          number of bytes to compress
 * @return a new array holding only the bytes the compressor wrote
 * @throws IOException if compression fails
 */
static byte[] compress(Compressor compressor, byte[] decompressed, int off, int len) throws IOException {
  // Scratch space of twice the input plus 16 bytes; the original author expected this to suffice.
  final byte[] scratch = new byte[len * 2 + 16];
  final ByteArrayDataOutput sink = new ByteArrayDataOutput(scratch);
  compressor.compress(decompressed, off, len, sink);
  return ArrayUtil.copyOfSubArray(scratch, 0, sink.getPosition());
}

Example #9

Source File: Checkpoint.java From Elasticsearch with Apache License 2.0

4 votes

/**
 * Serializes this object into a {@code BUFFER_SIZE}-byte array and writes the whole array
 * to the given channel.
 *
 * @param channel the channel that receives the serialized bytes
 * @throws IOException if serialization or the channel write fails
 */
private void write(FileChannel channel) throws IOException {
    final byte[] serialized = new byte[BUFFER_SIZE];
    final ByteArrayDataOutput sink = new ByteArrayDataOutput(serialized);
    write(sink);
    // The full array is written, not just the portion up to the output's position.
    Channels.writeToChannel(serialized, channel);
}

Example #10

Source File: FSTCompletionLookup.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Builds the completion automata from the entries supplied by {@code iterator}.
 *
 * <p>Each entry is written to a temporary file as its encoded weight (4-byte int) followed
 * by its bytes. The file is sorted offline, read back, and every entry is assigned a bucket
 * before being added to an {@code FSTCompletionBuilder}. On completion {@code count},
 * {@code higherWeightsCompletion} and {@code normalCompletion} are updated.
 *
 * @param iterator source of entries; must not carry payloads or contexts
 * @throws IllegalArgumentException if the iterator reports payloads or contexts
 * @throws IOException if any temporary-file operation fails
 */
@Override
public void build(InputIterator iterator) throws IOException {
  // Reject inputs this implementation cannot represent.
  if (iterator.hasPayloads()) {
    throw new IllegalArgumentException("this suggester doesn't support payloads");
  }
  if (iterator.hasContexts()) {
    throw new IllegalArgumentException("this suggester doesn't support contexts");
  }

  OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix);
  ExternalRefSorter externalSorter = new ExternalRefSorter(sorter);
  IndexOutput tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
  // Stays null until the sort has produced a file; the finally block passes it to cleanup as-is.
  String tempSortedFileName = null;

  OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
  OfflineSorter.ByteSequencesReader reader = null;

  // Push floats up front before sequences to sort them. For now, assume they are non-negative.
  // If negative floats are allowed some trickery needs to be done to find their byte order.
  count = 0;
  try {
    byte [] buffer = new byte [0];
    ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
    BytesRef spare;
    // Number of entries written; used below as the divisor when assigning buckets.
    int inputLineCount = 0;
    while ((spare = iterator.next()) != null) {
      // Make room for the entry bytes plus the 4-byte weight prefix.
      if (spare.length + 4 >= buffer.length) {
        buffer = ArrayUtil.grow(buffer, spare.length + 4);
      }

      // Record layout: encoded weight first, then the entry bytes.
      output.reset(buffer);
      output.writeInt(encodeWeight(iterator.weight()));
      output.writeBytes(spare.bytes, spare.offset, spare.length);
      writer.write(buffer, 0, output.getPosition());
      inputLineCount++;
    }
    CodecUtil.writeFooter(tempInput);
    writer.close();

    // We don't know the distribution of scores and we need to bucket them, so we'll sort
    // and divide into equal buckets.
    tempSortedFileName = sorter.sort(tempInput.getName());
    tempDir.deleteFile(tempInput.getName());

    FSTCompletionBuilder builder = new FSTCompletionBuilder(
        buckets, externalSorter, sharedTailLength);

    reader = new OfflineSorter.ByteSequencesReader(tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);
    long line = 0;
    int previousBucket = 0;
    int previousScore = 0;
    ByteArrayDataInput input = new ByteArrayDataInput();
    BytesRef tmp2 = new BytesRef();
    while (true) {
      BytesRef scratch = reader.next();
      if (scratch == null) {
        break;
      }
      // Read back the weight prefix written above.
      input.reset(scratch.bytes, scratch.offset, scratch.length);
      int currentScore = input.readInt();

      // Entries with the same score as the previous one reuse its bucket; otherwise the
      // bucket is derived from the entry's position in the sorted file.
      int bucket;
      if (line > 0 && currentScore == previousScore) {
        bucket = previousBucket;
      } else {
        bucket = (int) (line * buckets / inputLineCount);
      }
      previousScore = currentScore;
      previousBucket = bucket;

      // Only append the input, discard the weight.
      // NOTE(review): this arithmetic treats input.getPosition() as relative to
      // scratch.offset. If getPosition() is an absolute index into scratch.bytes, a
      // non-zero scratch.offset would be counted twice - confirm against
      // ByteArrayDataInput and the reader's BytesRef contract.
      tmp2.bytes = scratch.bytes;
      tmp2.offset = scratch.offset + input.getPosition();
      tmp2.length = scratch.length - input.getPosition();
      builder.add(tmp2, bucket);

      line++;
      count++;
    }

    // The two FSTCompletions share the same automaton.
    this.higherWeightsCompletion = builder.build();
    this.normalCompletion = new FSTCompletion(
        higherWeightsCompletion.getFST(), false, exactMatchFirst);
    
  } finally {
    // Always release the reader/writer/sorter and remove both temporary files.
    IOUtils.closeWhileHandlingException(reader, writer, externalSorter);
    IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
  }
}

Example #11

Source File: SynonymMap.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Builds an {@link SynonymMap} and returns it.
 *
 * <p>Keys of {@code workingSet} are processed in the order given by
 * {@code CharsRef.getUTF16SortedAsUTF8Comparator()}. For each key the FST output is a
 * header vInt followed by one vInt per emitted ord. The header is
 * {@code count << 1 | flag}, where {@code flag} is 0 when {@code includeOrig} is set and 1
 * otherwise. When {@code dedup} is enabled, an ord is emitted at most once per key.
 *
 * @return a new map built from the compiled FST, {@code words} and {@code maxHorizontalContext}
 * @throws IOException if writing to the scratch output or building the FST fails
 */
public SynonymMap build() throws IOException {
  ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
  // TODO: are we using the best sharing options?
  FSTCompiler<BytesRef> fstCompiler =
    new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
  
  // scratch owns the byte[]; scratchOutput is re-pointed at that array for every key.
  BytesRefBuilder scratch = new BytesRefBuilder();
  ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

  // Ords already emitted for the current key; null when de-duplication is disabled.
  final Set<Integer> dedupSet;

  if (dedup) {
    dedupSet = new HashSet<>();
  } else {
    dedupSet = null;
  }

  // Temporary home for the header vInt while the ords are shifted to make room for it.
  final byte[] spare = new byte[5];
  
  Set<CharsRef> keys = workingSet.keySet();
  CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
  Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());

  final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
  
  //System.out.println("fmap.build");
  for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
    CharsRef input = sortedKeys[keyIdx];
    MapEntry output = workingSet.get(input);

    int numEntries = output.ords.size();
    // output size, assume the worst case
    int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
    
    scratch.grow(estimatedSize);
    scratchOutput.reset(scratch.bytes());

    // now write our output data:
    // count tracks how many ords were actually written (after optional de-duplication).
    int count = 0;
    for (int i = 0; i < numEntries; i++) {
      if (dedupSet != null) {
        // box once
        final Integer ent = output.ords.get(i);
        if (dedupSet.contains(ent)) {
          continue;
        }
        dedupSet.add(ent);
      }
      scratchOutput.writeVInt(output.ords.get(i));   
      count++;
    }

    // The header is appended after the ords first, because count is only known now;
    // pos..pos2 delimit the header bytes.
    final int pos = scratchOutput.getPosition();
    scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
    final int pos2 = scratchOutput.getPosition();
    final int vIntLen = pos2-pos;

    // Move the count + includeOrig to the front of the byte[]:
    // 1) stash the header, 2) shift the ords right by the header length, 3) put the header at 0.
    System.arraycopy(scratch.bytes(), pos, spare, 0, vIntLen);
    System.arraycopy(scratch.bytes(), 0, scratch.bytes(), vIntLen, pos);
    System.arraycopy(spare, 0, scratch.bytes(), 0, vIntLen);

    // Reset per-key de-duplication state before the next key.
    if (dedupSet != null) {
      dedupSet.clear();
    }
    
    scratch.setLength(scratchOutput.getPosition());
    //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
    fstCompiler.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
  }
  
  FST<BytesRef> fst = fstCompiler.compile();
  return new SynonymMap(fst, words, maxHorizontalContext);
}

Example #12

Source File: Test2BBinaryDocValues.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Indexes {@code IndexWriter.MAX_DOCS} documents whose single binary doc-values field
 * holds {@code docIndex % 65535} as a vInt, force-merges to one segment, then reads every
 * value back and checks it.
 *
 * @throws Exception if indexing or verification fails
 */
public void testVariableBinary() throws Exception {
  final BaseDirectoryWrapper dir = newFSDirectory(createTempDir("2BVariableBinary"));
  if (dir instanceof MockDirectoryWrapper) {
    final MockDirectoryWrapper mockDir = (MockDirectoryWrapper) dir;
    mockDir.setThrottling(MockDirectoryWrapper.Throttling.NEVER);
  }

  final IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()))
      .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
      .setRAMBufferSizeMB(256.0)
      .setMergeScheduler(new ConcurrentMergeScheduler())
      .setMergePolicy(newLogMergePolicy(false, 10))
      .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
      .setCodec(TestUtil.getDefaultCodec());
  final IndexWriter w = new IndexWriter(dir, config);

  // One document instance is reused for every add; only the bytes behind `data` change.
  final byte[] bytes = new byte[4];
  final ByteArrayDataOutput encoder = new ByteArrayDataOutput(bytes);
  final BytesRef data = new BytesRef(bytes);
  final Document doc = new Document();
  doc.add(new BinaryDocValuesField("dv", data));

  for (int docIndex = 0; docIndex < IndexWriter.MAX_DOCS; docIndex++) {
    encoder.reset(bytes);
    encoder.writeVInt(docIndex % 65535); // 1, 2, or 3 bytes
    data.length = encoder.getPosition();
    w.addDocument(doc);
    if (docIndex % 100000 == 0) {
      System.out.println("indexed: " + docIndex);
      System.out.flush();
    }
  }

  w.forceMerge(1);
  w.close();

  System.out.println("verifying...");
  System.out.flush();

  final DirectoryReader r = DirectoryReader.open(dir);
  final ByteArrayDataInput input = new ByteArrayDataInput();
  // Running index across all leaves; the stored value is this index modulo 65535.
  int expectedValue = 0;
  for (LeafReaderContext context : r.leaves()) {
    final LeafReader reader = context.reader();
    final BinaryDocValues dv = reader.getBinaryDocValues("dv");
    final int maxDoc = reader.maxDoc();
    for (int docId = 0; docId < maxDoc; docId++) {
      assertEquals(docId, dv.nextDoc());
      final BytesRef term = dv.binaryValue();
      input.reset(term.bytes, term.offset, term.length);
      assertEquals(expectedValue % 65535, input.readVInt());
      assertTrue(input.eof());
      expectedValue++;
    }
  }

  r.close();
  dir.close();
}