Java Code Examples for org.apache.parquet.io.api.Binary#length()

The following examples show how to use org.apache.parquet.io.api.Binary#length(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: VarLenEntryDictionaryReader.java    From Bats with Apache License 2.0 6 votes vote down vote up
/**
 * Reads a single value from the page's dictionary reader into the shared bulk entry.
 *
 * <p>If {@code batchMemoryConstraintsReached} reports that the value cannot be
 * accommodated, the entry is reset to all zeros so the caller sees nothing to consume.
 *
 * @param valsToReadWithinPage not used by this implementation
 * @return the shared {@code entry} instance
 */
private final VarLenColumnBulkEntry getEntrySingle(int valsToReadWithinPage) {
  final DictionaryReaderWrapper dictReader = pageInfo.dictionaryValueReader;
  final int[] lengths = entry.getValuesLength();
  final Binary value = dictReader.getEntry();
  final int valueSize = value.length();

  if (!batchMemoryConstraintsReached(0, 4, valueSize)) {
    // The value can be handled: record its length, then publish it through the bulk entry.
    lengths[0] = valueSize;
    entry.set(0, valueSize, 1, 1, value.getBytes());
  } else {
    // Not enough memory for this large value: no data to be consumed.
    entry.set(0, 0, 0, 0);
  }

  return entry;
}
 
Example 2
Source File: DeltaByteArrayReader.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the next value by combining the first {@code prefixLength} bytes of the
 * previously returned value with the suffix read from the suffix reader.
 *
 * @return the reconstructed value, which is also remembered as {@code previous}
 */
@Override
public Binary readBytes() {
  final int prefixLength = prefixLengthReader.readInteger();
  // Reading the suffix does not copy bytes.
  final Binary suffix = suffixReader.readBytes();
  final int suffixLength = suffix.length();

  // NOTE: because of PARQUET-246 the prefix length read from prefixLengthReader
  // must be honoured even for the *first* value of a page. That value should
  // have an empty prefix, but PARQUET-246 means it may not.

  if (prefixLength == 0) {
    // Nothing to prepend: the suffix is the whole value.
    previous = suffix;
    return previous;
  }

  // Materialize the output: prefix taken from the previous value, then the suffix.
  final byte[] out = new byte[prefixLength + suffixLength];
  System.arraycopy(previous.getBytesUnsafe(), 0, out, 0, prefixLength);
  System.arraycopy(suffix.getBytesUnsafe(), 0, out, prefixLength, suffixLength);
  previous = Binary.fromConstantByteArray(out);
  return previous;
}
 
Example 3
Source File: TestBinaryTruncator.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies the truncation contract of {@code truncator} for {@code value}:
 * the original instance is returned when the requested length is at least the
 * value's length, shorter lengths are delegated to the min/max contract checks,
 * and a max truncation to length 0 returns the original instance.
 */
private void checkContract(BinaryTruncator truncator, Comparator<Binary> comparator, Binary value, boolean strictMin,
    boolean strictMax) {
  final int length = value.length();
  final int justAbove = length + 1;
  final int wellAbove = length * 2 + 1;

  // No truncation required -> the very same instance must come back.
  assertSame(value, truncator.truncateMin(value, length));
  assertSame(value, truncator.truncateMax(value, length));
  final int looseMinLength = random(justAbove, wellAbove);
  assertSame(value, truncator.truncateMin(value, looseMinLength));
  final int looseMaxLength = random(justAbove, wellAbove);
  assertSame(value, truncator.truncateMax(value, looseMaxLength));

  if (length > 1) {
    final int oneShorter = length - 1;
    checkMinContract(truncator, comparator, value, oneShorter, strictMin);
    checkMaxContract(truncator, comparator, value, oneShorter, strictMax);
    final int randomMinLength = random(1, oneShorter);
    checkMinContract(truncator, comparator, value, randomMinLength, strictMin);
    final int randomMaxLength = random(1, oneShorter);
    checkMaxContract(truncator, comparator, value, randomMaxLength, strictMax);
  }

  // Edge case: a min value may be truncated to length 0 if the original value is not empty.
  checkMinContract(truncator, comparator, value, 0, strictMin);
  // Edge case: a max value cannot be truncated to length 0, so the original value is returned.
  assertSame(value, truncator.truncateMax(value, 0));
}
 
Example 4
Source File: BinaryColumnReader.java    From presto with Apache License 2.0 6 votes vote down vote up
@Override
/**
 * Reads the current value into {@code blockBuilder}. When the definition level
 * equals the column's maximum, the binary is read, truncated according to
 * {@code type} and written as a slice; otherwise a null is appended if
 * {@code isValueNull()} holds.
 */
@Override
protected void readValue(BlockBuilder blockBuilder, Type type)
{
    if (definitionLevel != columnDescriptor.getMaxDefinitionLevel()) {
        // No value to read at this level; only emit a null when the reader says so.
        if (isValueNull()) {
            blockBuilder.appendNull();
        }
        return;
    }

    Binary binary = valuesReader.readBytes();
    // An empty binary maps to the shared empty slice instead of wrapping a fresh array.
    Slice value = (binary.length() == 0) ? EMPTY_SLICE : wrappedBuffer(binary.getBytes());

    if (isVarcharType(type)) {
        value = truncateToLength(value, type);
    }
    if (isCharType(type)) {
        value = truncateToLengthAndTrimSpaces(value, type);
    }
    type.writeSlice(blockBuilder, value);
}
 
Example 5
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes 100 short strings through a dictionary writer with a 50-byte dictionary
 * limit, checks the reported encoding as the accumulated size crosses the
 * threshold, then reads the values back with a plain reader.
 */
@Test
public void testBinaryDictionaryFallBack() throws IOException {
  final int slabSize = 100;
  final int maxDictionaryByteSize = 50;
  final int fallBackThreshold = maxDictionaryByteSize;
  final ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(maxDictionaryByteSize, slabSize);

  int dataSize = 0;
  for (long n = 0; n < 100; n++) {
    final Binary written = Binary.fromString("str" + n);
    cw.writeBytes(written);
    // Each value is accounted as its bytes plus 4.
    dataSize += (written.length() + 4);
    // Below the threshold the dictionary encoding is expected; from the threshold on, plain.
    assertEquals(dataSize < fallBackThreshold ? PLAIN_DICTIONARY : PLAIN, cw.getEncoding());
  }

  // The writer fell back to plain encoding, so a plain reader is used to read the data back.
  final ValuesReader reader = new BinaryPlainValuesReader();
  reader.initFromPage(100, cw.getBytes().toInputStream());

  for (long n = 0; n < 100; n++) {
    final Binary expected = Binary.fromString("str" + n);
    assertEquals(expected, reader.readBytes());
  }

  // Simulate cutting the page.
  cw.reset();
  assertEquals(0, cw.getBufferedSize());
}
 
Example 6
Source File: PrimitiveStringifier.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Renders a 12-byte value as three little-endian 32-bit fields (months, days,
 * millis), each formatted by {@code UNSIGNED_STRINGIFIER}.
 *
 * @return the interval text, or {@code BINARY_INVALID} when the value is not 12 bytes long
 */
@Override
String stringifyNotNull(Binary value) {
  if (value.length() != 12) {
    return BINARY_INVALID;
  }

  final ByteBuffer littleEndian = value.toByteBuffer().order(ByteOrder.LITTLE_ENDIAN);
  // Absolute reads relative to the buffer's current position; the position is not advanced.
  final int base = littleEndian.position();
  final String months = UNSIGNED_STRINGIFIER.stringify(littleEndian.getInt(base));
  final String days = UNSIGNED_STRINGIFIER.stringify(littleEndian.getInt(base + 4));
  final String millis = UNSIGNED_STRINGIFIER.stringify(littleEndian.getInt(base + 8));

  return new StringBuilder("interval(")
      .append(months).append(" months, ")
      .append(days).append(" days, ")
      .append(millis).append(" millis)")
      .toString();
}
 
Example 7
Source File: FixedLenByteArrayPlainValuesWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a fixed-length binary value to the output.
 *
 * @param v value whose length must equal this writer's configured {@code length}
 * @throws IllegalArgumentException if the value's length differs from {@code length}
 * @throws ParquetEncodingException if writing to the underlying output fails
 */
@Override
public final void writeBytes(Binary v) {
  final int actualLength = v.length();
  if (actualLength != length) {
    final String message =
        "Fixed Binary size " + actualLength + " does not match field type length " + length;
    throw new IllegalArgumentException(message);
  }

  try {
    v.writeTo(out);
  } catch (IOException e) {
    throw new ParquetEncodingException("could not write fixed bytes", e);
  }
}
 
Example 8
Source File: FallbackValuesWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Forwards the value to the current writer, keeps the raw-data byte count up to
 * date, and then runs the fallback check.
 */
@Override
public void writeBytes(Binary v) {
  // Raw data stores a 4-byte int length followed by the binary content itself.
  final int rawSize = v.length() + 4;
  rawDataByteSize += rawSize;

  currentWriter.writeBytes(v);
  checkFallback();
}
 
Example 9
Source File: ParquetGroupConverter.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Writes one binary-encoded decimal value through the decimal writer and marks
 * the converter as written.
 *
 * @param value the binary holding the decimal's bytes
 */
@Override
public void addBinary(Binary value) {
  final byte[] bytes = value.getBytes();
  /* The bytes are handed to the writer's big-endian entry point together with a
   * decimal type built from the holder's precision and scale.
   * NOTE(review): the previous comment described the bytes as "LE format" with a
   * swap happening while writing into the vector; presumably the writer performs
   * that swap -- confirm against the writer implementation.
   */
  writer.writeBigEndianBytesToDecimal(bytes, new ArrowType.Decimal(holder.precision, holder.scale));
  setWritten();
}
 
Example 10
Source File: ParquetGroupConverter.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Copies the binary value into the (possibly reallocated) buffer and writes it
 * as a VarChar.
 *
 * @param value the value to write; rejected when longer than {@code varValueSizeLimit}
 */
@Override
public void addBinary(Binary value) {
  final int size = value.length();
  if (size > this.varValueSizeLimit) {
    throw createFieldSizeLimitException(size, this.varValueSizeLimit);
  }

  // Keep both the converter and the holder pointing at whatever buffer realloc returns.
  buf = buf.reallocIfNeeded(size);
  holder.buffer = buf;
  buf.setBytes(0, value.toByteBuffer());

  holder.start = 0;
  holder.end = size;
  writer.writeVarChar(holder.start, holder.end, holder.buffer);
  setWritten();
}
 
Example 11
Source File: ParquetGroupConverter.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Copies the binary value into the (possibly reallocated) buffer and writes it
 * as a VarBinary.
 *
 * @param value the value to write; rejected when longer than {@code varValueSizeLimit}
 */
@Override
public void addBinary(Binary value) {
  final int size = value.length();
  if (size > this.varValueSizeLimit) {
    throw createFieldSizeLimitException(size, this.varValueSizeLimit);
  }

  // Keep both the converter and the holder pointing at whatever buffer realloc returns.
  buf = buf.reallocIfNeeded(size);
  holder.buffer = buf;
  buf.setBytes(0, value.toByteBuffer());

  holder.start = 0;
  holder.end = size;
  writer.writeVarBinary(holder.start, holder.end, holder.buffer);
  setWritten();
}
 
Example 12
Source File: ParquetTimestampUtils.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a 12-byte binary Parquet timestamp into milliseconds.
 *
 * <p>Bytes 0-7 are decoded as a little-endian time-of-day in nanoseconds and
 * bytes 8-11 as a little-endian Julian day.
 *
 * @param timestampBinary INT96 parquet timestamp
 * @return timestamp in millis, GMT timezone
 * @throws IllegalArgumentException if the binary is not exactly 12 bytes long
 */
public static long getTimestampMillis(Binary timestampBinary) {
	final int actualLength = timestampBinary.length();
	if (actualLength != 12) {
		throw new IllegalArgumentException("Parquet timestamp must be 12 bytes, actual " + actualLength);
	}
	final byte[] bytes = timestampBinary.getBytes();

	// Little-endian encoding: fold from the most significant byte (highest index) down.
	long timeOfDayNanos = 0L;
	for (int i = 7; i >= 0; i--) {
		timeOfDayNanos = (timeOfDayNanos << 8) | (bytes[i] & 0xFFL);
	}

	int julianDay = 0;
	for (int i = 11; i >= 8; i--) {
		julianDay = (julianDay << 8) | (bytes[i] & 0xFF);
	}

	return julianDayToMillis(julianDay) + (timeOfDayNanos / NANOS_PER_MILLISECOND);
}
 
Example 13
Source File: ParquetTimestampUtils.java    From presto with Apache License 2.0 5 votes vote down vote up
/**
 * Converts a 12-byte binary Parquet timestamp into milliseconds.
 *
 * <p>Bytes 0-7 hold the time of day in nanoseconds and bytes 8-11 the Julian
 * day, both little-endian.
 *
 * @param timestampBinary INT96 parquet timestamp
 * @return timestamp in millis, GMT timezone
 */
public static long getTimestampMillis(Binary timestampBinary)
{
    final int actualLength = timestampBinary.length();
    if (actualLength != 12) {
        throw new PrestoException(NOT_SUPPORTED, "Parquet timestamp must be 12 bytes, actual " + actualLength);
    }

    final byte[] raw = timestampBinary.getBytes();

    // The helpers take the most significant byte first, so the little-endian input is passed in reverse.
    final int julianDayNumber = Ints.fromBytes(raw[11], raw[10], raw[9], raw[8]);
    final long nanosOfDay = Longs.fromBytes(raw[7], raw[6], raw[5], raw[4], raw[3], raw[2], raw[1], raw[0]);

    return julianDayToMillis(julianDayNumber) + (nanosOfDay / NANOS_PER_MILLISECOND);
}
 
Example 14
Source File: DrillParquetGroupConverter.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Copies the binary value into the holder's buffer and passes the holder to the writer.
 *
 * @param value the value whose bytes are written
 */
@Override
public void addBinary(Binary value) {
  final int size = value.length();

  // NOTE(review): the buffer returned by reallocIfNeeded is stored only in
  // holder.buffer, not back into buf. If reallocIfNeeded can return a different
  // buffer, buf keeps referring to the old one -- confirm this is intended.
  holder.buffer = buf.reallocIfNeeded(size);
  holder.buffer.setBytes(0, value.toByteBuffer());

  holder.start = 0;
  holder.end = size;
  writer.write(holder);
}
 
Example 15
Source File: DrillParquetGroupConverter.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Copies the binary value into the (possibly reallocated) buffer and passes the
 * holder describing it to the writer.
 *
 * @param value the value whose bytes are written
 */
@Override
public void addBinary(Binary value) {
  final int size = value.length();

  // Keep both the converter and the holder pointing at whatever buffer realloc returns.
  buf = buf.reallocIfNeeded(size);
  holder.buffer = buf;
  buf.setBytes(0, value.toByteBuffer());

  holder.start = 0;
  holder.end = size;
  writer.write(holder);
}
 
Example 16
Source File: DrillParquetGroupConverter.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Copies the binary value into the (possibly reallocated) buffer and passes the
 * holder describing it to the writer.
 *
 * @param value the value whose bytes are written
 */
@Override
public void addBinary(Binary value) {
  final int size = value.length();

  // Keep both the converter and the holder pointing at whatever buffer realloc returns.
  buf = buf.reallocIfNeeded(size);
  holder.buffer = buf;
  buf.setBytes(0, value.toByteBuffer());

  holder.start = 0;
  holder.end = size;
  writer.write(holder);
}
 
Example 17
Source File: VarLenNullableDictionaryReader.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Processes a single, possibly null, value and publishes it through the shared bulk entry.
 *
 * <p>A current definition level of 1 means a value is read from the dictionary
 * reader; any other level is recorded as a null with length -1. If
 * {@code batchMemoryConstraintsReached} rejects the value, the entry is reset to
 * all zeros and returned without advancing the definition levels.
 *
 * @param valsToReadWithinPage not used by this implementation
 * @return the shared {@code entry} instance
 */
private final VarLenColumnBulkEntry getEntrySingle(int valsToReadWithinPage) {
  final int[] lengths = entry.getValuesLength();
  final DictionaryReaderWrapper dictReader = pageInfo.dictionaryValueReader;

  // Initialize the definition-level reader if needed.
  pageInfo.definitionLevels.readFirstIntegerIfNeeded();

  final boolean isNull = pageInfo.definitionLevels.readCurrInteger() != 1;
  if (isNull) {
    lengths[0] = -1;
    entry.set(0, 0, 1, 0);
  } else {
    final Binary value = dictReader.getEntry();
    final int valueSize = value.length();

    // Is there enough memory to handle this large value?
    if (batchMemoryConstraintsReached(1, 4, valueSize)) {
      entry.set(0, 0, 0, 0); // no data to be consumed
      return entry;
    }

    lengths[0] = valueSize;
    entry.set(0, valueSize, 1, 1, value.getBytes());
  }

  // The current entry has been processed, so move on to the next definition level.
  pageInfo.definitionLevels.nextIntegerIfNotEOF();

  return entry;
}
 
Example 18
Source File: TestColumnIndexBuilder.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Keeps only non-null, non-empty values whose first byte is {@code 'B'}.
 */
@Override
public boolean keep(Binary value) {
  if (value == null || value.length() <= 0) {
    return false;
  }
  final byte firstByte = value.getBytesUnsafe()[0];
  return firstByte == 'B';
}
 
Example 19
Source File: VarLenNullableDictionaryReader.java    From Bats with Apache License 2.0 4 votes vote down vote up
/**
 * Reads a batch of possibly-null values from the dictionary reader into the
 * bulk entry's internal data array.
 *
 * <p>At most {@code min(entry.getMaxEntries(), valuesToRead)} values are
 * processed. A value whose definition level is 1 is read and copied into the
 * target array; any other level is recorded as a null with length -1. The loop
 * stops early when the next value does not fit in the remaining space, in which
 * case that value is pushed back to the reader. If no value at all was
 * processed, the call falls back to {@code getEntrySingle}.
 *
 * @param valuesToRead upper bound on the number of values to read
 * @return the shared {@code entry} instance
 */
private final VarLenColumnBulkEntry getEntryBulk(int valuesToRead) {
  final DictionaryReaderWrapper valueReader = pageInfo.dictionaryValueReader;
  final int[] valueLengths = entry.getValuesLength();
  final int readBatch = Math.min(entry.getMaxEntries(), valuesToRead);
  Preconditions.checkState(readBatch > 0, "Read batch count [%s] should be greater than zero", readBatch);

  final byte[] tgtBuff = entry.getInternalDataArray();
  final int tgtLen = tgtBuff.length;

  // Counters: values processed (nulls included), nulls seen, and the next write offset in tgtBuff
  int numValues = 0;
  int numNulls = 0;
  int tgtPos = 0;

  // Initialize the reader if needed
  pageInfo.definitionLevels.readFirstIntegerIfNeeded();

  for (int idx = 0; idx < readBatch; ++idx ) {
    if (pageInfo.definitionLevels.readCurrInteger() == 1) {
      final Binary currEntry = valueReader.getEntry();
      final int dataLen = currEntry.length();

      // Stop as soon as the value would run past the end of the target array
      if (tgtLen < (tgtPos + dataLen)) {
        valueReader.pushBack(currEntry); // push back this value since we're exiting from the loop
        break;
      }

      valueLengths[numValues++] = dataLen;

      // Empty values only record their length; there are no bytes to copy
      if (dataLen > 0) {
        vlCopyNoPadding(currEntry.getBytes(), 0, tgtBuff, tgtPos, dataLen);

        // Update the counters
        tgtPos += dataLen;
      }

    } else {
      // Null value: recorded with length -1 and counted separately
      valueLengths[numValues++] = -1;
      ++numNulls;
    }

    // read the next definition-level value since we know the current entry has been processed
    pageInfo.definitionLevels.nextIntegerIfNotEOF();
  }

  // We're here either because a) the Parquet metadata is wrong (advertises more values than the real count)
  // or b) the first value being processed turned out to be too long for the buffer.
  if (numValues == 0) {
    return getEntrySingle(valuesToRead);
  }

  // Publish the batch: bytes written, values processed, and how many of them are non-null
  entry.set(0, tgtPos, numValues, numValues - numNulls);

  return entry;
}
 
Example 20
Source File: VarLenEntryDictionaryReader.java    From Bats with Apache License 2.0 4 votes vote down vote up
/**
 * Reads a batch of values from the dictionary reader into the bulk entry's
 * internal data array.
 *
 * <p>At most {@code min(entry.getMaxEntries(), valuesToRead)} values are read.
 * The loop stops early when the next value does not fit in the remaining space,
 * in which case that value is pushed back to the reader. If no value at all was
 * read, the call falls back to {@code getEntrySingle}.
 *
 * @param valuesToRead upper bound on the number of values to read
 * @return the shared {@code entry} instance
 */
private final VarLenColumnBulkEntry getEntryBulk(int valuesToRead) {
  final DictionaryReaderWrapper valueReader = pageInfo.dictionaryValueReader;
  final int[] valueLengths = entry.getValuesLength();
  final int readBatch = Math.min(entry.getMaxEntries(), valuesToRead);
  Preconditions.checkState(readBatch > 0, "Read batch count [%s] should be greater than zero", readBatch);

  final byte[] tgtBuff = entry.getInternalDataArray();
  final int tgtLen = tgtBuff.length;

  // Counters: values read so far and the next write offset in tgtBuff
  int numValues = 0;
  int tgtPos = 0;

  for (int idx = 0; idx < readBatch; ++idx ) {
    final Binary currEntry = valueReader.getEntry();
    final int dataLen = currEntry.length();

    // Stop as soon as the value would run past the end of the target array
    if (tgtLen < (tgtPos + dataLen)) {
      valueReader.pushBack(currEntry); // push back this value since we're exiting from the loop
      break;
    }

    valueLengths[numValues++] = dataLen;

    // Empty values only record their length; there are no bytes to copy
    if (dataLen > 0) {
      vlCopyNoPadding(currEntry.getBytes(), 0, tgtBuff, tgtPos, dataLen);

      // Update the counters
      tgtPos += dataLen;
    }
  }

  // We're here either because a) the Parquet metadata is wrong (advertises more values than the real count)
  // or b) the first value being processed turned out to be too long for the buffer.
  if (numValues == 0) {
    return getEntrySingle(valuesToRead);
  }

  // Now set the bulk entry
  entry.set(0, tgtPos, numValues, numValues);

  return entry;
}