Java Code Examples for org.apache.parquet.io.api.Binary#fromReusedByteArray()

The following examples show how to use org.apache.parquet.io.api.Binary#fromReusedByteArray(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that BinaryStatistics reports the expected min and max when the same
 * Binary, created with fromReusedByteArray, is fed repeatedly while its backing
 * array is mutated between updates (values 10, 20, then 15).
 */
@Test
public void testBinaryMinMaxForReusedBackingByteArray() {
  final BinaryStatistics statistics = new BinaryStatistics();
  final byte[] backing = { 10 };
  final Binary reusedValue = Binary.fromReusedByteArray(backing);

  // First observation uses the initial content of the backing array.
  statistics.updateStats(reusedValue);

  // Later observations overwrite the single backing byte in place.
  for (byte next : new byte[] { 20, 15 }) {
    backing[0] = next;
    statistics.updateStats(reusedValue);
  }

  assertArrayEquals(new byte[] { 20 }, statistics.getMaxBytes());
  assertArrayEquals(new byte[] { 10 }, statistics.getMinBytes());
}
 
Example 2
Source File: BinaryDictionary.java    From presto with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the dictionary by cutting the page's bytes into one Binary per entry.
 * Entries are created with Binary.fromReusedByteArray over the page's byte array.
 *
 * @param dictionaryPage page providing the encoding, the entry count and the raw bytes
 * @param length fixed size of every entry in bytes, or {@code null} when each entry
 *               is preceded by a 4-byte little-endian length
 */
public BinaryDictionary(DictionaryPage dictionaryPage, Integer length)
        throws IOException
{
    super(dictionaryPage.getEncoding());
    content = new Binary[dictionaryPage.getDictionarySize()];

    // Use the slice's own array when it has one; otherwise fall back to getBytes() at offset 0.
    Slice pageSlice = dictionaryPage.getSlice();
    boolean arrayBacked = pageSlice.hasByteArray();
    byte[] pageBytes = arrayBacked ? pageSlice.byteArray() : pageSlice.getBytes();
    int position = arrayBacked ? pageSlice.byteArrayOffset() : 0;

    if (length != null) {
        // Fixed-width entries laid out back to back.
        checkArgument(length > 0, "Invalid byte array length: %s", length);
        for (int entry = 0; entry < content.length; entry++) {
            content[entry] = Binary.fromReusedByteArray(pageBytes, position, length);
            position += length;
        }
    }
    else {
        // Variable-width entries: a 4-byte length prefix followed by the payload.
        for (int entry = 0; entry < content.length; entry++) {
            int entryLength = readIntLittleEndian(pageBytes, position);
            position += 4;
            content[entry] = Binary.fromReusedByteArray(pageBytes, position, entryLength);
            position += entryLength;
        }
    }
}
 
Example 3
Source File: AvroWriteSupportInt96Avro18.java    From datacollector with Apache License 2.0 5 votes vote down vote up
/**
 * Converts an Avro string value to a Binary.
 * A Utf8 is wrapped via fromReusedByteArray over its first getByteLength() bytes;
 * any other value is cast to CharSequence and converted.
 *
 * @param value the Avro string value, either a Utf8 or a CharSequence
 * @return the Binary for the value
 */
private Binary fromAvroString(Object value) {
  if (!(value instanceof Utf8)) {
    return Binary.fromCharSequence((CharSequence) value);
  }
  Utf8 avroString = (Utf8) value;
  return Binary.fromReusedByteArray(avroString.getBytes(), 0, avroString.getByteLength());
}
 
Example 4
Source File: AvroWriteSupportInt96Avro17.java    From datacollector with Apache License 2.0 5 votes vote down vote up
/**
 * Converts an Avro string value to a Binary.
 * A Utf8 is wrapped via fromReusedByteArray over its first getByteLength() bytes;
 * any other value is converted through its toString() form.
 *
 * @param value the Avro string value
 * @return the Binary for the value
 */
private Binary fromAvroString(Object value) {
  if (!(value instanceof Utf8)) {
    return Binary.fromString(value.toString());
  }
  Utf8 avroString = (Utf8) value;
  return Binary.fromReusedByteArray(avroString.getBytes(), 0, avroString.getByteLength());
}
 
Example 5
Source File: BinaryStatistics.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Replaces the current min and max with Binary values created from the given
 * arrays via fromReusedByteArray, then marks these statistics as not empty.
 * The arrays are handed over as-is, so later modifications of either array
 * will show up in the stored min and max.
 *
 * @param minBytes byte array that becomes the min value
 * @param maxBytes byte array that becomes the max value
 */
@Override
public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes) {
  final Binary newMax = Binary.fromReusedByteArray(maxBytes);
  this.max = newMax;
  final Binary newMin = Binary.fromReusedByteArray(minBytes);
  this.min = newMin;
  markAsNotEmpty();
}
 
Example 6
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes COUNT values to the writer through a single Binary instance.
 * Before each write, the bytes of {@code prefix + (i % 10)} are copied over the
 * array returned by the reused Binary's getBytesUnsafe().
 *
 * @param COUNT number of values to write
 * @param cw writer receiving the values
 * @param prefix text placed before the single-digit suffix of every value
 */
private void writeRepeatedWithReuse(int COUNT, ValuesWriter cw, String prefix) {
  byte[] seed = (prefix + "0").getBytes(StandardCharsets.UTF_8);
  Binary reused = Binary.fromReusedByteArray(seed);

  int written = 0;
  while (written < COUNT) {
    Binary fresh = Binary.fromString(prefix + written % 10);
    byte[] source = fresh.getBytesUnsafe();
    byte[] target = reused.getBytesUnsafe();
    System.arraycopy(source, 0, target, 0, reused.length());
    cw.writeBytes(reused);
    written++;
  }
}
 
Example 7
Source File: RandomValues.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Fills a prefix of the shared buffer with random bytes and returns it as a
 * Binary created with fromReusedByteArray.
 *
 * @return a Binary over the first {@code length} bytes of the buffer
 */
@Override
public Binary nextValue() {
  // use a random length, but ensure it is at least a few bytes
  final int minimumLength = 5;
  final int length = minimumLength + randomPositiveInt(buffer.length - minimumLength);

  int filled = 0;
  while (filled < length) {
    buffer[filled] = (byte) randomInt();
    filled++;
  }

  return Binary.fromReusedByteArray(buffer, 0, length);
}
 
Example 8
Source File: RandomValues.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Overwrites every byte of the shared buffer with a random value and returns
 * the whole buffer as a Binary created with fromReusedByteArray.
 *
 * @return a Binary over the entire buffer
 */
@Override
public Binary nextValue() {
  int position = 0;
  while (position < buffer.length) {
    buffer[position] = (byte) randomInt();
    position++;
  }

  return Binary.fromReusedByteArray(buffer);
}
 
Example 9
Source File: AvroWriteSupport.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Converts an Avro string value to a Binary.
 * A Utf8 is wrapped via fromReusedByteArray over its first getByteLength() bytes,
 * a CharSequence is converted directly, and anything else goes through toString().
 *
 * @param value the Avro string value
 * @return the Binary for the value
 */
private Binary fromAvroString(Object value) {
  if (value instanceof Utf8) {
    Utf8 avroString = (Utf8) value;
    return Binary.fromReusedByteArray(avroString.getBytes(), 0, avroString.getByteLength());
  }

  CharSequence chars = (value instanceof CharSequence)
      ? (CharSequence) value
      : value.toString();
  return Binary.fromCharSequence(chars);
}
 
Example 10
Source File: AvroWriteSupportInt96Avro18.java    From datacollector with Apache License 2.0 4 votes vote down vote up
/**
 * Calls an appropriate write method based on the value.
 * Value must not be null and the schema must not be nullable.
 *
 * <p>A LONG whose Parquet primitive type is INT96 is interpreted as epoch
 * milliseconds and written as a 12-byte little-endian value: 8 bytes of
 * nanoseconds within the day followed by 4 bytes of Julian day number. The
 * calendar fields are taken in the zone named by {@code timeZoneId}, or UTC
 * when that id is null or empty.
 *
 * <p>Avro types not listed in the switch are ignored.
 *
 * @param type a Parquet type
 * @param avroSchema a non-nullable Avro schema
 * @param value a non-null value to write
 */
@SuppressWarnings("unchecked")
private void writeValueWithoutConversion(Type type, Schema avroSchema, Object value) {
  switch (avroSchema.getType()) {
    case BOOLEAN:
      recordConsumer.addBoolean((Boolean) value);
      break;
    case INT:
      if (value instanceof Character) {
        recordConsumer.addInteger((Character) value);
      } else {
        recordConsumer.addInteger(((Number) value).intValue());
      }
      break;
    case LONG:
      if (type.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT96)) {
        final long NANOS_PER_HOUR = TimeUnit.HOURS.toNanos(1);
        final long NANOS_PER_MINUTE = TimeUnit.MINUTES.toNanos(1);
        final long NANOS_PER_SECOND = TimeUnit.SECONDS.toNanos(1);
        final long NANOS_PER_MILLISECOND = TimeUnit.MILLISECONDS.toNanos(1);

        long timestamp = ((Number) value).longValue();
        Calendar calendar;
        if (timeZoneId != null && ! timeZoneId.isEmpty()) {
          calendar = Calendar.getInstance(TimeZone.getTimeZone(timeZoneId));
        } else {
          calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
        }
        calendar.setTime(new Date(timestamp));

        // Calculate Julian days and nanoseconds in the day
        // Calendar.MONTH is zero-based while LocalDate expects 1..12, hence the +1.
        LocalDate dt = LocalDate.of(calendar.get(Calendar.YEAR), calendar.get(Calendar.MONTH)+1, calendar.get(Calendar.DAY_OF_MONTH));
        int julianDays = (int) JulianFields.JULIAN_DAY.getFrom(dt);
        // Include the millisecond field so the sub-second part of the timestamp is not lost.
        long nanos = (calendar.get(Calendar.HOUR_OF_DAY) * NANOS_PER_HOUR)
            + (calendar.get(Calendar.MINUTE) * NANOS_PER_MINUTE)
            + (calendar.get(Calendar.SECOND) * NANOS_PER_SECOND)
            + (calendar.get(Calendar.MILLISECOND) * NANOS_PER_MILLISECOND);

        // Write INT96 timestamp
        byte[] timestampBuffer = new byte[12];
        ByteBuffer buf = ByteBuffer.wrap(timestampBuffer);
        buf.order(ByteOrder.LITTLE_ENDIAN).putLong(nanos).putInt(julianDays);

        // This is the properly encoded INT96 timestamp
        Binary timestampBinary = Binary.fromReusedByteArray(timestampBuffer);
        recordConsumer.addBinary(timestampBinary);
      } else {
        recordConsumer.addLong(((Number) value).longValue());
      }
      break;
    case FLOAT:
      recordConsumer.addFloat(((Number) value).floatValue());
      break;
    case DOUBLE:
      recordConsumer.addDouble(((Number) value).doubleValue());
      break;
    case FIXED:
      recordConsumer.addBinary(Binary.fromReusedByteArray(((GenericFixed) value).bytes()));
      break;
    case BYTES:
      if (value instanceof byte[]) {
        recordConsumer.addBinary(Binary.fromReusedByteArray((byte[]) value));
      } else {
        recordConsumer.addBinary(Binary.fromReusedByteBuffer((ByteBuffer) value));
      }
      break;
    case STRING:
      recordConsumer.addBinary(fromAvroString(value));
      break;
    case RECORD:
      writeRecord(type.asGroupType(), avroSchema, value);
      break;
    case ENUM:
      recordConsumer.addBinary(Binary.fromString(value.toString()));
      break;
    case ARRAY:
      listWriter.writeList(type.asGroupType(), avroSchema, value);
      break;
    case MAP:
      writeMap(type.asGroupType(), avroSchema, (Map<CharSequence, ?>) value);
      break;
    case UNION:
      writeUnion(type.asGroupType(), avroSchema, value);
      break;
    default:
      // Any other Avro type is skipped without writing anything.
      break;
  }
}
 
Example 11
Source File: AvroWriteSupportInt96Avro17.java    From datacollector with Apache License 2.0 4 votes vote down vote up
/**
 * Writes a value to the record consumer using the write method that matches
 * the non-null form of the given Avro schema.
 *
 * <p>A LONG whose Parquet primitive type is INT96 is interpreted as epoch
 * milliseconds and written as a 12-byte little-endian value: 8 bytes of
 * nanoseconds within the day followed by 4 bytes of Julian day number. The
 * calendar fields are taken in the zone named by {@code timeZoneId}, or UTC
 * when that id is null or empty.
 *
 * <p>Avro types not matched by any branch are ignored.
 *
 * @param type a Parquet type
 * @param avroSchema the Avro schema of the value; its non-null form is used for dispatch
 * @param value the value to write
 */
@SuppressWarnings("unchecked")
private void writeValue(Type type, Schema avroSchema, Object value) {
  Schema nonNullAvroSchema = AvroSchemaConverter.getNonNull(avroSchema);
  Schema.Type avroType = nonNullAvroSchema.getType();
  if (avroType.equals(Schema.Type.BOOLEAN)) {
    recordConsumer.addBoolean((Boolean) value);
  } else if (avroType.equals(Schema.Type.INT)) {
    if (value instanceof Character) {
      recordConsumer.addInteger((Character) value);
    } else {
      recordConsumer.addInteger(((Number) value).intValue());
    }
  } else if (avroType.equals(Schema.Type.LONG)) {
    if (type.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT96)) {
      final long NANOS_PER_HOUR = TimeUnit.HOURS.toNanos(1);
      final long NANOS_PER_MINUTE = TimeUnit.MINUTES.toNanos(1);
      final long NANOS_PER_SECOND = TimeUnit.SECONDS.toNanos(1);
      final long NANOS_PER_MILLISECOND = TimeUnit.MILLISECONDS.toNanos(1);

      long timestamp = ((Number) value).longValue();
      Calendar calendar;
      if (timeZoneId != null && ! timeZoneId.isEmpty()) {
        calendar = Calendar.getInstance(TimeZone.getTimeZone(timeZoneId));
      } else {
        calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
      }
      calendar.setTime(new Date(timestamp));

      // Calculate Julian days and nanoseconds in the day
      // Calendar.MONTH is zero-based while LocalDate expects 1..12, hence the +1.
      LocalDate dt = LocalDate.of(calendar.get(Calendar.YEAR), calendar.get(Calendar.MONTH)+1, calendar.get(Calendar.DAY_OF_MONTH));
      int julianDays = (int) JulianFields.JULIAN_DAY.getFrom(dt);
      // Include the millisecond field so the sub-second part of the timestamp is not lost.
      long nanos = (calendar.get(Calendar.HOUR_OF_DAY) * NANOS_PER_HOUR)
          + (calendar.get(Calendar.MINUTE) * NANOS_PER_MINUTE)
          + (calendar.get(Calendar.SECOND) * NANOS_PER_SECOND)
          + (calendar.get(Calendar.MILLISECOND) * NANOS_PER_MILLISECOND);

      // Write INT96 timestamp
      byte[] timestampBuffer = new byte[12];
      ByteBuffer buf = ByteBuffer.wrap(timestampBuffer);
      buf.order(ByteOrder.LITTLE_ENDIAN).putLong(nanos).putInt(julianDays);

      // This is the properly encoded INT96 timestamp
      Binary timestampBinary = Binary.fromReusedByteArray(timestampBuffer);
      recordConsumer.addBinary(timestampBinary);
    } else {
      recordConsumer.addLong(((Number) value).longValue());
    }
  } else if (avroType.equals(Schema.Type.FLOAT)) {
    recordConsumer.addFloat(((Number) value).floatValue());
  } else if (avroType.equals(Schema.Type.DOUBLE)) {
    recordConsumer.addDouble(((Number) value).doubleValue());
  } else if (avroType.equals(Schema.Type.BYTES)) {
    if (value instanceof byte[]) {
      recordConsumer.addBinary(Binary.fromReusedByteArray((byte[]) value));
    } else {
      recordConsumer.addBinary(Binary.fromReusedByteBuffer((ByteBuffer) value));
    }
  } else if (avroType.equals(Schema.Type.STRING)) {
    recordConsumer.addBinary(fromAvroString(value));
  } else if (avroType.equals(Schema.Type.RECORD)) {
    writeRecord(type.asGroupType(), nonNullAvroSchema, value);
  } else if (avroType.equals(Schema.Type.ENUM)) {
    recordConsumer.addBinary(Binary.fromString(value.toString()));
  } else if (avroType.equals(Schema.Type.ARRAY)) {
    listWriter.writeList(type.asGroupType(), nonNullAvroSchema, value);
  } else if (avroType.equals(Schema.Type.MAP)) {
    writeMap(type.asGroupType(), nonNullAvroSchema, (Map<CharSequence, ?>) value);
  } else if (avroType.equals(Schema.Type.UNION)) {
    writeUnion(type.asGroupType(), nonNullAvroSchema, value);
  } else if (avroType.equals(Schema.Type.FIXED)) {
    recordConsumer.addBinary(Binary.fromReusedByteArray(((GenericFixed) value).bytes()));
  }
}
 
Example 12
Source File: RandomValues.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Copies up to {@code bufferLength} bytes of {@code data} into the shared
 * buffer and returns a Binary, created with fromReusedByteArray, over the
 * copied prefix of that buffer.
 *
 * <p>The result is backed by the shared buffer, so a later call that writes
 * into the buffer changes the bytes seen through a previously returned Binary.
 *
 * @param data source bytes; only the first {@code min(data.length, bufferLength)} are used
 * @return a Binary over the copied bytes in the shared buffer
 */
public Binary asReusedBinary(byte[] data) {
  int length = Math.min(data.length, bufferLength);
  System.arraycopy(data, 0, buffer, 0, length);
  // Wrap the buffer that was just filled; wrapping `data` would make the copy above pointless.
  return Binary.fromReusedByteArray(buffer, 0, length);
}
 
Example 13
Source File: ParquetStringInspector.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Wraps the content of the given Text in a new BinaryWritable.
 *
 * <p>Only the first {@code text.getLength()} bytes of the array returned by
 * {@code text.getBytes()} are wrapped, because that array may be larger than
 * the valid content.
 *
 * @param o not used by this method
 * @param text the text to wrap, or {@code null}
 * @return a BinaryWritable holding the text's bytes, or holding {@code null} when text is null
 */
@Override
public Object set(final Object o, final Text text) {
  final Binary wrapped = text == null
      ? null
      : Binary.fromReusedByteArray(text.getBytes(), 0, text.getLength());
  return new BinaryWritable(wrapped);
}