Java Code Examples for org.apache.parquet.io.api.Binary

The following examples show how to use org.apache.parquet.io.api.Binary. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: Bats   Source File: VarLenEntryDictionaryReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Loads the current dictionary value into the {@code entry} field as a one-value batch.
 *
 * <p>When {@code batchMemoryConstraintsReached} reports that the value does not fit, the entry is
 * reset to all zeros instead, signalling that no data was consumed.
 *
 * @param valsToReadWithinPage not referenced by this method
 * @return the {@code entry} field, either describing one value or reset to zeros
 */
private final VarLenColumnBulkEntry getEntrySingle(int valsToReadWithinPage) {
  final DictionaryReaderWrapper dictReader = pageInfo.dictionaryValueReader;
  final int[] lengths = entry.getValuesLength();
  final Binary value = dictReader.getEntry();
  final int valueLen = value.length();

  if (!batchMemoryConstraintsReached(0, 4, valueLen)) {
    // Record the length first, then publish the single value through the bulk entry.
    lengths[0] = valueLen;
    entry.set(0, valueLen, 1, 1, value.getBytes());
  } else {
    // Not enough memory for this value: report that nothing was consumed.
    entry.set(0, 0, 0, 0);
  }

  return entry;
}
 
Example 2
Source Project: Bats   Source File: ParquetTableMetadataUtils.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Converts a value of one of the supported runtime types to a {@code Long}.
 *
 * <ul>
 *   <li>{@code Integer} and {@code Long} are widened or returned as-is.</li>
 *   <li>{@code Float} and {@code Double} are narrowed with {@code longValue()}.</li>
 *   <li>{@code String} is parsed with {@link Long#parseLong(String)}.</li>
 *   <li>{@code byte[]} and {@code Binary} are read through {@link BigInteger#BigInteger(byte[])}
 *       and narrowed with {@code longValue()}.</li>
 * </ul>
 *
 * @param value the value to convert
 * @return the value as a {@code Long}
 * @throws UnsupportedOperationException if the value is {@code null} or of any other type
 * @throws NumberFormatException if a {@code String} value is not a parsable long
 */
private static Long getLong(Object value) {
  if (value instanceof Integer) {
    return Long.valueOf((Integer) value);
  } else if (value instanceof Long) {
    return (Long) value;
  } else if (value instanceof Float) {
    return ((Float) value).longValue();
  } else if (value instanceof Double) {
    return ((Double) value).longValue();
  } else if (value instanceof String) {
    return Long.parseLong(value.toString());
  } else if (value instanceof byte[]) {
    return new BigInteger((byte[]) value).longValue();
  } else if (value instanceof Binary) {
    return new BigInteger(((Binary) value).getBytes()).longValue();
  }
  // The message names the type this method actually produces (it previously said "Integer").
  throw new UnsupportedOperationException(String.format("Cannot obtain Long using value %s", value));
}
 
Example 3
Source Project: parquet-mr   Source File: ProtoWriteSupportTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Writes a {@code MessageA} whose inner message has only field "one" set and verifies the exact,
 * ordered sequence of calls made on the mocked {@code RecordConsumer}.
 */
@Test
public void testOptionalInnerMessage() throws Exception {
  RecordConsumer consumer = Mockito.mock(RecordConsumer.class);
  ProtoWriteSupport writeSupport = createReadConsumerInstance(TestProtobuf.MessageA.class, consumer);

  TestProtobuf.MessageA.Builder builder = TestProtobuf.MessageA.newBuilder();
  builder.getInnerBuilder().setOne("one");

  writeSupport.write(builder.build());

  InOrder ordered = Mockito.inOrder(consumer);

  // Message and outer field open.
  ordered.verify(consumer).startMessage();
  ordered.verify(consumer).startField("inner", 0);

  // The inner group carries only the "one" field.
  ordered.verify(consumer).startGroup();
  ordered.verify(consumer).startField("one", 0);
  ordered.verify(consumer).addBinary(Binary.fromConstantByteArray("one".getBytes()));
  ordered.verify(consumer).endField("one", 0);
  ordered.verify(consumer).endGroup();

  // Outer field and message close; nothing else may have been called.
  ordered.verify(consumer).endField("inner", 0);
  ordered.verify(consumer).endMessage();
  Mockito.verifyNoMoreInteractions(consumer);
}
 
Example 4
Source Project: parquet-mr   Source File: TestParquetMetadataConverter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Converts binary statistics with the given truncation length and checks the resulting bounds.
 *
 * <p>When the combined min/max length reaches {@code ParquetMetadataConverter.MAX_STATS_SIZE} both
 * converted bounds must be null; otherwise the converted min must not sort after the original min
 * and the converted max must not sort before the original max.
 *
 * @param truncateLen truncation length handed to the converter
 * @param minLen length passed to {@code generateRandomString} for the min value
 * @param maxLen length passed to {@code generateRandomString} for the max value
 */
private void testBinaryStatsWithTruncation(int truncateLen, int minLen, int maxLen) {
  BinaryStatistics binaryStats = new BinaryStatistics();
  byte[] lower = generateRandomString("a", minLen).getBytes();
  byte[] upper = generateRandomString("b", maxLen).getBytes();
  binaryStats.updateStats(Binary.fromConstantByteArray(lower));
  binaryStats.updateStats(Binary.fromConstantByteArray(upper));

  ParquetMetadataConverter converter = new ParquetMetadataConverter(truncateLen);
  org.apache.parquet.format.Statistics converted = converter.toParquetStatistics(binaryStats);

  if (minLen + maxLen < ParquetMetadataConverter.MAX_STATS_SIZE) {
    Charset utf8 = Charset.forName("UTF-8");

    // The converted lower bound may be shorter but must still bound the original from below.
    String originalMin = new String(lower, utf8);
    String convertedMin = new String(converted.getMin_value(), utf8);
    assertTrue(convertedMin.compareTo(originalMin) <= 0);

    // Likewise the converted upper bound must bound the original from above.
    String originalMax = new String(upper, utf8);
    String convertedMax = new String(converted.getMax_value(), utf8);
    assertTrue(convertedMax.compareTo(originalMax) >= 0);
    return;
  }

  // Statistics at or above the size limit are expected to be dropped entirely.
  assertNull(converted.getMin_value());
  assertNull(converted.getMax_value());
}
 
Example 5
Source Project: iceberg   Source File: ParquetValueWriters.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Writes a decimal as a {@code length}-byte binary built from its unscaled value.
 *
 * <p>The unscaled bytes are right-aligned in the reused buffer and every leading byte is set to
 * 0xFF for negative values or 0x00 otherwise.
 *
 * @param repetitionLevel repetition level forwarded to the column writer
 * @param decimal value to write; its scale must equal {@code scale} and its precision must not
 *     exceed {@code precision}
 * @throws IllegalArgumentException if the scale or precision check fails
 */
@Override
public void write(int repetitionLevel, BigDecimal decimal) {
  Preconditions.checkArgument(decimal.scale() == scale,
      "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal);
  Preconditions.checkArgument(decimal.precision() <= precision,
      "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal);

  final byte[] unscaledBytes = decimal.unscaledValue().toByteArray();
  final byte signFill = (byte) (decimal.signum() < 0 ? 0xFF : 0x00);
  final byte[] target = bytes.get();
  // Number of leading positions that hold the sign fill rather than value bytes.
  final int pad = length - unscaledBytes.length;

  for (int pos = 0; pos < length; pos++) {
    target[pos] = pos < pad ? signFill : unscaledBytes[pos - pad];
  }

  column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(target));
}
 
Example 6
Source Project: parquet-mr   Source File: FixedBinaryTestUtils.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Encodes a {@code BigInteger} as a fixed-width {@code Binary} of exactly {@code length} bytes.
 *
 * <p>Shorter encodings are left-padded with 0xFF for negative values and 0x00 otherwise.
 *
 * @param length required width in bytes
 * @param bigInt value to encode
 * @return a {@code Binary} wrapping {@code length} bytes
 * @throws IllegalArgumentException if the value's byte encoding is longer than {@code length}
 */
public static Binary getFixedBinary(int length, BigInteger bigInt) {
  final byte[] encoded = bigInt.toByteArray();

  if (encoded.length > length) {
    throw new IllegalArgumentException(
        "Specified BigInteger (" + bigInt + ") is too long for fixed bytes (" + encoded.length + '>' + length + ')');
  }
  if (encoded.length == length) {
    // Already the right width; wrap without copying.
    return Binary.fromConstantByteArray(encoded);
  }

  final int padLen = length - encoded.length;
  final byte padByte = bigInt.signum() < 0 ? (byte) 0xFF : (byte) 0x00;
  final byte[] widened = new byte[length];
  Arrays.fill(widened, 0, padLen, padByte);
  System.arraycopy(encoded, 0, widened, padLen, encoded.length);
  return Binary.fromConstantByteArray(widened);
}
 
Example 7
Source Project: iceberg   Source File: ParquetFilters.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Maps a literal's value to the object handed to Parquet as a comparison operand.
 *
 * <p>Numbers pass through unchanged, character sequences become a {@code Binary} built from their
 * string form, and byte buffers are wrapped via {@code Binary.fromReusedByteBuffer}.
 *
 * @param lit the literal to convert, may be {@code null}
 * @return the converted value, or {@code null} when {@code lit} is {@code null}
 * @throws UnsupportedOperationException if the literal's value is of any other type
 */
@SuppressWarnings("unchecked")
private static <C extends Comparable<C>> C getParquetPrimitive(Literal<?> lit) {
  if (lit == null) {
    return null;
  }

  // TODO: this needs to convert to handle BigDecimal and UUID
  final Object raw = lit.value();

  if (raw instanceof Number) {
    return (C) lit.value();
  }
  if (raw instanceof CharSequence) {
    return (C) Binary.fromString(raw.toString());
  }
  if (raw instanceof ByteBuffer) {
    return (C) Binary.fromReusedByteBuffer((ByteBuffer) raw);
  }

  throw new UnsupportedOperationException(
      "Type not supported yet: " + raw.getClass().getName());
}
 
Example 8
Source Project: pentaho-hadoop-shims   Source File: ParquetConverter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Decodes a 12-byte INT96 value into a millisecond count.
 *
 * <p>Bytes 0-7 are read as a little-endian long (time of day in nanoseconds) and bytes 8-11 as a
 * little-endian int (Julian day). The result is the day offset from
 * {@code ParquetSpec.JULIAN_DAY_OF_EPOCH} in milliseconds plus the time of day in milliseconds.
 *
 * @param value the INT96 value
 * @return the decoded milliseconds
 * @throws RuntimeException if the value is not exactly 12 bytes long
 */
private static long dateFromInt96( Binary value ) {
  final byte[] raw = value.getBytes();
  if ( raw.length != 12 ) {
    throw new RuntimeException( "Invalid byte array length for INT96" );
  }

  // Fold bytes 7..0 from most to least significant: a little-endian 64-bit read.
  long nanosOfDay = 0L;
  for ( int i = 7; i >= 0; i-- ) {
    nanosOfDay = ( nanosOfDay << 8 ) | ( raw[ i ] & 0xFFL );
  }

  // Fold bytes 11..8 the same way: a little-endian 32-bit read.
  int julianDay = 0;
  for ( int i = 11; i >= 8; i-- ) {
    julianDay = ( julianDay << 8 ) | ( raw[ i ] & 0xFF );
  }

  final long millisOfDay = nanosOfDay / 1000000;
  return ( julianDay - ParquetSpec.JULIAN_DAY_OF_EPOCH ) * 24L * 60L * 60L * 1000L + millisOfDay;
}
 
Example 9
Source Project: parquet-mr   Source File: DictionaryValuesWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Records a value by its dictionary id, adding it to the dictionary first if it is not yet known.
 *
 * <p>A new entry is stored as a copy of {@code value} and grows {@code dictionaryByteSize} by
 * {@code length}.
 *
 * @param value the value to encode
 */
@Override
public void writeBytes(Binary value) {
  final int known = binaryDictionaryContent.getInt(value);
  final int dictionaryId;
  if (known != -1) {
    dictionaryId = known;
  } else {
    // -1 means "not present": the next free id is the current dictionary size.
    dictionaryId = binaryDictionaryContent.size();
    binaryDictionaryContent.put(value.copy(), dictionaryId);
    dictionaryByteSize += length;
  }
  encodedValues.add(dictionaryId);
}
 
Example 10
Source Project: parquet-mr   Source File: TestDeltaLengthByteArray.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes the {@code values} fixture through a delta-length byte-array writer, reads it back, and
 * checks that every decoded entry equals the corresponding input string.
 */
@Test
public void testSerialization () throws IOException {
  DeltaLengthByteArrayValuesWriter encoder = getDeltaLengthByteArrayValuesWriter();
  DeltaLengthByteArrayValuesReader decoder = new DeltaLengthByteArrayValuesReader();

  Utils.writeData(encoder, values);
  Binary[] decoded = Utils.readData(decoder, encoder.getBytes().toInputStream(), values.length);

  for (int idx = 0; idx < decoded.length; ++idx) {
    Binary expected = Binary.fromString(values[idx]);
    Assert.assertEquals(expected, decoded[idx]);
  }
}
 
Example 11
Source Project: parquet-mr   Source File: AvroIndexedRecordConverter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Decodes the binary as a UTF-8 string and adds it to the parent, converting it to the matching
 * constant of {@code enumClass} first when an enum class is configured.
 *
 * @param value the binary value to decode
 */
@Override
public final void addBinary(Binary value) {
  final String symbol = value.toStringUsingUTF8();
  Object converted = symbol;
  if (enumClass != null) {
    converted = Enum.valueOf(enumClass, symbol);
  }
  parent.add(converted);
}
 
Example 12
Source Project: parquet-mr   Source File: ParquetFileTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes ten rows to the file returned by {@code parquetFile()}, populating one field of each
 * type; the fixed-length field receives 12 random bytes per row.
 *
 * @throws IOException if writing fails
 */
private void createTestParquetFile() throws IOException {
  final Path target = new Path(parquetFile().getPath());
  final Configuration configuration = new Configuration();

  final MessageType messageType = createSchema();
  final SimpleGroupFactory groups = new SimpleGroupFactory(messageType);
  GroupWriteSupport.setSchema(messageType, configuration);

  try (
    ParquetWriter<Group> out = new ParquetWriter<>(
      target,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      configuration)) {
    for (int row = 0; row < 10; row++) {
      // Fresh random payload for the fixed-length field of this row.
      final byte[] fixed = new byte[12];
      ThreadLocalRandom.current().nextBytes(fixed);

      final Group record = groups.newGroup()
        .append(INT32_FIELD, 32 + row)
        .append(INT64_FIELD, 64L + row)
        .append(FLOAT_FIELD, 1.0f + row)
        .append(DOUBLE_FIELD, 2.0d + row)
        .append(BINARY_FIELD, Binary.fromString(COLORS[row % COLORS.length]))
        .append(FIXED_LEN_BYTE_ARRAY_FIELD, Binary.fromConstantByteArray(fixed));
      out.write(record);
    }
  }
}
 
Example 13
Source Project: iceberg   Source File: ParquetConversions.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Wraps a Parquet primitive value in a {@code Literal} converted to the given type.
 *
 * <p>Boxed booleans and numbers are wrapped directly. A {@code Binary} is interpreted according to
 * {@code type.typeId()}: STRING decodes UTF-8, UUID reads two big-endian longs, FIXED and BINARY
 * keep the bytes as a buffer, and DECIMAL treats the bytes as an unscaled value with the scale
 * taken from the decimal type.
 *
 * @param type target type of the literal
 * @param value the Parquet primitive value
 * @return the literal converted to {@code type}
 * @throws IllegalArgumentException if the value class, or the type id for a binary value, is not
 *     handled
 */
static <T> Literal<T> fromParquetPrimitive(Type type, Object value) {
  if (value instanceof Boolean) {
    return Literal.of((Boolean) value).to(type);
  }
  if (value instanceof Integer) {
    return Literal.of((Integer) value).to(type);
  }
  if (value instanceof Long) {
    return Literal.of((Long) value).to(type);
  }
  if (value instanceof Float) {
    return Literal.of((Float) value).to(type);
  }
  if (value instanceof Double) {
    return Literal.of((Double) value).to(type);
  }
  if (!(value instanceof Binary)) {
    throw new IllegalArgumentException("Unsupported primitive value: " + value);
  }

  final Binary binary = (Binary) value;
  switch (type.typeId()) {
    case STRING:
      return Literal.of(Charsets.UTF_8.decode(binary.toByteBuffer())).to(type);
    case UUID: {
      ByteBuffer uuidBytes = binary.toByteBuffer().order(ByteOrder.BIG_ENDIAN);
      long high = uuidBytes.getLong();
      long low = uuidBytes.getLong();
      return Literal.of(new UUID(high, low)).to(type);
    }
    case FIXED:
    case BINARY:
      return Literal.of(binary.toByteBuffer()).to(type);
    case DECIMAL: {
      Types.DecimalType decimalType = (Types.DecimalType) type;
      BigInteger unscaled = new BigInteger(binary.getBytes());
      return Literal.of(new BigDecimal(unscaled, decimalType.scale())).to(type);
    }
    default:
      throw new IllegalArgumentException("Unsupported primitive type: " + type);
  }
}
 
Example 14
Source Project: parquet-mr   Source File: TestBinaryTruncator.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code checkContract} for the truncator of the given type against a set of inputs:
 * ordinary strings, a non-text byte sequence, and edge cases that cannot be truncated.
 *
 * @param type primitive type that supplies both the truncator and the comparator
 * @param strict flag forwarded to {@code checkContract} for inputs expected to be truncatable
 */
private void testTruncator(PrimitiveType type, boolean strict) {
  final BinaryTruncator truncator = BinaryTruncator.getTruncator(type);
  final Comparator<Binary> comparator = type.comparator();

  // Ordinary strings, including multi-byte characters.
  Binary plainAscii = Binary.fromString("aaaaaaaaaa");
  checkContract(truncator, comparator, plainAscii, strict, strict);
  Binary accented = Binary.fromString("árvíztűrő tükörfúrógép");
  checkContract(truncator, comparator, accented, strict, strict);
  Binary asciiThenMax3 = Binary.fromString("aaaaaaaaaa" + UTF8_3BYTES_MAX_CHAR);
  checkContract(truncator, comparator, asciiThenMax3, strict, strict);
  Binary mixedMaxChars = Binary.fromString("a" + UTF8_3BYTES_MAX_CHAR + UTF8_1BYTE_MAX_CHAR);
  checkContract(truncator, comparator, mixedMaxChars, strict, strict);

  // Raw bytes starting with 0xFE and ending with 0xFF.
  Binary rawBytes =
      Binary.fromConstantByteArray(new byte[] { (byte) 0xFE, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, (byte) 0xFF });
  checkContract(truncator, comparator, rawBytes, strict, strict);

  // Edge case: zero length -> unable to truncate
  Binary empty = Binary.fromString("");
  checkContract(truncator, comparator, empty, false, false);

  // Edge case: containing only UTF-8 max characters -> unable to truncate for max
  Binary onlyMaxChars = Binary.fromString(
      UTF8_1BYTE_MAX_CHAR
          + UTF8_4BYTES_MAX_CHAR
          + UTF8_3BYTES_MAX_CHAR
          + UTF8_4BYTES_MAX_CHAR
          + UTF8_2BYTES_MAX_CHAR
          + UTF8_3BYTES_MAX_CHAR
          + UTF8_3BYTES_MAX_CHAR
          + UTF8_1BYTE_MAX_CHAR
          + UTF8_2BYTES_MAX_CHAR
          + UTF8_3BYTES_MAX_CHAR
          + UTF8_4BYTES_MAX_CHAR);
  checkContract(truncator, comparator, onlyMaxChars, strict, false);

  // Edge case: non-UTF-8; max bytes -> unable to truncate for max
  checkContract(
      truncator, comparator,
      binary(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF),
      strict, false);
}
 
Example 15
Source Project: Bats   Source File: DrillParquetGroupConverter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Splits the binary into three ints read at byte offsets 0, 4 and 8 via
 * {@code ParquetReaderUtility.getIntFromLEBytes} (months, days, milliseconds) and writes the
 * populated holder.
 *
 * @param value the binary value holding the three fields
 */
@Override
public void addBinary(Binary value) {
  final byte[] raw = value.getBytes();

  holder.months = ParquetReaderUtility.getIntFromLEBytes(raw, 0);
  holder.days = ParquetReaderUtility.getIntFromLEBytes(raw, 4);
  holder.milliseconds = ParquetReaderUtility.getIntFromLEBytes(raw, 8);

  writer.write(holder);
}
 
Example 16
Source Project: parquet-mr   Source File: SimpleGroup.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Adds a binary value to the field at {@code fieldIndex}, wrapping it according to the field's
 * primitive type: BINARY and FIXED_LEN_BYTE_ARRAY become a {@code BinaryValue}, INT96 becomes an
 * {@code Int96Value}.
 *
 * @param fieldIndex index of the target field within this group's type
 * @param value the binary value to add
 * @throws UnsupportedOperationException if the field's primitive type cannot hold a binary value
 */
@Override
public void add(int fieldIndex, Binary value) {
  switch (getType().getType(fieldIndex).asPrimitiveType().getPrimitiveTypeName()) {
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
      add(fieldIndex, new BinaryValue(value));
      break;
    case INT96:
      add(fieldIndex, new Int96Value(value));
      break;
    default:
      // Name the offending field. The message previously called asPrimitiveType() on the group
      // type itself rather than on the field type, so building it could fail before this
      // exception was ever thrown.
      throw new UnsupportedOperationException(
          getType().getType(fieldIndex).asPrimitiveType().getName() + " not supported for Binary");
  }
}
 
Example 17
Source Project: parquet-mr   Source File: ProtoWriteSupportTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes a {@code TopMessage} with two inner messages (the first sets "one", the second sets
 * "two") and verifies the exact, ordered sequence of calls on the mocked {@code RecordConsumer}.
 */
@Test
public void testRepeatedInnerMessageMessage_scalar() throws Exception {
  RecordConsumer consumer = Mockito.mock(RecordConsumer.class);
  ProtoWriteSupport writeSupport = createReadConsumerInstance(TestProtobuf.TopMessage.class, consumer);

  TestProtobuf.TopMessage.Builder builder = TestProtobuf.TopMessage.newBuilder();
  builder.addInnerBuilder().setOne("one");
  builder.addInnerBuilder().setTwo("two");

  writeSupport.write(builder.build());

  InOrder ordered = Mockito.inOrder(consumer);

  ordered.verify(consumer).startMessage();
  ordered.verify(consumer).startField("inner", 0);

  // First inner message: only "one" is set.
  ordered.verify(consumer).startGroup();
  ordered.verify(consumer).startField("one", 0);
  ordered.verify(consumer).addBinary(Binary.fromConstantByteArray("one".getBytes()));
  ordered.verify(consumer).endField("one", 0);
  ordered.verify(consumer).endGroup();

  // Second inner message: only "two" is set.
  ordered.verify(consumer).startGroup();
  ordered.verify(consumer).startField("two", 1);
  ordered.verify(consumer).addBinary(Binary.fromConstantByteArray("two".getBytes()));
  ordered.verify(consumer).endField("two", 1);
  ordered.verify(consumer).endGroup();

  // Both groups live inside a single "inner" field, which closes before the message does.
  ordered.verify(consumer).endField("inner", 0);
  ordered.verify(consumer).endMessage();
  Mockito.verifyNoMoreInteractions(consumer);
}
 
Example 18
Source Project: parquet-mr   Source File: DictionaryFilterTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * A {@code gtEq} predicate against a column named "missing_column" must allow the block to be
 * dropped.
 */
@Test
public void testGtEqMissingColumn() throws Exception {
  BinaryColumn missing = binaryColumn("missing_column");

  boolean dropped = canDrop(gtEq(missing, Binary.fromString("any")), ccmd, dictionaries);

  assertTrue("Should drop block for any non-null query", dropped);
}
 
Example 19
Source Project: parquet-mr   Source File: ProtoWriteSupportTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes a proto3 {@code MessageA} whose inner message has only "one" set and verifies the exact,
 * ordered sequence of calls on the mocked {@code RecordConsumer}; fields "two" and "three" are
 * expected to be written as empty binaries.
 */
@Test
public void testProto3OptionalInnerMessage() throws Exception {
  RecordConsumer consumer = Mockito.mock(RecordConsumer.class);
  ProtoWriteSupport writeSupport = createReadConsumerInstance(TestProto3.MessageA.class, consumer);

  TestProto3.MessageA.Builder builder = TestProto3.MessageA.newBuilder();
  builder.getInnerBuilder().setOne("one");

  writeSupport.write(builder.build());

  InOrder ordered = Mockito.inOrder(consumer);

  ordered.verify(consumer).startMessage();
  ordered.verify(consumer).startField("inner", 0);

  ordered.verify(consumer).startGroup();
  // Explicitly set field.
  ordered.verify(consumer).startField("one", 0);
  ordered.verify(consumer).addBinary(Binary.fromConstantByteArray("one".getBytes()));
  ordered.verify(consumer).endField("one", 0);
  // Unset fields still appear, each with an empty value.
  ordered.verify(consumer).startField("two", 1);
  ordered.verify(consumer).addBinary(Binary.fromConstantByteArray("".getBytes()));
  ordered.verify(consumer).endField("two", 1);
  ordered.verify(consumer).startField("three", 2);
  ordered.verify(consumer).addBinary(Binary.fromConstantByteArray("".getBytes()));
  ordered.verify(consumer).endField("three", 2);
  ordered.verify(consumer).endGroup();

  ordered.verify(consumer).endField("inner", 0);
  ordered.verify(consumer).endMessage();
  Mockito.verifyNoMoreInteractions(consumer);
}
 
Example 20
Source Project: embulk-output-parquet   Source File: EmbulkWriteSupport.java    License: MIT License 5 votes vote down vote up
/**
 * Emits the column's string value to the consumer as a {@code Binary}; does nothing when the
 * record holds null for the column.
 *
 * @param column the column to read from the current record
 */
@Override
public void stringColumn(Column column)
{
    if (record.isNull(column)) {
        return;
    }
    final String text = record.getString(column);
    consumer.addBinary(Binary.fromString(text));
}
 
Example 21
Source Project: presto   Source File: LongDecimalColumnReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads one decimal value and appends it to the block builder.
 *
 * <p>When the current definition level equals the column's maximum, the bytes are read as an
 * unscaled {@code BigInteger}, rescaled from the Parquet decimal type to the Presto decimal type,
 * and written as a long or a slice depending on {@code isShort()}. Otherwise a null is appended
 * if {@code isValueNull()} holds.
 *
 * @param blockBuilder destination for the value
 * @param prestoType target type; must be a {@code DecimalType}
 * @throws ParquetDecodingException if {@code prestoType} is not a {@code DecimalType}
 */
@Override
protected void readValue(BlockBuilder blockBuilder, Type prestoType)
{
    if (!(prestoType instanceof DecimalType)) {
        throw new ParquetDecodingException(format("Unsupported Presto column type (%s) for Parquet column (%s)", prestoType, columnDescriptor));
    }

    DecimalType targetType = (DecimalType) prestoType;

    if (definitionLevel != columnDescriptor.getMaxDefinitionLevel()) {
        // No value at this position: append a null only when isValueNull() says so.
        if (isValueNull()) {
            blockBuilder.appendNull();
        }
        return;
    }

    Binary encoded = valuesReader.readBytes();
    Slice unscaled = Decimals.encodeUnscaledValue(new BigInteger(encoded.getBytes()));

    if (!targetType.isShort()) {
        prestoType.writeSlice(blockBuilder, longToLongCast(
                unscaled,
                parquetDecimalType.getPrecision(),
                parquetDecimalType.getScale(),
                targetType.getPrecision(),
                targetType.getScale()));
    }
    else {
        prestoType.writeLong(blockBuilder, longToShortCast(
                unscaled,
                parquetDecimalType.getPrecision(),
                parquetDecimalType.getScale(),
                targetType.getPrecision(),
                targetType.getScale()));
    }
}
 
Example 22
Source Project: parquet-mr   Source File: TestColumnIndexFiltering.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Checks that filtering agrees with a plain in-memory predicate for three equality filters: on
 * the long {@code id} column, on the binary {@code name} column, and on a null {@code name}.
 */
@Test
public void testSimpleFiltering() throws IOException {
  // Equality on the long column.
  assertCorrectFiltering(
      r -> r.getId() == 1234,
      eq(longColumn("id"), 1234L));

  // Equality on the binary column.
  assertCorrectFiltering(
      r -> "miller".equals(r.getName()),
      eq(binaryColumn("name"), Binary.fromString("miller")));

  // Equality with null selects the records whose name is null.
  assertCorrectFiltering(
      r -> r.getName() == null,
      eq(binaryColumn("name"), null));
}
 
Example 23
Source Project: parquet-mr   Source File: DictionaryValuesWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds the dictionary page from the first {@code lastUsedDictionarySize} dictionary keys.
 *
 * @return the dictionary page, or {@code null} when {@code lastUsedDictionarySize} is not positive
 */
@Override
public DictionaryPage toDictPageAndClose() {
  if (lastUsedDictionarySize <= 0) {
    // The dictionary was never used, so there is nothing to emit.
    return null;
  }

  FixedLenByteArrayPlainValuesWriter encoder = new FixedLenByteArrayPlainValuesWriter(
      length, lastUsedDictionaryByteSize, maxDictionaryByteSize, allocator);
  Iterator<Binary> keys = binaryDictionaryContent.keySet().iterator();

  // Only the part of the dictionary that was actually used is written.
  for (int written = 0; written < lastUsedDictionarySize; written++) {
    encoder.writeBytes(keys.next());
  }

  return dictPage(encoder);
}
 
Example 24
Source Project: flink   Source File: RowConverter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Converts a binary value and stores it at {@code pos} in the parent data holder.
 *
 * <p>INT96 values become a {@code Timestamp}. Otherwise the value is decoded as UTF-8: DECIMAL is
 * parsed into a {@code BigDecimal} from that text, while UTF8, ENUM, JSON, BSON and values without
 * an original type are stored as the string itself.
 *
 * @param value the binary value to convert
 * @throws UnsupportedOperationException for any other original type
 */
@Override
public void addBinary(Binary value) {
	// in case it is a timestamp type stored as INT96
	if (primitiveTypeName.equals(PrimitiveType.PrimitiveTypeName.INT96)) {
		parentDataHolder.add(pos, new Timestamp(ParquetTimestampUtils.getTimestampMillis(value)));
		return;
	}

	// Without an original type annotation the bytes are treated as plain text.
	if (originalType == null) {
		parentDataHolder.add(pos, value.toStringUsingUTF8());
		return;
	}

	switch (originalType) {
		case DECIMAL:
			// NOTE(review): this parses the bytes as decimal text — confirm that matches how the
			// files being read encode decimals.
			parentDataHolder.add(pos, new BigDecimal(value.toStringUsingUTF8().toCharArray()));
			break;
		case UTF8:
		case ENUM:
		case JSON:
		case BSON:
			parentDataHolder.add(pos, value.toStringUsingUTF8());
			break;
		default:
			throw new UnsupportedOperationException("Unsupported original type : " + originalType.name()
				+ " for primitive type BINARY");
	}
}
 
Example 25
Source Project: flink   Source File: ParquetTableSource.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the Parquet filter column and the comparable literal from a binary comparison.
 *
 * <p>BYTE, SHORT and INT literals map to an int column, LONG, FLOAT, BOOLEAN and DOUBLE to their
 * matching columns, and STRING to a binary column with the literal converted via
 * {@code Binary.fromString}.
 *
 * @param comp the comparison to analyse
 * @return the column/literal pair, or {@code null} if the literal is null, not {@code Comparable},
 *     or of an unsupported type
 */
@Nullable
private Tuple2<Column, Comparable> extractColumnAndLiteral(BinaryComparison comp) {
	TypeInformation<?> typeInfo = getLiteralType(comp);
	String columnName = getColumnName(comp);

	// fetch literal and ensure it is comparable
	Object value = getLiteral(comp);
	// validate that literal is comparable
	if (!(value instanceof Comparable)) {
		// instanceof is false for null as well, so the class lookup must be guarded: a null
		// literal previously caused a NullPointerException here instead of a clean rejection.
		String literalType = value == null ? "null" : value.getClass().getCanonicalName();
		LOG.warn("Encountered a non-comparable literal of type {}." +
			"Cannot push predicate [{}] into ParquetTablesource." +
			"This is a bug and should be reported.", literalType, comp);
		return null;
	}

	if (typeInfo == BasicTypeInfo.BYTE_TYPE_INFO ||
		typeInfo == BasicTypeInfo.SHORT_TYPE_INFO ||
		typeInfo == BasicTypeInfo.INT_TYPE_INFO) {
		return new Tuple2<>(FilterApi.intColumn(columnName), (Integer) value);
	} else if (typeInfo == BasicTypeInfo.LONG_TYPE_INFO) {
		return new Tuple2<>(FilterApi.longColumn(columnName), (Long) value);
	} else if (typeInfo == BasicTypeInfo.FLOAT_TYPE_INFO) {
		return new Tuple2<>(FilterApi.floatColumn(columnName), (Float) value);
	} else if (typeInfo == BasicTypeInfo.BOOLEAN_TYPE_INFO) {
		return new Tuple2<>(FilterApi.booleanColumn(columnName), (Boolean) value);
	} else if (typeInfo == BasicTypeInfo.DOUBLE_TYPE_INFO) {
		return new Tuple2<>(FilterApi.doubleColumn(columnName), (Double) value);
	} else if (typeInfo == BasicTypeInfo.STRING_TYPE_INFO) {
		return new Tuple2<>(FilterApi.binaryColumn(columnName), Binary.fromString((String) value));
	} else {
		// unsupported type
		return null;
	}
}
 
Example 26
Source Project: parquet-mr   Source File: TestDeltaLengthByteArray.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Round-trips the samples returned by {@code Utils.getRandomStringSamples(1000, 32)} through the
 * delta-length byte-array writer and reader and checks that every decoded entry equals its input.
 */
@Test
public void testRandomStrings() throws IOException {
  DeltaLengthByteArrayValuesWriter encoder = getDeltaLengthByteArrayValuesWriter();
  DeltaLengthByteArrayValuesReader decoder = new DeltaLengthByteArrayValuesReader();

  String[] samples = Utils.getRandomStringSamples(1000, 32);
  Utils.writeData(encoder, samples);
  Binary[] decoded = Utils.readData(decoder, encoder.getBytes().toInputStream(), samples.length);

  for (int idx = 0; idx < decoded.length; ++idx) {
    Binary expected = Binary.fromString(samples[idx]);
    Assert.assertEquals(expected, decoded[idx]);
  }
}
 
Example 27
Source Project: parquet-mr   Source File: TestColumnIndexes.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds one group by drawing a value from each column's generator and appending it under the
 * column's name with the Java type that matches the column's primitive type.
 *
 * <p>A column is skipped when it has no generator, or, for optional columns, when
 * {@code random.nextInt(50)} returns 0.
 *
 * @param generators one value supplier per schema column; entries may be {@code null}
 * @param random source of randomness for skipping optional columns
 * @return the populated group
 */
private Group createGroup(List<Supplier<?>> generators, Random random) {
  final Group result = FACTORY.newGroup();
  final int fieldCount = SCHEMA.getFieldCount();

  for (int index = 0; index < fieldCount; ++index) {
    final Type fieldType = SCHEMA.getType(index);
    final Supplier<?> source = generators.get(index);

    // 2% chance of null value for an optional column
    if (source == null || (fieldType.isRepetition(OPTIONAL) && random.nextInt(50) == 0)) {
      continue;
    }

    final String fieldName = fieldType.getName();
    switch (fieldType.asPrimitiveType().getPrimitiveTypeName()) {
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
    case INT96:
      result.append(fieldName, (Binary) source.get());
      break;
    case INT32:
      result.append(fieldName, (Integer) source.get());
      break;
    case INT64:
      result.append(fieldName, (Long) source.get());
      break;
    case FLOAT:
      result.append(fieldName, (Float) source.get());
      break;
    case DOUBLE:
      result.append(fieldName, (Double) source.get());
      break;
    case BOOLEAN:
      result.append(fieldName, (Boolean) source.get());
      break;
    }
  }

  return result;
}
 
Example 28
Source Project: iceberg   Source File: ParquetMetricsRowGroupFilter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Decides whether a row group can contain values starting with the given prefix.
 *
 * <p>The column's min and max statistics are each truncated to at most the prefix length and
 * compared with the prefix as unsigned bytes. If the truncated min sorts after the prefix, or the
 * truncated max sorts before it, no row can match.
 *
 * @param ref reference to the column being tested
 * @param lit literal holding the prefix
 * @return {@code ROWS_CANNOT_MATCH} when the value count is missing, the column has no non-null
 *     value, or the bounds exclude the prefix; {@code ROWS_MIGHT_MATCH} otherwise
 */
@Override
@SuppressWarnings("unchecked")
public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) {
  final int fieldId = ref.fieldId();

  if (valueCounts.get(fieldId) == null) {
    // the column is not present and is all nulls
    return ROWS_CANNOT_MATCH;
  }

  final Statistics<Binary> columnStats = (Statistics<Binary>) stats.get(fieldId);
  if (columnStats == null || columnStats.isEmpty()) {
    // Without usable statistics nothing can be ruled out.
    return ROWS_MIGHT_MATCH;
  }
  if (!columnStats.hasNonNullValue()) {
    return ROWS_CANNOT_MATCH;
  }

  final ByteBuffer prefix = lit.toByteBuffer();
  final Comparator<ByteBuffer> unsignedOrder = Comparators.unsignedBytes();

  // Lower bound, cut down to at most the prefix length.
  final Binary min = columnStats.genericGetMin();
  final int minLen = Math.min(prefix.remaining(), min.length());
  if (unsignedOrder.compare(BinaryUtil.truncateBinary(min.toByteBuffer(), minLen), prefix) > 0) {
    return ROWS_CANNOT_MATCH;
  }

  // Upper bound, cut down the same way.
  final Binary max = columnStats.genericGetMax();
  final int maxLen = Math.min(prefix.remaining(), max.length());
  if (unsignedOrder.compare(BinaryUtil.truncateBinary(max.toByteBuffer(), maxLen), prefix) < 0) {
    return ROWS_CANNOT_MATCH;
  }

  return ROWS_MIGHT_MATCH;
}
 
Example 29
Source Project: parquet-mr   Source File: DictionaryFilterTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Checks {@code canDrop} for equality predicates on "binary_field": the block is kept for "c" and
 * for null, and dropped for "A".
 */
@Test
public void testEqBinary() throws Exception {
  BinaryColumn column = binaryColumn("binary_field");

  FilterPredicate lowerCase = eq(column, Binary.fromString("c"));
  assertFalse("Should not drop block for lower case letters",
      canDrop(lowerCase, ccmd, dictionaries));

  FilterPredicate upperCase = eq(column, Binary.fromString("A"));
  assertTrue("Should drop block for upper case letters",
      canDrop(upperCase, ccmd, dictionaries));

  assertFalse("Should not drop block for null",
      canDrop(eq(column, null), ccmd, dictionaries));
}
 
Example 30
Source Project: parquet-mr   Source File: TestPrimitiveStringifier.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Exercises {@code UTF8_STRINGIFIER} with null, an empty binary, an ASCII string and a non-ASCII
 * string, then runs {@code checkThrowingUnsupportedException} for it with {@code Binary.class}.
 */
@Test
public void testUTF8Stringifier() {
  final PrimitiveStringifier utf8 = UTF8_STRINGIFIER;

  // Null and empty inputs.
  assertEquals("null", utf8.stringify(null));
  assertEquals("", utf8.stringify(Binary.EMPTY));

  // Plain ASCII text built from a String.
  Binary ascii = Binary.fromString("This is a UTF-8 test");
  assertEquals("This is a UTF-8 test", utf8.stringify(ascii));

  // Multi-byte text built from explicitly UTF-8 encoded bytes.
  Binary japanese = Binary.fromConstantByteArray("これはUTF-8のテストです".getBytes(UTF_8));
  assertEquals("これはUTF-8のテストです", utf8.stringify(japanese));

  checkThrowingUnsupportedException(utf8, Binary.class);
}