org.apache.parquet.io.api.Binary Java Examples
The following examples show how to use
org.apache.parquet.io.api.Binary.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ParquetConverter.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
/**
 * Converts a 12-byte INT96 value into a millisecond count relative to
 * {@code ParquetSpec.JULIAN_DAY_OF_EPOCH}.
 *
 * <p>Bytes 0..7 are read as a little-endian nanoseconds-of-day value and bytes 8..11 as a
 * little-endian Julian day number.
 *
 * @param value binary whose byte array must be exactly 12 bytes long
 * @return whole days since the reference Julian day, in milliseconds, plus the time of day
 *         truncated to milliseconds
 * @throws RuntimeException if the byte array is not 12 bytes long
 */
private static long dateFromInt96( Binary value ) {
  byte[] raw = value.getBytes();
  if ( raw.length != 12 ) {
    throw new RuntimeException( "Invalid byte array length for INT96" );
  }

  // Fold from the most significant byte (index 7) down to the least significant (index 0).
  long nanosOfDay = 0L;
  for ( int i = 7; i >= 0; i-- ) {
    nanosOfDay = ( nanosOfDay << 8 ) | ( raw[ i ] & 0xFFL );
  }

  // Same folding for the four Julian-day bytes (indices 11 down to 8).
  int julianDay = 0;
  for ( int i = 11; i >= 8; i-- ) {
    julianDay = ( julianDay << 8 ) | ( raw[ i ] & 0xFF );
  }

  long millisOfWholeDays = ( julianDay - ParquetSpec.JULIAN_DAY_OF_EPOCH ) * 24L * 60L * 60L * 1000L;
  return millisOfWholeDays + nanosOfDay / 1000000;
}
Example #2
Source File: ParquetValueWriters.java From iceberg with Apache License 2.0 | 6 votes |
/**
 * Writes a decimal as a sign-extended byte array of {@code length} bytes.
 *
 * <p>The unscaled value's two's-complement bytes are right-aligned in the reusable buffer
 * and the leading bytes are filled with 0xFF for negative values or 0x00 otherwise.
 *
 * @param repetitionLevel repetition level passed through to the column writer
 * @param decimal value to write; its scale must equal {@code scale} and its precision must
 *                not exceed {@code precision}
 */
@Override
public void write(int repetitionLevel, BigDecimal decimal) {
  Preconditions.checkArgument(decimal.scale() == scale,
      "Cannot write value as decimal(%s,%s), wrong scale: %s", precision, scale, decimal);
  Preconditions.checkArgument(decimal.precision() <= precision,
      "Cannot write value as decimal(%s,%s), too large: %s", precision, scale, decimal);

  byte padding = decimal.signum() < 0 ? (byte) 0xFF : (byte) 0x00;
  byte[] unscaledBytes = decimal.unscaledValue().toByteArray();
  byte[] target = bytes.get();
  int firstValueIndex = length - unscaledBytes.length;

  for (int pos = 0; pos < length; pos += 1) {
    // Positions before firstValueIndex are sign padding; the rest come from the value.
    target[pos] = pos < firstValueIndex ? padding : unscaledBytes[pos - firstValueIndex];
  }

  column.writeBinary(repetitionLevel, Binary.fromReusedByteArray(target));
}
Example #3
Source File: FixedBinaryTestUtils.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Builds a {@link Binary} of exactly {@code length} bytes from a {@link BigInteger}.
 *
 * <p>When the two's-complement representation is shorter than {@code length}, it is
 * right-aligned and the leading bytes are filled with 0xFF (negative) or 0x00 (otherwise).
 *
 * @param length required size of the result in bytes
 * @param bigInt value to encode
 * @return a binary backed by a byte array of {@code length} bytes
 * @throws IllegalArgumentException if the value needs more than {@code length} bytes
 */
public static Binary getFixedBinary(int length, BigInteger bigInt) {
  byte[] raw = bigInt.toByteArray();

  if (raw.length > length) {
    throw new IllegalArgumentException(
        "Specified BigInteger (" + bigInt + ") is too long for fixed bytes ("
            + raw.length + '>' + length + ')');
  }
  if (raw.length == length) {
    return Binary.fromConstantByteArray(raw);
  }

  byte[] padded = new byte[length];
  int padCount = length - raw.length;
  byte fill = bigInt.signum() < 0 ? (byte) 0xFF : (byte) 0x00;
  Arrays.fill(padded, 0, padCount, fill);
  System.arraycopy(raw, 0, padded, padCount, raw.length);
  return Binary.fromConstantByteArray(padded);
}
Example #4
Source File: ProtoWriteSupportTest.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Writes a message whose optional inner message has field "one" set and verifies the exact
 * sequence of calls made on the mocked record consumer.
 */
@Test
public void testOptionalInnerMessage() throws Exception {
  RecordConsumer consumer = Mockito.mock(RecordConsumer.class);
  ProtoWriteSupport instance = createReadConsumerInstance(TestProtobuf.MessageA.class, consumer);

  TestProtobuf.MessageA.Builder builder = TestProtobuf.MessageA.newBuilder();
  builder.getInnerBuilder().setOne("one");

  instance.write(builder.build());

  Binary expectedOne = Binary.fromConstantByteArray("one".getBytes());
  InOrder ordered = Mockito.inOrder(consumer);

  ordered.verify(consumer).startMessage();
  ordered.verify(consumer).startField("inner", 0);
  ordered.verify(consumer).startGroup();
  ordered.verify(consumer).startField("one", 0);
  ordered.verify(consumer).addBinary(expectedOne);
  ordered.verify(consumer).endField("one", 0);
  ordered.verify(consumer).endGroup();
  ordered.verify(consumer).endField("inner", 0);
  ordered.verify(consumer).endMessage();
  Mockito.verifyNoMoreInteractions(consumer);
}
Example #5
Source File: ParquetFilters.java From iceberg with Apache License 2.0 | 6 votes |
@SuppressWarnings("unchecked") private static <C extends Comparable<C>> C getParquetPrimitive(Literal<?> lit) { if (lit == null) { return null; } // TODO: this needs to convert to handle BigDecimal and UUID Object value = lit.value(); if (value instanceof Number) { return (C) lit.value(); } else if (value instanceof CharSequence) { return (C) Binary.fromString(value.toString()); } else if (value instanceof ByteBuffer) { return (C) Binary.fromReusedByteBuffer((ByteBuffer) value); } throw new UnsupportedOperationException( "Type not supported yet: " + value.getClass().getName()); }
Example #6
Source File: VarLenEntryDictionaryReader.java From Bats with Apache License 2.0 | 6 votes |
private final VarLenColumnBulkEntry getEntrySingle(int valsToReadWithinPage) { final DictionaryReaderWrapper valueReader = pageInfo.dictionaryValueReader; final int[] valueLengths = entry.getValuesLength(); final Binary currEntry = valueReader.getEntry(); final int dataLen = currEntry.length(); // Is there enough memory to handle this large value? if (batchMemoryConstraintsReached(0, 4, dataLen)) { entry.set(0, 0, 0, 0); // no data to be consumed return entry; } // Set the value length valueLengths[0] = dataLen; // Now set the bulk entry entry.set(0, dataLen, 1, 1, currEntry.getBytes()); return entry; }
Example #7
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Feeds a generated min and max value into binary statistics, converts them with a
 * converter configured for {@code truncateLen}, and checks the resulting bounds.
 *
 * @param truncateLen truncation length passed to the metadata converter
 * @param minLen length used to generate the minimum value
 * @param maxLen length used to generate the maximum value
 */
private void testBinaryStatsWithTruncation(int truncateLen, int minLen, int maxLen) {
  byte[] minBytes = generateRandomString("a", minLen).getBytes();
  byte[] maxBytes = generateRandomString("b", maxLen).getBytes();

  BinaryStatistics stats = new BinaryStatistics();
  stats.updateStats(Binary.fromConstantByteArray(minBytes));
  stats.updateStats(Binary.fromConstantByteArray(maxBytes));

  ParquetMetadataConverter converter = new ParquetMetadataConverter(truncateLen);
  org.apache.parquet.format.Statistics converted = converter.toParquetStatistics(stats);

  if (minLen + maxLen >= ParquetMetadataConverter.MAX_STATS_SIZE) {
    // Statistics at or above the size limit are expected to be dropped.
    assertNull(converted.getMin_value());
    assertNull(converted.getMax_value());
    return;
  }

  Charset utf8 = Charset.forName("UTF-8");

  // The stored minimum must not sort after the real minimum.
  String expectedMin = new String(minBytes, utf8);
  String storedMin = new String(converted.getMin_value(), utf8);
  assertTrue(storedMin.compareTo(expectedMin) <= 0);

  // The stored maximum must not sort before the real maximum.
  String expectedMax = new String(maxBytes, utf8);
  String storedMax = new String(converted.getMax_value(), utf8);
  assertTrue(storedMax.compareTo(expectedMax) >= 0);
}
Example #8
Source File: ParquetTableMetadataUtils.java From Bats with Apache License 2.0 | 6 votes |
/**
 * Converts a supported value into a {@link Long}.
 *
 * <p>{@code Long} values are returned unchanged, any other {@link Number} is narrowed or
 * widened with {@link Number#longValue()}, strings are parsed as decimal longs, and byte
 * arrays or {@link Binary} values are interpreted as a big-endian two's-complement integer.
 *
 * @param value value to convert
 * @return the value as a {@code Long}
 * @throws NumberFormatException if {@code value} is a string that is not a parsable long
 * @throws UnsupportedOperationException if the value's type is not supported
 */
private static Long getLong(Object value) {
  if (value instanceof Long) {
    return (Long) value;
  } else if (value instanceof Number) {
    // Covers Integer, Float and Double as before, plus the remaining Number subtypes.
    return ((Number) value).longValue();
  } else if (value instanceof String) {
    return Long.parseLong((String) value);
  } else if (value instanceof byte[]) {
    return new BigInteger((byte[]) value).longValue();
  } else if (value instanceof Binary) {
    return new BigInteger(((Binary) value).getBytes()).longValue();
  }
  // The message names the target type of this method (Long, not Integer).
  throw new UnsupportedOperationException(String.format("Cannot obtain Long using value %s", value));
}
Example #9
Source File: SparkParquetReaders.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Reads the next binary value from the column as a {@code UTF8String}.
 *
 * <p>When the value's byte buffer exposes a backing array, the string is created from that
 * array's relevant range; otherwise the bytes are obtained via {@code getBytes()}.
 *
 * @param ignored not used
 * @return the next value as a {@code UTF8String}
 */
@Override
public UTF8String read(UTF8String ignored) {
  Binary next = column.nextBinary();
  ByteBuffer view = next.toByteBuffer();

  if (!view.hasArray()) {
    return UTF8String.fromBytes(next.getBytes());
  }

  byte[] backing = view.array();
  int start = view.arrayOffset() + view.position();
  return UTF8String.fromBytes(backing, start, view.remaining());
}
Example #10
Source File: DeltaByteArrayWriter.java From parquet-mr with Apache License 2.0 | 5 votes |
@Override public void writeBytes(Binary v) { int i = 0; byte[] vb = v.getBytes(); int length = previous.length < vb.length ? previous.length : vb.length; // find the number of matching prefix bytes between this value and the previous one for(i = 0; (i < length) && (previous[i] == vb[i]); i++); prefixLengthWriter.writeInteger(i); suffixWriter.writeBytes(v.slice(i, vb.length - i)); previous = vb; }
Example #11
Source File: TestDeltaByteArray.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Writes all values, then reads them back while alternating one read with one skip and
 * asserts that every value at an even index is returned by the read.
 *
 * @param writer writer receiving {@code vals}
 * @param reader reader initialised from the writer's bytes
 * @param vals values to write
 */
private void assertReadWriteWithSkip(DeltaByteArrayWriter writer, DeltaByteArrayReader reader, String[] vals) throws Exception {
  Utils.writeData(writer, vals);
  reader.initFromPage(vals.length, writer.getBytes().toInputStream());

  int index = 0;
  while (index < vals.length) {
    Binary expected = Binary.fromString(vals[index]);
    Assert.assertEquals(expected, reader.readBytes());
    reader.skip();
    index += 2;
  }
}
Example #12
Source File: TestCorruptDeltaByteArrays.java From parquet-mr with Apache License 2.0 | 5 votes |
@Test public void testOldReassemblyWithoutCorruption() throws Exception { DeltaByteArrayWriter writer = getDeltaByteArrayWriter(); for (int i = 0; i < 10; i += 1) { writer.writeBytes(Binary.fromString(str(i))); } ByteBuffer firstPageBytes = writer.getBytes().toByteBuffer(); writer.reset(); // sets previous to new byte[0] for (int i = 10; i < 20; i += 1) { writer.writeBytes(Binary.fromString(str(i))); } ByteBuffer secondPageBytes = writer.getBytes().toByteBuffer(); DeltaByteArrayReader firstPageReader = new DeltaByteArrayReader(); firstPageReader.initFromPage(10, ByteBufferInputStream.wrap(firstPageBytes)); for (int i = 0; i < 10; i += 1) { assertEquals(firstPageReader.readBytes().toStringUsingUTF8(), str(i)); } DeltaByteArrayReader secondPageReader = new DeltaByteArrayReader(); secondPageReader.initFromPage(10, ByteBufferInputStream.wrap(secondPageBytes)); for (int i = 10; i < 20; i += 1) { assertEquals(secondPageReader.readBytes().toStringUsingUTF8(), str(i)); } }
Example #13
Source File: MessageColumnIO.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Writes a binary value to the current column writer using the current repetition level
 * and the current column's definition level, then updates the repetition level.
 *
 * @param value value to write
 */
@Override
public void addBinary(Binary value) {
  if (DEBUG) {
    log("addBinary({} bytes)", value.length());
  }

  // A value is being written, so the field is no longer empty.
  emptyField = false;
  getColumnWriter().write(value, r[currentLevel], currentColumnIO.getDefinitionLevel());
  setRepetitionLevel();

  if (DEBUG) {
    printState();
  }
}
Example #14
Source File: ThriftRecordConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Creates a converter for an enum field and fills the lookup table that maps each enum
 * value's name, as a {@link Binary}, to its id.
 *
 * @param events list stored on this converter
 * @param field field whose type is cast to {@code EnumType}
 */
public FieldEnumConverter(List<TProtocol> events, ThriftField field) {
  this.events = events;
  this.field = field;

  final EnumType enumType = (EnumType) field.getType();
  for (EnumValue candidate : enumType.getValues()) {
    Binary key = Binary.fromString(candidate.getName());
    enumLookup.put(key, candidate.getId());
  }
}
Example #15
Source File: NanoTime.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Serialises this value into 12 little-endian bytes: the time of day in nanoseconds
 * (8 bytes) followed by the Julian day (4 bytes).
 *
 * @return a binary wrapping the 12-byte buffer
 */
public Binary toBinary() {
  ByteBuffer encoded = ByteBuffer.allocate(12).order(ByteOrder.LITTLE_ENDIAN);
  encoded.putLong(timeOfDayNanos).putInt(julianDay);
  // Switch the buffer from writing to reading before handing it over.
  encoded.flip();
  return Binary.fromConstantByteBuffer(encoded);
}
Example #16
Source File: TestPrimitiveStringifier.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Checks the UTF-8 stringifier on null, empty, ASCII and non-ASCII binaries, and that the
 * other value types are rejected.
 */
@Test
public void testUTF8Stringifier() {
  PrimitiveStringifier utf8Stringifier = UTF8_STRINGIFIER;

  assertEquals("null", utf8Stringifier.stringify(null));
  assertEquals("", utf8Stringifier.stringify(Binary.EMPTY));

  String ascii = "This is a UTF-8 test";
  assertEquals("This is a UTF-8 test", utf8Stringifier.stringify(Binary.fromString(ascii)));

  Binary japanese = Binary.fromConstantByteArray("これはUTF-8のテストです".getBytes(UTF_8));
  assertEquals("これはUTF-8のテストです", utf8Stringifier.stringify(japanese));

  checkThrowingUnsupportedException(utf8Stringifier, Binary.class);
}
Example #17
Source File: ColumnPredicates.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Wraps a function over {@link Binary} values as a column predicate.
 *
 * @param fn function evaluated against the binary read from the column reader
 * @return a predicate that returns the function's result for the reader's binary value
 */
public static Predicate applyFunctionToBinary(final PredicateFunction<Binary> fn) {
  return new Predicate() {
    @Override
    public boolean apply(ColumnReader input) {
      final Binary current = input.getBinary();
      return fn.functionToApply(current);
    }
  };
}
Example #18
Source File: DictionaryFilterTest.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Checks dictionary-based block dropping for equality predicates on a binary column with a
 * lower-case value, an upper-case value, and null.
 */
@Test
public void testEqBinary() throws Exception {
  BinaryColumn column = binaryColumn("binary_field");

  FilterPredicate lowerCase = eq(column, Binary.fromString("c"));
  assertFalse("Should not drop block for lower case letters",
      canDrop(lowerCase, ccmd, dictionaries));

  FilterPredicate upperCase = eq(column, Binary.fromString("A"));
  assertTrue("Should drop block for upper case letters",
      canDrop(upperCase, ccmd, dictionaries));

  assertFalse("Should not drop block for null",
      canDrop(eq(column, null), ccmd, dictionaries));
}
Example #19
Source File: EmbulkWriteSupport.java From embulk-output-parquet with MIT License | 5 votes |
/**
 * Adds the record's string value for the column to the consumer as a {@link Binary};
 * does nothing when the record holds null for that column.
 *
 * @param column column to read from the current record
 */
@Override
public void stringColumn(Column column) {
  if (record.isNull(column)) {
    return;
  }
  Binary encoded = Binary.fromString(record.getString(column));
  consumer.addBinary(encoded);
}
Example #20
Source File: TestDictionary.java From parquet-mr with Apache License 2.0 | 5 votes |
@Test public void testBinaryDictionaryFallBack() throws IOException { int slabSize = 100; int maxDictionaryByteSize = 50; final ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(maxDictionaryByteSize, slabSize); int fallBackThreshold = maxDictionaryByteSize; int dataSize=0; for (long i = 0; i < 100; i++) { Binary binary = Binary.fromString("str" + i); cw.writeBytes(binary); dataSize += (binary.length() + 4); if (dataSize < fallBackThreshold) { assertEquals(PLAIN_DICTIONARY, cw.getEncoding()); } else { assertEquals(PLAIN, cw.getEncoding()); } } //Fallbacked to Plain encoding, therefore use PlainValuesReader to read it back ValuesReader reader = new BinaryPlainValuesReader(); reader.initFromPage(100, cw.getBytes().toInputStream()); for (long i = 0; i < 100; i++) { assertEquals(Binary.fromString("str" + i), reader.readBytes()); } //simulate cutting the page cw.reset(); assertEquals(0, cw.getBufferedSize()); }
Example #21
Source File: FilteringPrimitiveConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Passes the value to every registered value inspector and then forwards it to the
 * delegate converter.
 *
 * @param value value being added
 */
@Override
public void addBinary(Binary value) {
  // Inspectors see the value before the delegate does.
  for (ValueInspector inspector : valueInspectors) {
    inspector.update(value);
  }

  delegate.addBinary(value);
}
Example #22
Source File: ParquetMetricsRowGroupFilter.java From iceberg with Apache License 2.0 | 5 votes |
@Override @SuppressWarnings("unchecked") public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) { int id = ref.fieldId(); Long valueCount = valueCounts.get(id); if (valueCount == null) { // the column is not present and is all nulls return ROWS_CANNOT_MATCH; } Statistics<Binary> colStats = (Statistics<Binary>) stats.get(id); if (colStats != null && !colStats.isEmpty()) { if (!colStats.hasNonNullValue()) { return ROWS_CANNOT_MATCH; } ByteBuffer prefixAsBytes = lit.toByteBuffer(); Comparator<ByteBuffer> comparator = Comparators.unsignedBytes(); Binary lower = colStats.genericGetMin(); // truncate lower bound so that its length in bytes is not greater than the length of prefix int lowerLength = Math.min(prefixAsBytes.remaining(), lower.length()); int lowerCmp = comparator.compare(BinaryUtil.truncateBinary(lower.toByteBuffer(), lowerLength), prefixAsBytes); if (lowerCmp > 0) { return ROWS_CANNOT_MATCH; } Binary upper = colStats.genericGetMax(); // truncate upper bound so that its length in bytes is not greater than the length of prefix int upperLength = Math.min(prefixAsBytes.remaining(), upper.length()); int upperCmp = comparator.compare(BinaryUtil.truncateBinary(upper.toByteBuffer(), upperLength), prefixAsBytes); if (upperCmp < 0) { return ROWS_CANNOT_MATCH; } } return ROWS_MIGHT_MATCH; }
Example #23
Source File: TestColumnIndexes.java From parquet-mr with Apache License 2.0 | 5 votes |
private Group createGroup(List<Supplier<?>> generators, Random random) { Group group = FACTORY.newGroup(); for (int column = 0, columnCnt = SCHEMA.getFieldCount(); column < columnCnt; ++column) { Type type = SCHEMA.getType(column); Supplier<?> generator = generators.get(column); // 2% chance of null value for an optional column if (generator == null || (type.isRepetition(OPTIONAL) && random.nextInt(50) == 0)) { continue; } switch (type.asPrimitiveType().getPrimitiveTypeName()) { case BINARY: case FIXED_LEN_BYTE_ARRAY: case INT96: group.append(type.getName(), (Binary) generator.get()); break; case INT32: group.append(type.getName(), (Integer) generator.get()); break; case INT64: group.append(type.getName(), (Long) generator.get()); break; case FLOAT: group.append(type.getName(), (Float) generator.get()); break; case DOUBLE: group.append(type.getName(), (Double) generator.get()); break; case BOOLEAN: group.append(type.getName(), (Boolean) generator.get()); break; } } return group; }
Example #24
Source File: TestDeltaLengthByteArray.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Round-trips 1000 random string samples through the delta-length writer and reader and
 * checks that every value read back equals the value written.
 */
@Test
public void testRandomStrings() throws IOException {
  DeltaLengthByteArrayValuesWriter writer = getDeltaLengthByteArrayValuesWriter();
  DeltaLengthByteArrayValuesReader reader = new DeltaLengthByteArrayValuesReader();

  String[] samples = Utils.getRandomStringSamples(1000, 32);
  Utils.writeData(writer, samples);

  Binary[] decoded = Utils.readData(reader, writer.getBytes().toInputStream(), samples.length);

  int index = 0;
  while (index < decoded.length) {
    Assert.assertEquals(Binary.fromString(samples[index]), decoded[index]);
    index++;
  }
}
Example #25
Source File: ParquetTableSource.java From flink with Apache License 2.0 | 5 votes |
@Nullable private Tuple2<Column, Comparable> extractColumnAndLiteral(BinaryComparison comp) { TypeInformation<?> typeInfo = getLiteralType(comp); String columnName = getColumnName(comp); // fetch literal and ensure it is comparable Object value = getLiteral(comp); // validate that literal is comparable if (!(value instanceof Comparable)) { LOG.warn("Encountered a non-comparable literal of type {}." + "Cannot push predicate [{}] into ParquetTablesource." + "This is a bug and should be reported.", value.getClass().getCanonicalName(), comp); return null; } if (typeInfo == BasicTypeInfo.BYTE_TYPE_INFO || typeInfo == BasicTypeInfo.SHORT_TYPE_INFO || typeInfo == BasicTypeInfo.INT_TYPE_INFO) { return new Tuple2<>(FilterApi.intColumn(columnName), (Integer) value); } else if (typeInfo == BasicTypeInfo.LONG_TYPE_INFO) { return new Tuple2<>(FilterApi.longColumn(columnName), (Long) value); } else if (typeInfo == BasicTypeInfo.FLOAT_TYPE_INFO) { return new Tuple2<>(FilterApi.floatColumn(columnName), (Float) value); } else if (typeInfo == BasicTypeInfo.BOOLEAN_TYPE_INFO) { return new Tuple2<>(FilterApi.booleanColumn(columnName), (Boolean) value); } else if (typeInfo == BasicTypeInfo.DOUBLE_TYPE_INFO) { return new Tuple2<>(FilterApi.doubleColumn(columnName), (Double) value); } else if (typeInfo == BasicTypeInfo.STRING_TYPE_INFO) { return new Tuple2<>(FilterApi.binaryColumn(columnName), Binary.fromString((String) value)); } else { // unsupported type return null; } }
Example #26
Source File: DictionaryValuesWriter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Encodes a value as its dictionary id, adding a copy of the value to the dictionary when
 * it has not been seen before.
 *
 * @param value value to encode
 */
@Override
public void writeBytes(Binary value) {
  final int knownId = binaryDictionaryContent.getInt(value);
  final int id;

  if (knownId != -1) {
    id = knownId;
  } else {
    // New entry: its id is the dictionary size before insertion.
    id = binaryDictionaryContent.size();
    binaryDictionaryContent.put(value.copy(), id);
    dictionaryByteSize += length;
  }

  encodedValues.add(id);
}
Example #27
Source File: DictionaryValuesWriter.java From parquet-mr with Apache License 2.0 | 5 votes |
@Override public DictionaryPage toDictPageAndClose() { if (lastUsedDictionarySize > 0) { // return a dictionary only if we actually used it FixedLenByteArrayPlainValuesWriter dictionaryEncoder = new FixedLenByteArrayPlainValuesWriter(length, lastUsedDictionaryByteSize, maxDictionaryByteSize, allocator); Iterator<Binary> binaryIterator = binaryDictionaryContent.keySet().iterator(); // write only the part of the dict that we used for (int i = 0; i < lastUsedDictionarySize; i++) { Binary entry = binaryIterator.next(); dictionaryEncoder.writeBytes(entry); } return dictPage(dictionaryEncoder); } return null; }
Example #28
Source File: TestColumnIndexFiltering.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Checks that simple equality filters on the id and name columns select the same records
 * as the equivalent in-memory predicates, including equality with null.
 */
@Test
public void testSimpleFiltering() throws IOException {
  // Upper-case L suffix: a lower-case 'l' is easily misread as the digit 1.
  assertCorrectFiltering(
      record -> record.getId() == 1234,
      eq(longColumn("id"), 1234L));

  assertCorrectFiltering(
      record -> "miller".equals(record.getName()),
      eq(binaryColumn("name"), Binary.fromString("miller")));

  assertCorrectFiltering(
      record -> record.getName() == null,
      eq(binaryColumn("name"), null));
}
Example #29
Source File: LongDecimalColumnReader.java From presto with Apache License 2.0 | 5 votes |
/**
 * Reads one decimal value and appends it to the block builder, rescaling it from the
 * Parquet decimal type to the Presto decimal type.
 *
 * <p>When the definition level is below the column's maximum, a null is appended if
 * {@code isValueNull()} reports one; otherwise nothing is written.
 *
 * @param blockBuilder destination for the value
 * @param prestoType target type; must be a {@code DecimalType}
 * @throws ParquetDecodingException if {@code prestoType} is not a {@code DecimalType}
 */
@Override
protected void readValue(BlockBuilder blockBuilder, Type prestoType) {
  if (!(prestoType instanceof DecimalType)) {
    throw new ParquetDecodingException(format("Unsupported Presto column type (%s) for Parquet column (%s)", prestoType, columnDescriptor));
  }

  if (definitionLevel != columnDescriptor.getMaxDefinitionLevel()) {
    if (isValueNull()) {
      blockBuilder.appendNull();
    }
    return;
  }

  DecimalType targetType = (DecimalType) prestoType;
  Binary raw = valuesReader.readBytes();
  Slice unscaled = Decimals.encodeUnscaledValue(new BigInteger(raw.getBytes()));

  if (targetType.isShort()) {
    prestoType.writeLong(blockBuilder, longToShortCast(
        unscaled,
        parquetDecimalType.getPrecision(),
        parquetDecimalType.getScale(),
        targetType.getPrecision(),
        targetType.getScale()));
  } else {
    prestoType.writeSlice(blockBuilder, longToLongCast(
        unscaled,
        parquetDecimalType.getPrecision(),
        parquetDecimalType.getScale(),
        targetType.getPrecision(),
        targetType.getScale()));
  }
}
Example #30
Source File: RowConverter.java From flink with Apache License 2.0 | 5 votes |
@Override public void addBinary(Binary value) { // in case it is a timestamp type stored as INT96 if (primitiveTypeName.equals(PrimitiveType.PrimitiveTypeName.INT96)) { parentDataHolder.add(pos, new Timestamp(ParquetTimestampUtils.getTimestampMillis(value))); return; } if (originalType != null) { switch (originalType) { case DECIMAL: parentDataHolder.add(pos, new BigDecimal(value.toStringUsingUTF8().toCharArray())); break; case UTF8: case ENUM: case JSON: case BSON: parentDataHolder.add(pos, value.toStringUsingUTF8()); break; default: throw new UnsupportedOperationException("Unsupported original type : " + originalType.name() + " for primitive type BINARY"); } } else { parentDataHolder.add(pos, value.toStringUsingUTF8()); } }