parquet.io.api.Binary Java Examples

The following examples show how to use parquet.io.api.Binary. Each example notes its source file, the project it comes from, and that project's license.
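Binary wraps a byte sequence and is the value type Parquet hands to record converters for BINARY and INT96 columns. A minimal sketch of the common entry points (variable names are illustrative):

Binary greeting = Binary.fromString("hello");   // UTF-8 encodes the String
byte[] raw = greeting.getBytes();               // the underlying bytes
String decoded = greeting.toStringUsingUTF8();  // back to "hello"
int numBytes = greeting.length();               // 5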
Example #1
Source File: TPCHQuery3Parquet.java    From parquet-flinktacular with Apache License 2.0
private static DataSet<Tuple2<Void, CustomerTable>> getCustomerDataSet(ExecutionEnvironment env) throws 
	IOException {
	Job job = Job.getInstance();

	ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
	job.getConfiguration().set("parquet.thrift.column.filter", "ID;MKTSEGMENT");

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, 
		CustomerTable.class, job);

	// Filter market segment "AUTOMOBILE"
	BinaryColumn mktsegment = binaryColumn("MKTSEGMENT");
	FilterPredicate mktsegmentPred = eq(mktsegment, Binary.fromString("AUTOMOBILE"));
	ParquetInputFormat.setFilterPredicate(job.getConfiguration(), mktsegmentPred);

	ParquetThriftInputFormat.addInputPath(job, new Path(customerPath));

	DataSet<Tuple2<Void, CustomerTable>> data = env.createInput(hadoopInputFormat);

	return data;
}
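These filter2 predicates compose; a brief sketch of selecting two market segments at once, assuming or is also statically imported from parquet.filter2.predicate.FilterApi like eq and binaryColumn:

	BinaryColumn segment = binaryColumn("MKTSEGMENT");
	FilterPredicate segmentPred = or(
		eq(segment, Binary.fromString("AUTOMOBILE")),
		eq(segment, Binary.fromString("MACHINERY")));
	ParquetInputFormat.setFilterPredicate(job.getConfiguration(), segmentPred);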
 
Example #2
Source File: ParquetThriftExample.java    From parquet-flinktacular with Apache License 2.0
public static DataSet<Tuple2<Void, Person>> readThrift(ExecutionEnvironment env, String inputPath) throws 
	IOException {
	Job job = Job.getInstance();

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class,
		Person.class, job);

	// schema projection: don't read attributes id and email
	job.getConfiguration().set("parquet.thrift.column.filter", "name;phone/number");

	FileInputFormat.addInputPath(job, new Path(inputPath));

	// push down predicates: get all persons with name = "Felix"
	BinaryColumn name = binaryColumn("name");
	FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
	ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

	DataSet<Tuple2<Void, Person>> data = env.createInput(hadoopInputFormat);

	return data;
}
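As an aside, the parquet.thrift.column.filter value is a semicolon-separated list of glob-style column paths to keep, with '/' addressing nested fields (phone/number above); columns not matched are skipped entirely at read time.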
 
Example #3
Source File: SimpleRecordConverter.java    From parquet-tools with Apache License 2.0
@Override
public void addBinary(Binary value) {
  byte[] data = value.getBytes();
  if (data == null) {
    record.add(name, null);
    return;
  }

  try {
    // Prefer a String when the bytes decode as valid UTF-8.
    CharBuffer buffer = UTF8_DECODER.decode(value.toByteBuffer());
    record.add(name, buffer.toString());
  } catch (Throwable th) {
    // Not valid UTF-8; fall back to the raw bytes.
    record.add(name, data);
  }
}
 
Example #4
Source File: ParquetConverter.java    From pentaho-hadoop-shims with Apache License 2.0
private static long dateFromInt96( Binary value ) {
  byte[] readBuffer = value.getBytes();
  if ( readBuffer.length != 12 ) {
    throw new RuntimeException( "Invalid byte array length for INT96" );
  }

  // Bytes 0-7: time of day in nanoseconds, little-endian.
  long timeOfDayNanos =
    ( ( (long) readBuffer[ 7 ] << 56 ) + ( (long) ( readBuffer[ 6 ] & 255 ) << 48 )
      + ( (long) ( readBuffer[ 5 ] & 255 ) << 40 ) + ( (long) ( readBuffer[ 4 ] & 255 ) << 32 )
      + ( (long) ( readBuffer[ 3 ] & 255 ) << 24 ) + ( ( readBuffer[ 2 ] & 255 ) << 16 )
      + ( ( readBuffer[ 1 ] & 255 ) << 8 ) + ( readBuffer[ 0 ] & 255 ) );

  // Bytes 8-11: Julian day number, little-endian.
  int julianDay =
    ( (int) ( readBuffer[ 11 ] & 255 ) << 24 ) + ( ( readBuffer[ 10 ] & 255 ) << 16 )
      + ( ( readBuffer[ 9 ] & 255 ) << 8 ) + ( readBuffer[ 8 ] & 255 );

  // Days relative to the Unix epoch, in milliseconds, plus the time of day in milliseconds.
  return ( julianDay - ParquetSpec.JULIAN_DAY_OF_EPOCH ) * 24L * 60L * 60L * 1000L + timeOfDayNanos / 1000000;
}
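For reference, the INT96 layout decoded above is 8 little-endian bytes of nanos-of-day followed by 4 little-endian bytes of Julian day. A sketch of the reverse (encoding) direction, assuming ParquetSpec.JULIAN_DAY_OF_EPOCH is 2440588 (the Julian day of 1970-01-01) and using java.nio.ByteBuffer; the helper name is hypothetical:

private static Binary int96FromMillis( long epochMillis ) {
  long millisPerDay = 24L * 60L * 60L * 1000L;
  long days = Math.floorDiv( epochMillis, millisPerDay );
  long timeOfDayNanos = Math.floorMod( epochMillis, millisPerDay ) * 1000000L;
  ByteBuffer buffer = ByteBuffer.allocate( 12 ).order( ByteOrder.LITTLE_ENDIAN );
  buffer.putLong( timeOfDayNanos );                                  // bytes 0-7: nanos of day
  buffer.putInt( (int) ( days + ParquetSpec.JULIAN_DAY_OF_EPOCH ) ); // bytes 8-11: Julian day
  return Binary.fromByteArray( buffer.array() );
}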
 
Example #5
Source File: DumpCommand.java    From parquet-tools with Apache License 2.0
public static String binaryToString(Binary value) {
    byte[] data = value.getBytes();
    if (data == null) return null;

    try {
        CharBuffer buffer = UTF8_DECODER.decode(value.toByteBuffer());
        return buffer.toString();
    } catch (Throwable th) {
        // Not valid UTF-8; fall through to the placeholder below.
    }

    return "<bytes...>";
}
 
Example #6
Source File: ParquetGroup.java    From incubator-gobblin with Apache License 2.0
public void add(int fieldIndex, Binary value) {
  switch (this.getType().getType(fieldIndex).asPrimitiveType().getPrimitiveTypeName()) {
    case BINARY:
      this.add(fieldIndex, new BinaryValue(value));
      break;
    case INT96:
      this.add(fieldIndex, new Int96Value(value));
      break;
    default:
      throw new UnsupportedOperationException(
          this.getType().getType(fieldIndex).getName() + " not supported for Binary");
  }
}
 
Example #7
Source File: ParquetProtobufExample.java    From parquet-flinktacular with Apache License 2.0
public static DataSet<Tuple2<Void, Person.Builder>> readProtobuf(ExecutionEnvironment env, String inputPath) 
	throws IOException {
	Job job = Job.getInstance();

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new ProtoParquetInputFormat(), Void.class,
		Person.Builder.class, job);

	FileInputFormat.addInputPath(job, new Path(inputPath));

	//native predicate push down: read only records which satisfy a given constraint
	BinaryColumn name = binaryColumn("name");
	FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
	ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

	//schema projection: don't read the type attribute of phone
	String projection = "message Person {\n" +
		"  required binary name (UTF8);\n" +
		"  required int32 id;\n" +
		"  optional binary email (UTF8);\n" +
		"  repeated group phone {\n" +
		"    required binary number (UTF8);\n" +
		"  }\n" +
		"}";
	ProtoParquetInputFormat.setRequestedProjection(job, projection);

	DataSet<Tuple2<Void, Person.Builder>> data = env.createInput(hadoopInputFormat);

	return data;
}
 
Example #8
Source File: ParquetConverter.java    From pentaho-hadoop-shims with Apache License 2.0
static BigDecimal binaryToDecimal( Binary value, int precision, int scale ) {
  /*
   * Precision <= 18 checks for the max number of digits for an unscaled long,
   * else treat with big integer conversion
   */
  if ( precision <= 18 ) {
    ByteBuffer buffer = value.toByteBuffer();
    byte[] bytes = buffer.array();
    int start = buffer.arrayOffset() + buffer.position();
    int end = buffer.arrayOffset() + buffer.limit();
    long unscaled = 0L;
    int i = start;
    while ( i < end ) {
      unscaled = ( unscaled << 8 | bytes[ i ] & 0xff );
      i++;
    }
    int bits = 8 * ( end - start );
    long unscaledNew = ( unscaled << ( 64 - bits ) ) >> ( 64 - bits );
    // Attach the scale to the unscaled long directly; dividing by pow( 10, scale )
    // in double arithmetic would lose precision and drop the scale for large values.
    return BigDecimal.valueOf( unscaledNew, scale );
  } else {
    return new BigDecimal( new BigInteger( value.getBytes() ), scale );
  }
}
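As a quick check of the fast path above, the big-endian two's-complement bytes of unscaled value 12345 with precision 5 and scale 2 should decode to 123.45 (the values are illustrative):

byte[] unscaledBytes = BigInteger.valueOf( 12345 ).toByteArray(); // {0x30, 0x39}
BigDecimal decoded = binaryToDecimal( Binary.fromByteArray( unscaledBytes ), 5, 2 );
// decoded.compareTo( new BigDecimal( "123.45" ) ) == 0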
 
Example #9
Source File: JsonElementConversionFactory.java    From incubator-gobblin with Apache License 2.0
@Override
BinaryValue convertField(JsonElement value) {
  return new BinaryValue(Binary.fromString(value.getAsString()));
}
 
Example #10
Source File: SimpleRecordConverter.java    From parquet-tools with Apache License 2.0
@Override
public void addBinary(Binary value) {
  record.add(name, value.toStringUsingUTF8());
}
 
Example #11
Source File: DumpCommand.java    From parquet-tools with Apache License 2.0
public static BigInteger binaryToBigInteger(Binary value) {
    byte[] data = value.getBytes();
    if (data == null) return null;

    return new BigInteger(data);
}
 
Example #12
Source File: SliceDictionary.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public Binary decodeToBinary(int id) {
    return Binary.fromConstantByteBuffer(slice[id].toByteBuffer());
}
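A note on the factory used here: in the Apache Parquet API, Binary.fromConstantByteBuffer promises the buffer's contents will never change, letting Parquet wrap it without a defensive copy; Binary.fromReusedByteBuffer is the counterpart for buffers the caller recycles.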
 
Example #13
Source File: ParquetGroup.java    From incubator-gobblin with Apache License 2.0
public void add(int fieldIndex, String value) {
  this.add(fieldIndex, new BinaryValue(Binary.fromString(value)));
}
 
Example #14
Source File: ParquetGroup.java    From incubator-gobblin with Apache License 2.0
public Binary getInt96(int fieldIndex, int index) {
  return ((Int96Value) this.getValue(fieldIndex, index)).getInt96();
}
 
Example #15
Source File: ParquetGroup.java    From incubator-gobblin with Apache License 2.0
public Binary getBinary(int fieldIndex, int index) {
  return ((BinaryValue) this.getValue(fieldIndex, index)).getBinary();
}
 
Example #16
Source File: ParquetAvroExample.java    From parquet-flinktacular with Apache License 2.0
public static DataSet<Tuple2<Void, Person>> readAvro(ExecutionEnvironment env, String inputPath) throws
	IOException {
	Job job = Job.getInstance();

	HadoopInputFormat hadoopInputFormat = new HadoopInputFormat(new AvroParquetInputFormat(), Void.class,
		Person.class, job);

	FileInputFormat.addInputPath(job, new Path(inputPath));

	// schema projection: don't read the type attribute of phone
	Schema phone = Schema.createRecord("PhoneNumber", null, null, false);
	phone.setFields(Arrays.asList(
		new Schema.Field("number", Schema.create(Schema.Type.BYTES), null, null)));

	Schema array = Schema.createArray(phone);
	Schema union = Schema.createUnion(
		Lists.newArrayList(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.NULL)));

	Schema projection = Schema.createRecord("Person", null, null, false);
	projection.setFields(
		Arrays.asList(
			new Schema.Field("name", Schema.create(Schema.Type.BYTES), null, null),
			new Schema.Field("id", Schema.create(Schema.Type.INT), null, null),
			new Schema.Field("email", union, null, null),
			new Schema.Field("phone", array, null, null)
		)
	);

	AvroParquetInputFormat.setRequestedProjection(job, projection);

	// push down predicates: get all persons with name = "Felix"
	BinaryColumn name = binaryColumn("name");
	FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
	ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

	DataSet<Tuple2<Void, Person>> data = env.createInput(hadoopInputFormat);

	return data;
}
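Finally, a hedged sketch of wiring one of these readers into a Flink program (the input path is a placeholder):

	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Tuple2<Void, Person>> persons = readAvro(env, "hdfs:///tmp/persons"); // hypothetical path
	persons.first(10).print(); // print() triggers job execution and emits the first ten records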