parquet.io.api.Binary Java Examples
The following examples show how to use
parquet.io.api.Binary.
Each example notes its source file, the project it comes from, and that project's license.
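Binary is Parquet's wrapper around a raw byte sequence. As a minimal orientation sketch (standalone code, not taken from any of the projects below), these are the three operations that recur throughout the examples: constructing a Binary from a String, reading the raw bytes back, and decoding the bytes as UTF-8.

    import parquet.io.api.Binary;

    public class BinaryBasics {
        public static void main(String[] args) {
            Binary name = Binary.fromString("Felix");     // UTF-8 encodes the string
            byte[] bytes = name.getBytes();               // raw byte payload
            System.out.println(bytes.length);             // 5
            System.out.println(name.toStringUsingUTF8()); // "Felix"
        }
    }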
Example #1
Source File: TPCHQuery3Parquet.java From parquet-flinktacular with Apache License 2.0
    private static DataSet<Tuple2<Void, CustomerTable>> getCustomerDataSet(ExecutionEnvironment env)
            throws IOException {
        Job job = Job.getInstance();
        ParquetInputFormat.setReadSupportClass(job, ThriftReadSupport.class);
        job.getConfiguration().set("parquet.thrift.column.filter", "ID;MKTSEGMENT");
        HadoopInputFormat hadoopInputFormat =
            new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, CustomerTable.class, job);

        // Filter market segment "AUTOMOBILE"
        BinaryColumn mktsegment = binaryColumn("MKTSEGMENT");
        FilterPredicate mktsegmentPred = eq(mktsegment, Binary.fromString("AUTOMOBILE"));
        ParquetInputFormat.setFilterPredicate(job.getConfiguration(), mktsegmentPred);

        ParquetThriftInputFormat.addInputPath(job, new Path(customerPath));

        DataSet<Tuple2<Void, CustomerTable>> data = env.createInput(hadoopInputFormat);

        return data;
    }
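A hypothetical driver for this reader, which would live in the same class since the method is private (this wiring is assumed, not part of the original example):

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple2<Void, CustomerTable>> customers = getCustomerDataSet(env);
    customers.print();  // in recent Flink versions, print() itself triggers job execution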
Example #2
Source File: ParquetThriftExample.java From parquet-flinktacular with Apache License 2.0
    public static DataSet<Tuple2<Void, Person>> readThrift(ExecutionEnvironment env, String inputPath)
            throws IOException {
        Job job = Job.getInstance();
        HadoopInputFormat hadoopInputFormat =
            new HadoopInputFormat(new ParquetThriftInputFormat(), Void.class, Person.class, job);

        // schema projection: read only the name, id, email and phone/number attributes
        job.getConfiguration().set("parquet.thrift.column.filter", "name;id;email;phone/number");

        FileInputFormat.addInputPath(job, new Path(inputPath));

        // predicate push-down: read only persons with name = "Felix"
        BinaryColumn name = binaryColumn("name");
        FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
        ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

        DataSet<Tuple2<Void, Person>> data = env.createInput(hadoopInputFormat);

        return data;
    }
Example #3
Source File: SimpleRecordConverter.java From parquet-tools with Apache License 2.0
    @Override
    public void addBinary(Binary value) {
        byte[] data = value.getBytes();
        if (data == null) {
            record.add(name, null);
            return;
        }
        // Try to decode the bytes as UTF-8 text; fall back to the raw bytes
        // if the value is not valid UTF-8.
        try {
            CharBuffer buffer = UTF8_DECODER.decode(value.toByteBuffer());
            record.add(name, buffer.toString());
            return;
        } catch (Throwable th) {
            // not valid UTF-8; fall through to the raw-bytes case
        }
        record.add(name, data);
    }
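Both this example and Example #5 reference a UTF8_DECODER field that is not part of the snippet. It is presumably a java.nio.charset.CharsetDecoder declared along these lines (the exact declaration is an assumption, not shown in the source):

    // Assumed declaration (requires java.nio.charset.CharsetDecoder and StandardCharsets).
    // CharsetDecoder is stateful and not thread-safe, so a per-converter instance
    // is safer than a shared static one.
    private final CharsetDecoder UTF8_DECODER = StandardCharsets.UTF_8.newDecoder();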
Example #4
Source File: ParquetConverter.java From pentaho-hadoop-shims with Apache License 2.0
    private static long dateFromInt96(Binary value) {
        byte[] readBuffer = value.getBytes();
        if (readBuffer.length != 12) {
            throw new RuntimeException("Invalid byte array length for INT96");
        }
        // first 8 bytes: little-endian nanoseconds since midnight
        long timeOfDayNanos =
            ((long) readBuffer[7] << 56)
            + ((long) (readBuffer[6] & 255) << 48)
            + ((long) (readBuffer[5] & 255) << 40)
            + ((long) (readBuffer[4] & 255) << 32)
            + ((long) (readBuffer[3] & 255) << 24)
            + ((readBuffer[2] & 255) << 16)
            + ((readBuffer[1] & 255) << 8)
            + (readBuffer[0] & 255);
        // last 4 bytes: little-endian Julian day number
        int julianDay =
            ((int) (readBuffer[11] & 255) << 24)
            + ((readBuffer[10] & 255) << 16)
            + ((readBuffer[9] & 255) << 8)
            + (readBuffer[8] & 255);
        // convert to milliseconds since the Unix epoch
        return (julianDay - ParquetSpec.JULIAN_DAY_OF_EPOCH) * 24L * 60L * 60L * 1000L
            + timeOfDayNanos / 1000000;
    }
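For reference, the same decode can be written more compactly with a little-endian ByteBuffer. This is a sketch equivalent to the byte shifting above, not the shim's actual code, and the method name is made up:

    // Minimal sketch: an INT96 timestamp is 8 bytes of nanos-of-day followed by
    // 4 bytes of Julian day, both little-endian.
    static long dateFromInt96LittleEndian(Binary value) {
        ByteBuffer buf = value.toByteBuffer().order(ByteOrder.LITTLE_ENDIAN);
        long timeOfDayNanos = buf.getLong(); // bytes 0-7
        int julianDay = buf.getInt();        // bytes 8-11
        return (julianDay - ParquetSpec.JULIAN_DAY_OF_EPOCH) * 86_400_000L
            + timeOfDayNanos / 1_000_000L;
    }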
Example #5
Source File: DumpCommand.java From parquet-tools with Apache License 2.0
    public static String binaryToString(Binary value) {
        byte[] data = value.getBytes();
        if (data == null) return null;

        try {
            CharBuffer buffer = UTF8_DECODER.decode(value.toByteBuffer());
            return buffer.toString();
        } catch (Throwable th) {
            // not valid UTF-8; fall through to the placeholder
        }
        return "<bytes...>";
    }
Example #6
Source File: ParquetGroup.java From incubator-gobblin with Apache License 2.0
    public void add(int fieldIndex, Binary value) {
        switch (this.getType().getType(fieldIndex).asPrimitiveType().getPrimitiveTypeName()) {
            case BINARY:
                this.add(fieldIndex, new BinaryValue(value));
                break;
            case INT96:
                this.add(fieldIndex, new Int96Value(value));
                break;
            default:
                throw new UnsupportedOperationException(
                    this.getType().asPrimitiveType().getName() + " not supported for Binary");
        }
    }
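A hypothetical usage sketch: it assumes ParquetGroup has a constructor taking the Parquet schema (as Group implementations in parquet-mr typically do), and the schema here is invented for illustration:

    // hypothetical schema with a single BINARY (UTF8) field at index 0
    MessageType schema = MessageTypeParser.parseMessageType(
        "message doc { required binary name (UTF8); }");
    ParquetGroup group = new ParquetGroup(schema); // assumed constructor
    group.add(0, Binary.fromString("Felix"));      // dispatches to the BINARY case above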
Example #7
Source File: ParquetProtobufExample.java From parquet-flinktacular with Apache License 2.0
    public static DataSet<Tuple2<Void, Person.Builder>> readProtobuf(ExecutionEnvironment env, String inputPath)
            throws IOException {
        Job job = Job.getInstance();
        HadoopInputFormat hadoopInputFormat =
            new HadoopInputFormat(new ProtoParquetInputFormat(), Void.class, Person.Builder.class, job);

        FileInputFormat.addInputPath(job, new Path(inputPath));

        // native predicate push-down: read only records that satisfy the given constraint
        BinaryColumn name = binaryColumn("name");
        FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
        ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

        // schema projection: don't read the type attribute of phone
        String projection = "message Person {\n" +
            "  required binary name (UTF8);\n" +
            "  required int32 id;\n" +
            "  optional binary email (UTF8);\n" +
            "  repeated group phone {\n" +
            "    required binary number (UTF8);\n" +
            "  }\n" +
            "}";
        ProtoParquetInputFormat.setRequestedProjection(job, projection);

        DataSet<Tuple2<Void, Person.Builder>> data = env.createInput(hadoopInputFormat);

        return data;
    }
Example #8
Source File: ParquetConverter.java From pentaho-hadoop-shims with Apache License 2.0
    static BigDecimal binaryToDecimal(Binary value, int precision, int scale) {
        /*
         * Precision <= 18 checks for the max number of digits for an unscaled long,
         * else treat with big integer conversion
         */
        if (precision <= 18) {
            ByteBuffer buffer = value.toByteBuffer();
            byte[] bytes = buffer.array();
            int start = buffer.arrayOffset() + buffer.position();
            int end = buffer.arrayOffset() + buffer.limit();
            long unscaled = 0L;
            int i = start;
            while (i < end) {
                unscaled = (unscaled << 8 | bytes[i] & 0xff);
                i++;
            }
            int bits = 8 * (end - start);
            long unscaledNew = (unscaled << (64 - bits)) >> (64 - bits);
            if (unscaledNew <= -pow(10, 18) || unscaledNew >= pow(10, 18)) {
                return new BigDecimal(unscaledNew);
            } else {
                return BigDecimal.valueOf(unscaledNew / pow(10, scale));
            }
        } else {
            return new BigDecimal(new BigInteger(value.getBytes()), scale);
        }
    }
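The precision <= 18 fast path assembles the unscaled value directly into a long (the shift pair sign-extends it to 64 bits) to avoid allocating a BigInteger. For comparison, the general-purpose conversion that the method already falls back to can stand alone; only the method name here is made up:

    // Works for any precision, at the cost of a BigInteger allocation per value.
    static BigDecimal binaryToDecimalSimple(Binary value, int scale) {
        return new BigDecimal(new BigInteger(value.getBytes()), scale);
    }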
Example #9
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0
    @Override
    BinaryValue convertField(JsonElement value) {
        return new BinaryValue(Binary.fromString(value.getAsString()));
    }
Example #10
Source File: SimpleRecordConverter.java From parquet-tools with Apache License 2.0
    @Override
    public void addBinary(Binary value) {
        record.add(name, value.toStringUsingUTF8());
    }
Example #11
Source File: DumpCommand.java From parquet-tools with Apache License 2.0
    public static BigInteger binaryToBigInteger(Binary value) {
        byte[] data = value.getBytes();
        if (data == null) return null;

        return new BigInteger(data);
    }
Example #12
Source File: SliceDictionary.java From spliceengine with GNU Affero General Public License v3.0
    @Override
    public Binary decodeToBinary(int id) {
        return Binary.fromConstantByteBuffer(slice[id].toByteBuffer());
    }
Example #13
Source File: ParquetGroup.java From incubator-gobblin with Apache License 2.0
    public void add(int fieldIndex, String value) {
        this.add(fieldIndex, new BinaryValue(Binary.fromString(value)));
    }
Example #14
Source File: ParquetGroup.java From incubator-gobblin with Apache License 2.0
    public Binary getInt96(int fieldIndex, int index) {
        return ((Int96Value) this.getValue(fieldIndex, index)).getInt96();
    }
Example #15
Source File: ParquetGroup.java From incubator-gobblin with Apache License 2.0
    public Binary getBinary(int fieldIndex, int index) {
        return ((BinaryValue) this.getValue(fieldIndex, index)).getBinary();
    }
Example #16
Source File: ParquetAvroExample.java From parquet-flinktacular with Apache License 2.0
    public static DataSet<Tuple2<Void, Person>> readAvro(ExecutionEnvironment env, String inputPath)
            throws IOException {
        Job job = Job.getInstance();
        HadoopInputFormat hadoopInputFormat =
            new HadoopInputFormat(new AvroParquetInputFormat(), Void.class, Person.class, job);

        FileInputFormat.addInputPath(job, new Path(inputPath));

        // schema projection: don't read the type attribute of phone numbers
        Schema phone = Schema.createRecord("PhoneNumber", null, null, false);
        phone.setFields(Arrays.asList(
            new Schema.Field("number", Schema.create(Schema.Type.BYTES), null, null)));
        Schema array = Schema.createArray(phone);
        Schema union = Schema.createUnion(
            Lists.newArrayList(Schema.create(Schema.Type.BYTES), Schema.create(Schema.Type.NULL)));
        Schema projection = Schema.createRecord("Person", null, null, false);
        projection.setFields(
            Arrays.asList(
                new Schema.Field("name", Schema.create(Schema.Type.BYTES), null, null),
                new Schema.Field("id", Schema.create(Schema.Type.INT), null, null),
                new Schema.Field("email", union, null, null),
                new Schema.Field("phone", array, null, null)
            )
        );
        AvroParquetInputFormat.setRequestedProjection(job, projection);

        // predicate push-down: read only persons with name = "Felix"
        BinaryColumn name = binaryColumn("name");
        FilterPredicate namePred = eq(name, Binary.fromString("Felix"));
        ParquetInputFormat.setFilterPredicate(job.getConfiguration(), namePred);

        DataSet<Tuple2<Void, Person>> data = env.createInput(hadoopInputFormat);

        return data;
    }