parquet.hadoop.ParquetOutputFormat Java Examples

The following examples show how to use parquet.hadoop.ParquetOutputFormat. Each example is drawn from an open-source project; the source file and license are noted above it.
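
For orientation, here is a minimal sketch of the configuration calls these examples revolve around, applied to a plain Hadoop MapReduce Job. The method name and the path and size values are illustrative assumptions; note that ParquetOutputFormat also needs a write support (usually supplied by a subclass such as AvroParquetOutputFormat) before it can actually write records.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.metadata.CompressionCodecName;

public static Job configureParquetOutput() throws IOException {
	Job job = Job.getInstance();

	// Route the job's output through Parquet
	job.setOutputFormatClass(ParquetOutputFormat.class);
	FileOutputFormat.setOutputPath(job, new Path("/tmp/parquet-out")); // illustrative path

	// The tuning knobs exercised by the examples below
	ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
	ParquetOutputFormat.setEnableDictionary(job, true);
	ParquetOutputFormat.setBlockSize(job, 128 * 1024 * 1024); // row group size, in bytes
	ParquetOutputFormat.setPageSize(job, 1024 * 1024);        // data page size, in bytes

	return job;
}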
Example #1
Source File: ParquetAvroExample.java    From parquet-flinktacular with Apache License 2.0
public static void writeAvro(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException {
	// Set up the Hadoop job
	Job job = Job.getInstance();

	// Set up Hadoop Output Format
	HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new AvroParquetOutputFormat(), job);

	FileOutputFormat.setOutputPath(job, new Path(outputPath));

	AvroParquetOutputFormat.setSchema(job, Person.getClassSchema());
	ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
	ParquetOutputFormat.setEnableDictionary(job, true);

	// Output & Execute
	data.output(hadoopOutputFormat);
}
 
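These Flink examples only register the Parquet sink; the surrounding program still has to build the DataSet and trigger execution. A minimal, hypothetical driver for writeAvro above might look like the following (the sample record, its 'name' field, and the output path are illustrative assumptions):

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;

public static void main(String[] args) throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	// Build a one-element DataSet of Avro Person records (assumes the schema has a 'name' field)
	Person person = Person.newBuilder().setName("Ada").build();
	DataSet<Tuple2<Void, Person>> data = env.fromElements(person)
		.map(new MapFunction<Person, Tuple2<Void, Person>>() {
			@Override
			public Tuple2<Void, Person> map(Person p) {
				return new Tuple2<Void, Person>(null, p); // the Hadoop key is unused
			}
		});

	writeAvro(data, "file:///tmp/persons.parquet"); // illustrative output path
	env.execute("write Avro records as Parquet");
}
 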
Example #2
Source File: ParquetThriftExample.java    From parquet-flinktacular with Apache License 2.0
public static void writeThrift(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException {
	// Set up the Hadoop job
	Job job = Job.getInstance();

	// Set up Hadoop Output Format
	HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);

	FileOutputFormat.setOutputPath(job, new Path(outputPath));

	ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
	ParquetOutputFormat.setEnableDictionary(job, true);

	ParquetThriftOutputFormat.setThriftClass(job, Person.class);

	// Output & Execute
	data.output(hadoopOutputFormat);
}
 
Example #3
Source File: PentahoTwitterOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public void setOutputFile( String file, boolean override ) throws Exception {
  inClassloader( () -> {
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, job.getConfiguration() );
    outputFile = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( outputFile.toUri(), job.getConfiguration() );
    if ( fs.exists( outputFile ) ) {
      if ( override ) {
        fs.delete( outputFile, true );
      } else {
        throw new FileAlreadyExistsException( file );
      }
    }
    ParquetOutputFormat.setOutputPath( job, outputFile.getParent() );
  } );
}
 
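The inClassloader(...) helper used throughout these Pentaho examples is a shim utility that runs the callback with the plugin's own classloader as the thread context classloader, so Hadoop and Parquet classes resolve inside the shim. A rough sketch of the idea, purely an assumption rather than the shim's actual implementation:

// Hypothetical sketch only; the real pentaho-hadoop-shims helper may differ.
@FunctionalInterface
interface ThrowingRunnable {
  void run() throws Exception;
}

protected void inClassloader( ThrowingRunnable action ) throws Exception {
  ClassLoader original = Thread.currentThread().getContextClassLoader();
  try {
    // Make ServiceLoader/Class.forName lookups resolve against the shim's classes
    Thread.currentThread().setContextClassLoader( getClass().getClassLoader() );
    action.run();
  } finally {
    Thread.currentThread().setContextClassLoader( original );
  }
}
 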
Example #4
Source File: PentahoTwitterOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public void setVersion( VERSION version ) throws Exception {
  inClassloader( () -> {
    ParquetProperties.WriterVersion writerVersion;
    switch ( version ) {
      case VERSION_1_0:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_1_0;
        break;
      case VERSION_2_0:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_2_0;
        break;
      default:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_2_0;
        break;
    }
    job.getConfiguration().set( ParquetOutputFormat.WRITER_VERSION, writerVersion.toString() );
  } );
}
 
Example #5
Source File: PentahoTwitterOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public void setCompression( COMPRESSION comp ) throws Exception {
  inClassloader( () -> {
    CompressionCodecName codec;
    switch ( comp ) {
      case SNAPPY:
        codec = CompressionCodecName.SNAPPY;
        break;
      case GZIP:
        codec = CompressionCodecName.GZIP;
        break;
      case LZO:
        codec = CompressionCodecName.LZO;
        break;
      default:
        codec = CompressionCodecName.UNCOMPRESSED;
        break;
    }
    ParquetOutputFormat.setCompression( job, codec );
  } );
}
 
Example #6
Source File: ParquetProtobufExample.java    From parquet-flinktacular with Apache License 2.0
public static void writeProtobuf(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException {
	Job job = Job.getInstance();

	// Set up Hadoop Output Format
	HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ProtoParquetOutputFormat(), job);

	FileOutputFormat.setOutputPath(job, new Path(outputPath));

	ProtoParquetOutputFormat.setProtobufClass(job, Person.class);
	ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
	ParquetOutputFormat.setEnableDictionary(job, true);

	// Output & Execute
	data.output(hadoopOutputFormat);
}
 
Example #7
Source File: PentahoTwitterOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
public PentahoTwitterOutputFormat()  {
  logger.info( "We are initializing parquet output format" );

  inClassloader( () -> {
    ConfigurationProxy conf = new ConfigurationProxy();

    job = Job.getInstance( conf );

    job.getConfiguration().set( ParquetOutputFormat.ENABLE_JOB_SUMMARY, "false" );
    ParquetOutputFormat.setEnableDictionary( job, false );
  } );
}
 
Example #8
Source File: PentahoTwitterOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public void enableDictionary( boolean useDictionary ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setEnableDictionary( job, useDictionary ) );
}
 
Example #9
Source File: PentahoTwitterOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public void setRowGroupSize( int size ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setBlockSize( job, size ) );
}
 
Example #10
Source File: PentahoTwitterOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public void setDataPageSize( int size ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setPageSize( job, size ) );
}
 
Example #11
Source File: PentahoTwitterOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public void setDictionaryPageSize( int size ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setDictionaryPageSize( job, size ) );
}