parquet.hadoop.example.GroupWriteSupport Java Examples

The following examples show how to use parquet.hadoop.example.GroupWriteSupport. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: ParquetDataWriterBuilder.java From incubator-gobblin with Apache License 2.0

4 votes

/**
 * Build a version-specific {@link ParquetWriter} for given {@link ParquetWriterConfiguration}
 * @param writerConfiguration
 * @return
 * @throws IOException
 */
@Override
public ParquetWriterShim getVersionSpecificWriter(ParquetWriterConfiguration writerConfiguration)
    throws IOException {

  CompressionCodecName codecName = CompressionCodecName.fromConf(writerConfiguration.getCodecName());
  ParquetProperties.WriterVersion writerVersion = ParquetProperties.WriterVersion
      .fromString(writerConfiguration.getWriterVersion());

  Configuration conf = new Configuration();
  ParquetWriter versionSpecificWriter = null;
  switch (writerConfiguration.getRecordFormat()) {
    case GROUP: {
      GroupWriteSupport.setSchema((MessageType) this.schema, conf);
      WriteSupport support = new GroupWriteSupport();
      versionSpecificWriter = new ParquetWriter<Group>(
          writerConfiguration.getAbsoluteStagingFile(),
          support,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.getDictPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate(),
          writerVersion,
          conf);
      break;
    }
    case AVRO:  {
      versionSpecificWriter = new AvroParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Schema) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          conf);
      break;
    }
    case PROTOBUF: {
      versionSpecificWriter = new ProtoParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Class<? extends Message>) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate());
      break;
    }
    default: throw new RuntimeException("Record format not supported");
  }
  ParquetWriter finalVersionSpecificWriter = versionSpecificWriter;

  return new ParquetWriterShim() {
    @Override
    public void write(Object record)
        throws IOException {
      finalVersionSpecificWriter.write(record);
    }

    @Override
    public void close()
        throws IOException {
      finalVersionSpecificWriter.close();
    }
  };
}

Example #2

Source File: ExampleParquetMapReduce.java From hiped2 with Apache License 2.0

4 votes

@Override
protected void setup(Context context) throws IOException, InterruptedException {
  factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(ContextUtil.getConfiguration(context)));
}

Example #3

Source File: TestReadWriteParquet.java From parquet-examples with Apache License 2.0

4 votes

public int run(String[] args) throws Exception {
if(args.length < 2) {
    LOG.error("Usage: " + getClass().getName() + " INPUTFILE OUTPUTFILE [compression]");
    return 1;
}
String inputFile = args[0];
String outputFile = args[1];
String compression = (args.length > 2) ? args[2] : "none";

Path parquetFilePath = null;
// Find a file in case a directory was passed
RemoteIterator<LocatedFileStatus> it = FileSystem.get(getConf()).listFiles(new Path(inputFile), true);
while(it.hasNext()) {
    FileStatus fs = it.next();
    if(fs.isFile()) {
	parquetFilePath = fs.getPath();
	break;
    }
}
if(parquetFilePath == null) {
    LOG.error("No file found for " + inputFile);
    return 1;
}
LOG.info("Getting schema from " + parquetFilePath);
ParquetMetadata readFooter = ParquetFileReader.readFooter(getConf(), parquetFilePath);
MessageType schema = readFooter.getFileMetaData().getSchema();
LOG.info(schema);
GroupWriteSupport.setSchema(schema, getConf());

       Job job = new Job(getConf());
       job.setJarByClass(getClass());
       job.setJobName(getClass().getName());
       job.setMapperClass(ReadRequestMap.class);
job.setNumReduceTasks(0);
job.setInputFormatClass(ExampleInputFormat.class);
job.setOutputFormatClass(ExampleOutputFormat.class);

CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
if(compression.equalsIgnoreCase("snappy")) {
    codec = CompressionCodecName.SNAPPY;
} else if(compression.equalsIgnoreCase("gzip")) {
    codec = CompressionCodecName.GZIP;
}
LOG.info("Output compression: " + codec);
ExampleOutputFormat.setCompression(job, codec);

FileInputFormat.setInputPaths(job, new Path(inputFile));
       FileOutputFormat.setOutputPath(job, new Path(outputFile));

       job.waitForCompletion(true);

       return 0;
   }