Java Code Examples for org.apache.parquet.hadoop.ParquetInputFormat#setReadSupportClass()

The following examples show how to use org.apache.parquet.hadoop.ParquetInputFormat#setReadSupportClass(). Each example is taken from an open-source project; the source file and license are noted above each snippet.
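Before the project examples, here is a minimal sketch of the basic wiring with the new mapreduce API, using the GroupReadSupport that ships with parquet-hadoop (the class name and input path are illustrative, not taken from any of the projects below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class ParquetReadJobSetup {
  public static Job newReadJob(Configuration conf, Path input) throws Exception {
    Job job = Job.getInstance(conf, "parquet-read");
    // ParquetInputFormat emits (Void, T) pairs; the configured ReadSupport decides what T is.
    job.setInputFormatClass(ParquetInputFormat.class);
    // GroupReadSupport materializes each record as a parquet example Group.
    ParquetInputFormat.setReadSupportClass(job, GroupReadSupport.class);
    ParquetInputFormat.setInputPaths(job, input);
    return job;
  }
}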
Example 1
Source File: HDFSParquetImporter.java    From hudi with Apache License 2.0
protected JavaRDD<HoodieRecord<HoodieRecordPayload>> buildHoodieRecordsForImport(JavaSparkContext jsc,
    String schemaStr) throws IOException {
  Job job = Job.getInstance(jsc.hadoopConfiguration());
  // Allow recursive directories to be found
  job.getConfiguration().set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
  // To parallelize reading file status.
  job.getConfiguration().set(FileInputFormat.LIST_STATUS_NUM_THREADS, "1024");
  AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), (new Schema.Parser().parse(schemaStr)));
  ParquetInputFormat.setReadSupportClass(job, (AvroReadSupport.class));

  return jsc.newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class,
          job.getConfiguration())
      // To reduce large number of tasks.
      .coalesce(16 * cfg.parallelism).map(entry -> {
        GenericRecord genericRecord = ((Tuple2<Void, GenericRecord>) entry)._2();
        Object partitionField = genericRecord.get(cfg.partitionKey);
        if (partitionField == null) {
          throw new HoodieIOException("partition key is missing. :" + cfg.partitionKey);
        }
        Object rowField = genericRecord.get(cfg.rowKey);
        if (rowField == null) {
          throw new HoodieIOException("row field is missing. :" + cfg.rowKey);
        }
        String partitionPath = partitionField.toString();
        LOG.debug("Row Key : " + rowField + ", Partition Path is (" + partitionPath + ")");
        if (partitionField instanceof Number) {
          try {
            long ts = (long) (Double.parseDouble(partitionField.toString()) * 1000L);
            partitionPath = PARTITION_FORMATTER.format(Instant.ofEpochMilli(ts));
          } catch (NumberFormatException nfe) {
            LOG.warn("Unable to parse date from partition field. Assuming partition as (" + partitionField + ")");
          }
        }
        return new HoodieRecord<>(new HoodieKey(rowField.toString(), partitionPath),
            new HoodieJsonPayload(genericRecord.toString()));
      });
}
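Stripped of the Hudi-specific key and partition handling, the read pattern above reduces to the sketch below (class name, schema, and path are placeholders). Note that the sketch sets the Avro read schema on the Job's own copy of the Configuration, since that copy is the object actually passed to newAPIHadoopFile:

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class SparkParquetAvroRead {
  @SuppressWarnings("unchecked")
  public static JavaPairRDD<Void, GenericRecord> read(JavaSparkContext jsc, Schema schema,
      String path) throws java.io.IOException {
    Job job = Job.getInstance(jsc.hadoopConfiguration());
    // Job.getInstance copies the Configuration, so set the read schema on the job's copy.
    AvroReadSupport.setAvroReadSchema(job.getConfiguration(), schema);
    ParquetInputFormat.setReadSupportClass(job, AvroReadSupport.class);
    return jsc.newAPIHadoopFile(path, ParquetInputFormat.class, Void.class, GenericRecord.class,
        job.getConfiguration());
  }
}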
 
Example 2
Source File: ParquetTBaseScheme.java    From parquet-mr with Apache License 2.0
@Override
public void sourceConfInit(FlowProcess<JobConf> fp,
    Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  super.sourceConfInit(fp, tap, jobConf);
  jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
  ParquetInputFormat.setReadSupportClass(jobConf, ThriftReadSupport.class);
  ThriftReadSupport.setRecordConverterClass(jobConf, TBaseRecordConverter.class);
}
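Example 2 (and the Cascading schemes that follow) configures the old mapred API via DeprecatedParquetInputFormat; note that setReadSupportClass() has a JobConf overload for this code path. The same wiring works on a bare JobConf outside Cascading, sketched here with the stock GroupReadSupport in place of the Thrift read support (the wrapper class is hypothetical):

import org.apache.hadoop.mapred.JobConf;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.hadoop.mapred.DeprecatedParquetInputFormat;

public class MapredParquetSetup {
  public static JobConf newReadConf() {
    JobConf jobConf = new JobConf();
    // DeprecatedParquetInputFormat adapts ParquetInputFormat to the old mapred API.
    jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
    // The JobConf overload of setReadSupportClass serves the mapred code path.
    ParquetInputFormat.setReadSupportClass(jobConf, GroupReadSupport.class);
    return jobConf;
  }
}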
 
Example 3
Source File: ParquetTupleScheme.java    From parquet-mr with Apache License 2.0
@SuppressWarnings("rawtypes")
 @Override
 public void sourceConfInit(FlowProcess<JobConf> fp,
     Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {

   if (filterPredicate != null) {
     ParquetInputFormat.setFilterPredicate(jobConf, filterPredicate);
   }

   jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
   ParquetInputFormat.setReadSupportClass(jobConf, TupleReadSupport.class);
   TupleReadSupport.setRequestedFields(jobConf, getSourceFields());
}
 
Example 4
Source File: ParquetScroogeScheme.java    From parquet-mr with Apache License 2.0
@Override
public void sourceConfInit(FlowProcess<JobConf> fp,
    Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  super.sourceConfInit(fp, tap, jobConf);
  jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
  ParquetInputFormat.setReadSupportClass(jobConf, ScroogeReadSupport.class);
  ThriftReadSupport.setRecordConverterClass(jobConf, ScroogeRecordConverter.class);
}
 
Example 5
Source File: ParquetTBaseScheme.java    From parquet-mr with Apache License 2.0
@Override
public void sourceConfInit(FlowProcess<? extends JobConf> fp,
    Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  super.sourceConfInit(fp, tap, jobConf);
  jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
  ParquetInputFormat.setReadSupportClass(jobConf, ThriftReadSupport.class);
  ThriftReadSupport.setRecordConverterClass(jobConf, TBaseRecordConverter.class);
}
 
Example 6
Source File: ParquetTupleScheme.java    From parquet-mr with Apache License 2.0
@SuppressWarnings("rawtypes")
 @Override
 public void sourceConfInit(FlowProcess<? extends JobConf> fp,
     Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {

   if (filterPredicate != null) {
     ParquetInputFormat.setFilterPredicate(jobConf, filterPredicate);
   }

   jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
   ParquetInputFormat.setReadSupportClass(jobConf, TupleReadSupport.class);
   TupleReadSupport.setRequestedFields(jobConf, getSourceFields());
}
 
Example 7
Source File: TestInputOutputFormat.java    From parquet-mr with Apache License 2.0
private void runMapReduceJob(CompressionCodecName codec, Map<String, String> extraConf) throws IOException, ClassNotFoundException, InterruptedException {
  Configuration conf = new Configuration(this.conf);
  for (Map.Entry<String, String> entry : extraConf.entrySet()) {
    conf.set(entry.getKey(), entry.getValue());
  }
  final FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  fileSystem.delete(outputPath, true);
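  // Write phase: read the text input and write it back out as Parquet via MyWriteSupport.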
  {
    writeJob = new Job(conf, "write");
    TextInputFormat.addInputPath(writeJob, inputPath);
    writeJob.setInputFormatClass(TextInputFormat.class);
    writeJob.setNumReduceTasks(0);
    ParquetOutputFormat.setCompression(writeJob, codec);
    ParquetOutputFormat.setOutputPath(writeJob, parquetPath);
    writeJob.setOutputFormatClass(ParquetOutputFormat.class);
    writeJob.setMapperClass(readMapperClass);

    ParquetOutputFormat.setWriteSupportClass(writeJob, MyWriteSupport.class);
    GroupWriteSupport.setSchema(
            MessageTypeParser.parseMessageType(writeSchema),
            writeJob.getConfiguration());
    writeJob.submit();
    waitForJob(writeJob);
  }
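  // Read phase: read the Parquet back through MyReadSupport and dump it as text.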
  {
    conf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
    readJob = new Job(conf, "read");

    readJob.setInputFormatClass(ParquetInputFormat.class);
    ParquetInputFormat.setReadSupportClass(readJob, MyReadSupport.class);

    ParquetInputFormat.setInputPaths(readJob, parquetPath);
    readJob.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(readJob, outputPath);
    readJob.setMapperClass(writeMapperClass);
    readJob.setNumReduceTasks(0);
    readJob.submit();
    waitForJob(readJob);
  }
}
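The MyReadSupport class referenced in Example 7 is not shown on this page. As a rough stand-in, a custom ReadSupport commonly delegates to an existing one; the hypothetical version below forwards both hooks to GroupReadSupport, which already honors the projection set via ReadSupport.PARQUET_READ_SCHEMA (the parquet.read.schema key used at the start of the read job above):

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.api.InitContext;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.io.api.RecordMaterializer;
import org.apache.parquet.schema.MessageType;

// Hypothetical stand-in for MyReadSupport: delegates both hooks to GroupReadSupport.
public class MyReadSupport extends ReadSupport<Group> {
  private final GroupReadSupport delegate = new GroupReadSupport();

  @Override
  public ReadContext init(InitContext context) {
    // Resolves the requested projection (parquet.read.schema) against the file schema.
    return delegate.init(context);
  }

  @Override
  public RecordMaterializer<Group> prepareForRead(Configuration configuration,
      Map<String, String> keyValueMetaData, MessageType fileSchema, ReadContext readContext) {
    // Builds the converter tree that assembles Group records from column data.
    return delegate.prepareForRead(configuration, keyValueMetaData, fileSchema, readContext);
  }
}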