org.apache.parquet.hadoop.ParquetInputFormat Java Examples

The following examples show how to use org.apache.parquet.hadoop.ParquetInputFormat. Each example is taken from an open source project; the source file, project, and license are listed above the code.
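Before the project-specific examples, here is a minimal sketch of the setup pattern most of them share: pick a ReadSupport implementation, point the input format at the Parquet paths, and optionally push a filter predicate down so non-matching row groups can be skipped. This is an illustrative sketch, not code from any of the projects below; GroupReadSupport (the example ReadSupport that ships with parquet-mr) and the "id" column are placeholders for your own read support and schema.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class ParquetReadJobSetup {
  public static Job configureReadJob(Configuration conf, Path parquetPath) throws Exception {
    Job job = Job.getInstance(conf, "read-parquet");
    job.setInputFormatClass(ParquetInputFormat.class);
    // Tell ParquetInputFormat how to materialize records (placeholder ReadSupport).
    ParquetInputFormat.setReadSupportClass(job, GroupReadSupport.class);
    // Input paths are handled by the FileInputFormat superclass.
    ParquetInputFormat.setInputPaths(job, parquetPath);
    // Optional: push a predicate down so row groups that cannot match are skipped ("id" is a hypothetical column).
    ParquetInputFormat.setFilterPredicate(job.getConfiguration(),
        FilterApi.gt(FilterApi.intColumn("id"), 0));
    return job;
  }
}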
Example #1
Source File: PentahoApacheInputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public IPentahoRecordReader createRecordReader( IPentahoInputSplit split ) throws Exception {
  return inClassloader( () -> {
    PentahoInputSplitImpl pentahoInputSplit = (PentahoInputSplitImpl) split;
    InputSplit inputSplit = pentahoInputSplit.getInputSplit();

    ReadSupport<RowMetaAndData> readSupport = new PentahoParquetReadSupport();

    ParquetRecordReader<RowMetaAndData> nativeRecordReader =
      new ParquetRecordReader<>( readSupport, ParquetInputFormat.getFilter( job
        .getConfiguration() ) );
    TaskAttemptContextImpl task = new TaskAttemptContextImpl( job.getConfiguration(), new TaskAttemptID() );
    nativeRecordReader.initialize( inputSplit, task );

    return new PentahoParquetRecordReader( nativeRecordReader );
  } );
}
 
Example #2
Source File: PentahoApacheInputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
public PentahoApacheInputFormat( NamedCluster namedCluster ) {
  logger.logBasic( "We are initializing parquet input format" );

  inClassloader( () -> {
    ConfigurationProxy conf = new ConfigurationProxy();

    if ( namedCluster != null ) {
      // if named cluster is not defined, no need to add cluster resource configs
      BiConsumer<InputStream, String> consumer = ( is, filename ) -> conf.addResource( is, filename );
      ShimConfigsLoader.addConfigsAsResources( namedCluster, consumer );
    }
    job = Job.getInstance( conf );

    nativeParquetInputFormat = new ParquetInputFormat<>();

    ParquetInputFormat.setReadSupportClass( job, PentahoParquetReadSupport.class );
    ParquetInputFormat.setTaskSideMetaData( job, false );
  } );
}
 
Example #3
Source File: TestInputOutputFormat.java    From parquet-mr with Apache License 2.0
/**
 * Uses a filter that drops all records to test handling of tasks (mappers) that need to do no work at all
 */
@Test
public void testReadWriteTaskSideMDAggressiveFilter() throws IOException, ClassNotFoundException, InterruptedException {
  Configuration conf = new Configuration();

  // this filter predicate should trigger row group filtering that drops all row-groups
  ParquetInputFormat.setFilterPredicate(conf, FilterApi.eq(FilterApi.intColumn("line"), -1000));
  final String fpString = conf.get(ParquetInputFormat.FILTER_PREDICATE);

  runMapReduceJob(CompressionCodecName.UNCOMPRESSED, new HashMap<String, String>() {{
    put("parquet.task.side.metadata", "true");
    put(ParquetInputFormat.FILTER_PREDICATE, fpString);
  }});

  File file = new File(outputPath.toString(), "part-m-00000");
  List<String> lines = Files.readAllLines(file.toPath(), StandardCharsets.UTF_8);
  assertTrue(lines.isEmpty());
}
 
Example #4
Source File: HDFSParquetImporter.java    From hudi with Apache License 2.0
protected JavaRDD<HoodieRecord<HoodieRecordPayload>> buildHoodieRecordsForImport(JavaSparkContext jsc,
    String schemaStr) throws IOException {
  Job job = Job.getInstance(jsc.hadoopConfiguration());
  // Allow recursive directories to be found
  job.getConfiguration().set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
  // To parallelize reading file status.
  job.getConfiguration().set(FileInputFormat.LIST_STATUS_NUM_THREADS, "1024");
  AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), (new Schema.Parser().parse(schemaStr)));
  ParquetInputFormat.setReadSupportClass(job, (AvroReadSupport.class));

  return jsc.newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class,
          job.getConfiguration())
      // To reduce large number of tasks.
      .coalesce(16 * cfg.parallelism).map(entry -> {
        GenericRecord genericRecord = ((Tuple2<Void, GenericRecord>) entry)._2();
        Object partitionField = genericRecord.get(cfg.partitionKey);
        if (partitionField == null) {
          throw new HoodieIOException("partition key is missing. :" + cfg.partitionKey);
        }
        Object rowField = genericRecord.get(cfg.rowKey);
        if (rowField == null) {
          throw new HoodieIOException("row field is missing. :" + cfg.rowKey);
        }
        String partitionPath = partitionField.toString();
        LOG.debug("Row Key : " + rowField + ", Partition Path is (" + partitionPath + ")");
        if (partitionField instanceof Number) {
          try {
            long ts = (long) (Double.parseDouble(partitionField.toString()) * 1000L);
            partitionPath = PARTITION_FORMATTER.format(Instant.ofEpochMilli(ts));
          } catch (NumberFormatException nfe) {
            LOG.warn("Unable to parse date from partition field. Assuming partition as (" + partitionField + ")");
          }
        }
        return new HoodieRecord<>(new HoodieKey(rowField.toString(), partitionPath),
            new HoodieJsonPayload(genericRecord.toString()));
      });
}
 
Example #5
Source File: PentahoApacheInputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
@SuppressWarnings( "squid:CommentedOutCodeLine" )
public void setSplitSize( long blockSize ) throws Exception {
  inClassloader( () ->
    /**
     * TODO File splitting is temporarily disabled. We need a UI checkbox to allow it, because some Parquet files
     * cannot be split due to errors in the previous implementation or other issues. Parquet reports the source of the
     * problem only to the logs, not via an exception. See CorruptDeltaByteArrays.requiresSequentialReads().
     *
     * mapr510 and mapr520 don't support the SPLIT_FILES property.
     */
    // ParquetInputFormat.setMaxInputSplitSize( job, blockSize );
    job.getConfiguration().setBoolean( ParquetInputFormat.SPLIT_FILES, false )
  );
}
 
Example #6
Source File: ParquetAsTextInputFormat.java    From iow-hadoop-streaming with Apache License 2.0
public TextRecordReaderWrapper(ParquetInputFormat<SimpleGroup> newInputFormat,
                           InputSplit oldSplit,
                           JobConf oldJobConf,
                           Reporter reporter) throws IOException {

    splitLen = oldSplit.getLength();

    try {
        ReadSupport<SimpleGroup> rs = ParquetInputFormat.getReadSupportInstance(oldJobConf);
        realReader = new ParquetRecordReader<>(rs);
        realReader.initialize(((StreamingParquetInputSplitWrapper)oldSplit).realSplit, oldJobConf, reporter);

        oldJobConf.set("map.input.file",((StreamingParquetInputSplitWrapper)oldSplit).realSplit.getPath().toString());
        oldJobConf.set("mapreduce.map.input.file",((StreamingParquetInputSplitWrapper)oldSplit).realSplit.getPath().toString());

        // read once to gain access to key and value objects
        if (realReader.nextKeyValue()) {

          firstRecord = true;
          valueContainer = new Container<>();
          SimpleGroup v = realReader.getCurrentValue();
          valueContainer.set(v);
          ls = groupToStrings(v);
        } else {

          eof = true;
        }
    } catch (InterruptedException e) {
        Thread.interrupted();
        throw new IOException(e);
    }
}
 
Example #7
Source File: ParquetLoader.java    From parquet-mr with Apache License 2.0
@Override
public void setPushdownPredicate(Expression e) throws IOException {
  LOG.info("Pig pushdown expression: {}", e);

  FilterPredicate pred = buildFilter(e);
  LOG.info("Parquet filter predicate expression: {}", pred);

  storeInUDFContext(ParquetInputFormat.FILTER_PREDICATE, pred);
}
 
Example #8
Source File: ParquetLoader.java    From parquet-mr with Apache License 2.0
private ParquetInputFormat<Tuple> getParquetInputFormat() throws ParserException {
  checkSetLocationHasBeenCalled();
  if (parquetInputFormat == null) {
    // unfortunately Pig will create many Loaders, so we cache the inputformat to avoid reading the metadata more than once
    Reference<ParquetInputFormat<Tuple>> ref = inputFormatCache.get(location);
    parquetInputFormat = ref == null ? null : ref.get();
    if (parquetInputFormat == null) {
      parquetInputFormat = new UnregisteringParquetInputFormat(location);
      inputFormatCache.put(location, new SoftReference<ParquetInputFormat<Tuple>>(parquetInputFormat));
    }
  }
  return parquetInputFormat;
}
 
Example #9
Source File: ParquetLoader.java    From parquet-mr with Apache License 2.0
private void setInput(String location, Job job) throws IOException {
  this.setLocationHasBeenCalled  = true;
  this.location = location;
  setInputPaths(job, location);

  //This is prior to load because the initial value comes from the constructor
  //not file metadata or pig framework and would get overwritten in initSchema().
  if(UDFContext.getUDFContext().isFrontend()) {
    storeInUDFContext(PARQUET_COLUMN_INDEX_ACCESS, Boolean.toString(columnIndexAccess));
  }

  schema = PigSchemaConverter.parsePigSchema(getPropertyFromUDFContext(PARQUET_PIG_SCHEMA));
  requiredFieldList = PigSchemaConverter.deserializeRequiredFieldList(getPropertyFromUDFContext(PARQUET_PIG_REQUIRED_FIELDS));
  columnIndexAccess = Boolean.parseBoolean(getPropertyFromUDFContext(PARQUET_COLUMN_INDEX_ACCESS));

  initSchema(job);

  if(UDFContext.getUDFContext().isFrontend()) {
    //Setting for task-side loading via initSchema()
    storeInUDFContext(PARQUET_PIG_SCHEMA, pigSchemaToString(schema));
    storeInUDFContext(PARQUET_PIG_REQUIRED_FIELDS, serializeRequiredFieldList(requiredFieldList));
  }

  //Used by task-side loader via TupleReadSupport
  getConfiguration(job).set(PARQUET_PIG_SCHEMA, pigSchemaToString(schema));
  getConfiguration(job).set(PARQUET_PIG_REQUIRED_FIELDS, serializeRequiredFieldList(requiredFieldList));
  getConfiguration(job).set(PARQUET_COLUMN_INDEX_ACCESS, Boolean.toString(columnIndexAccess));

  FilterPredicate filterPredicate = (FilterPredicate) getFromUDFContext(ParquetInputFormat.FILTER_PREDICATE);
  if(filterPredicate != null) {
    ParquetInputFormat.setFilterPredicate(getConfiguration(job), filterPredicate);
  }
}
 
Example #10
Source File: TestMapredParquetInputFormat.java    From parquet-mr with Apache License 2.0
@SuppressWarnings("unchecked")
@Test
public void testConstructorWithParquetInputFormat() {
  new MapredParquetInputFormat(
      (ParquetInputFormat<ArrayWritable>) mock(ParquetInputFormat.class)
      );
}
 
Example #11
Source File: ParquetRecordReaderWrapper.java    From parquet-mr with Apache License 2.0
public ParquetRecordReaderWrapper(
    final ParquetInputFormat<ArrayWritable> newInputFormat,
    final InputSplit oldSplit,
    final JobConf oldJobConf,
    final Reporter reporter)
        throws IOException, InterruptedException {
  this(newInputFormat, oldSplit, oldJobConf, reporter,
      (new HiveBindingFactory()).create());
}
 
Example #12
Source File: ParquetTupleScheme.java    From parquet-mr with Apache License 2.0
@SuppressWarnings("rawtypes")
 @Override
 public void sourceConfInit(FlowProcess<? extends JobConf> fp,
     Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {

   if (filterPredicate != null) {
     ParquetInputFormat.setFilterPredicate(jobConf, filterPredicate);
   }

   jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
   ParquetInputFormat.setReadSupportClass(jobConf, TupleReadSupport.class);
   TupleReadSupport.setRequestedFields(jobConf, getSourceFields());
}
 
Example #13
Source File: ParquetTBaseScheme.java    From parquet-mr with Apache License 2.0
@Override
public void sourceConfInit(FlowProcess<? extends JobConf> fp,
    Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  super.sourceConfInit(fp, tap, jobConf);
  jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
  ParquetInputFormat.setReadSupportClass(jobConf, ThriftReadSupport.class);
  ThriftReadSupport.setRecordConverterClass(jobConf, TBaseRecordConverter.class);
}
 
Example #14
Source File: ParquetScroogeScheme.java    From parquet-mr with Apache License 2.0
@Override
public void sourceConfInit(FlowProcess<JobConf> fp,
    Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  super.sourceConfInit(fp, tap, jobConf);
  jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
  ParquetInputFormat.setReadSupportClass(jobConf, ScroogeReadSupport.class);
  ThriftReadSupport.setRecordConverterClass(jobConf, ScroogeRecordConverter.class);
}
 
Example #15
Source File: DeprecatedParquetInputFormat.java    From parquet-mr with Apache License 2.0
public RecordReaderWrapper(
    InputSplit oldSplit, JobConf oldJobConf, Reporter reporter)
    throws IOException {
  splitLen = oldSplit.getLength();

  try {
    realReader = new ParquetRecordReader<V>(
        ParquetInputFormat.<V>getReadSupportInstance(oldJobConf),
        ParquetInputFormat.getFilter(oldJobConf));

    if (oldSplit instanceof ParquetInputSplitWrapper) {
      realReader.initialize(((ParquetInputSplitWrapper) oldSplit).realSplit, oldJobConf, reporter);
    } else if (oldSplit instanceof FileSplit) {
      realReader.initialize((FileSplit) oldSplit, oldJobConf, reporter);
    } else {
      throw new IllegalArgumentException(
          "Invalid split (not a FileSplit or ParquetInputSplitWrapper): " + oldSplit);
    }

    // read once to gain access to key and value objects
    if (realReader.nextKeyValue()) {
      firstRecord = true;
      valueContainer = new Container<V>();
      valueContainer.set(realReader.getCurrentValue());

    } else {
      eof = true;
    }
  } catch (InterruptedException e) {
    Thread.interrupted();
    throw new IOException(e);
  }
}
 
Example #16
Source File: ParquetTupleScheme.java    From parquet-mr with Apache License 2.0
@SuppressWarnings("rawtypes")
 @Override
 public void sourceConfInit(FlowProcess<JobConf> fp,
     Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {

   if (filterPredicate != null) {
     ParquetInputFormat.setFilterPredicate(jobConf, filterPredicate);
   }

   jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
   ParquetInputFormat.setReadSupportClass(jobConf, TupleReadSupport.class);
   TupleReadSupport.setRequestedFields(jobConf, getSourceFields());
}
 
Example #17
Source File: ParquetTBaseScheme.java    From parquet-mr with Apache License 2.0
@Override
public void sourceConfInit(FlowProcess<JobConf> fp,
    Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  super.sourceConfInit(fp, tap, jobConf);
  jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
  ParquetInputFormat.setReadSupportClass(jobConf, ThriftReadSupport.class);
  ThriftReadSupport.setRecordConverterClass(jobConf, TBaseRecordConverter.class);
}
 
Example #18
Source File: ParquetValueScheme.java    From parquet-mr with Apache License 2.0
private void setPredicatePushdown(JobConf jobConf) {
  if (this.config.filterPredicate != null) {
    ParquetInputFormat.setFilterPredicate(jobConf, this.config.filterPredicate);
  }
}
 
Example #19
Source File: MapredParquetInputFormat.java    From parquet-mr with Apache License 2.0
public MapredParquetInputFormat() {
  this(new ParquetInputFormat<ArrayWritable>(DataWritableReadSupport.class));
}
 
Example #20
Source File: MapredParquetInputFormat.java    From parquet-mr with Apache License 2.0
protected MapredParquetInputFormat(final ParquetInputFormat<ArrayWritable> inputFormat) {
  this.realInput = inputFormat;
}
 
Example #21
Source File: MapredParquetInputFormat.java    From parquet-mr with Apache License 2.0
public MapredParquetInputFormat(final ParquetInputFormat<ArrayWritable> realInputFormat) {
  super(realInputFormat);
}
 
Example #22
Source File: DeprecatedParquetInputFormat.java    From parquet-mr with Apache License 2.0
public DeprecatedParquetInputFormat(final ParquetInputFormat<ArrayWritable> realInputFormat) {
  super(realInputFormat);
}
 
Example #23
Source File: TestInputOutputFormat.java    From parquet-mr with Apache License 2.0
@Test
public void testReadWriteFilter() throws IOException, ClassNotFoundException, InterruptedException {
  Configuration conf = new Configuration();

  // this filter predicate should keep some records but not all (first 500 characters)
  // "line" is actually position in the file...
  ParquetInputFormat.setFilterPredicate(conf, FilterApi.lt(FilterApi.intColumn("line"), 500));
  final String fpString = conf.get(ParquetInputFormat.FILTER_PREDICATE);

  runMapReduceJob(CompressionCodecName.UNCOMPRESSED, new HashMap<String, String>() {{
    put("parquet.task.side.metadata", "true");
    put(ParquetInputFormat.FILTER_PREDICATE, fpString);
  }});

  File file = new File(inputPath.toString());
  List<String> expected = Files.readAllLines(file.toPath(), StandardCharsets.UTF_8);

  // grab the lines that contain the first 500 characters (including the rest of the line past 500 characters)
  int size = 0;
  Iterator<String> iter = expected.iterator();
  while(iter.hasNext()) {
    String next = iter.next();

    if (size < 500) {
      size += next.length();
      continue;
    }

    iter.remove();
  }

  // put the output back into its original format (remove the character counts / tabs)
  File file2 = new File(outputPath.toString(), "part-m-00000");
  List<String> found = Files.readAllLines(file2.toPath(), StandardCharsets.UTF_8);
  StringBuilder sbFound = new StringBuilder();
  for (String line : found) {
    sbFound.append(line.split("\t", -1)[1]);
    sbFound.append("\n");
  }

  sbFound.deleteCharAt(sbFound.length() - 1);

  assertEquals(String.join("\n", expected), sbFound.toString());
}
 
Example #24
Source File: TestInputOutputFormat.java    From parquet-mr with Apache License 2.0
private void runMapReduceJob(CompressionCodecName codec, Map<String, String> extraConf) throws IOException, ClassNotFoundException, InterruptedException {
  Configuration conf = new Configuration(this.conf);
  for (Map.Entry<String, String> entry : extraConf.entrySet()) {
    conf.set(entry.getKey(), entry.getValue());
  }
  final FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  fileSystem.delete(outputPath, true);
  {
    writeJob = new Job(conf, "write");
    TextInputFormat.addInputPath(writeJob, inputPath);
    writeJob.setInputFormatClass(TextInputFormat.class);
    writeJob.setNumReduceTasks(0);
    ParquetOutputFormat.setCompression(writeJob, codec);
    ParquetOutputFormat.setOutputPath(writeJob, parquetPath);
    writeJob.setOutputFormatClass(ParquetOutputFormat.class);
    writeJob.setMapperClass(readMapperClass);

    ParquetOutputFormat.setWriteSupportClass(writeJob, MyWriteSupport.class);
    GroupWriteSupport.setSchema(
            MessageTypeParser.parseMessageType(writeSchema),
            writeJob.getConfiguration());
    writeJob.submit();
    waitForJob(writeJob);
  }
  {
    conf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
    readJob = new Job(conf, "read");

    readJob.setInputFormatClass(ParquetInputFormat.class);
    ParquetInputFormat.setReadSupportClass(readJob, MyReadSupport.class);

    ParquetInputFormat.setInputPaths(readJob, parquetPath);
    readJob.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(readJob, outputPath);
    readJob.setMapperClass(writeMapperClass);
    readJob.setNumReduceTasks(0);
    readJob.submit();
    waitForJob(readJob);
  }
}
 
Example #25
Source File: DeprecatedParquetInputFormat.java    From parquet-mr with Apache License 2.0
public static boolean isTaskSideMetaData(JobConf job) {
  return job.getBoolean(ParquetInputFormat.TASK_SIDE_METADATA, TRUE);
}
 
Example #26
Source File: ParquetAsJsonInputFormat.java    From iow-hadoop-streaming with Apache License 2.0
public JsonRecordReaderWrapper(ParquetInputFormat<SimpleGroup> newInputFormat, InputSplit oldSplit,
        JobConf oldJobConf, Reporter reporter) throws IOException {
    super(newInputFormat, oldSplit, oldJobConf, reporter);
}
 
Example #27
Source File: ParquetValueScheme.java    From parquet-mr with Apache License 2.0
private void setPredicatePushdown(JobConf jobConf) {
  if (this.config.filterPredicate != null) {
    ParquetInputFormat.setFilterPredicate(jobConf, this.config.filterPredicate);
  }
}
 
Example #28
Source File: DeprecatedParquetInputFormat.java    From dremio-oss with Apache License 2.0
public DeprecatedParquetInputFormat(final ParquetInputFormat<ArrayWritable> realInputFormat) {
  super();
}