org.apache.parquet.avro.AvroReadSupport Java Examples

The following examples show how to use org.apache.parquet.avro.AvroReadSupport. They are drawn from open-source projects; the source file and originating project are noted above each example.
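
Before the project examples, here is a minimal, self-contained sketch of the common pattern: set the Avro read schema and requested projection on a Hadoop Configuration, then read records through AvroParquetReader. The file path and the projection schema below are placeholders, not taken from any of the projects that follow.

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetReader;

public class AvroReadSupportSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Placeholder projection schema: read only the "id" column.
    Schema projection = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"User\",\"fields\":"
            + "[{\"name\":\"id\",\"type\":\"long\"}]}");
    // Tell parquet-avro which columns to materialize and which Avro
    // schema to decode them with.
    AvroReadSupport.setRequestedProjection(conf, projection);
    AvroReadSupport.setAvroReadSchema(conf, projection);
    try (ParquetReader<GenericRecord> reader =
        AvroParquetReader.<GenericRecord>builder(new Path("/tmp/data.parquet"))
            .withConf(conf)
            .build()) {
      GenericRecord record;
      while ((record = reader.read()) != null) {
        System.out.println(record);
      }
    }
  }
}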
Example #1
Source File: ParquetUtils.java    From nifi with Apache License 2.0
public static void applyCommonConfig(Configuration conf, ParquetConfig parquetConfig) {
    if (parquetConfig.getAvroReadCompatibility() != null) {
        conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY,
                parquetConfig.getAvroReadCompatibility().booleanValue());
    }

    if (parquetConfig.getAvroAddListElementRecords() != null) {
        conf.setBoolean(AvroSchemaConverter.ADD_LIST_ELEMENT_RECORDS,
                parquetConfig.getAvroAddListElementRecords().booleanValue());
    }

    if (parquetConfig.getAvroWriteOldListStructure() != null) {
        conf.setBoolean(AvroWriteSupport.WRITE_OLD_LIST_STRUCTURE,
                parquetConfig.getAvroWriteOldListStructure().booleanValue());
    }
}
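
For reference, the constants used above resolve to plain Hadoop configuration keys, so the same settings can also be applied as string keys. The two list-related keys below appear verbatim in Examples #4 and #11; the compatibility key is quoted from memory and should be treated as an assumption:

Configuration conf = new Configuration();
conf.setBoolean("parquet.avro.compatible", true);                // AvroReadSupport.AVRO_COMPATIBILITY (assumed key string)
conf.setBoolean("parquet.avro.add-list-element-records", false); // AvroSchemaConverter.ADD_LIST_ELEMENT_RECORDS
conf.setBoolean("parquet.avro.write-old-list-structure", false); // AvroWriteSupport.WRITE_OLD_LIST_STRUCTURE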
 
Example #2
Source File: ParquetFileSystemDatasetReader.java    From kite with Apache License 2.0
@Override
public void initialize() {
  Preconditions.checkState(state.equals(ReaderWriterState.NEW),
    "A reader may not be opened more than once - current state:%s", state);

  LOG.debug("Opening reader on path:{}", path);

  try {
    final Configuration conf = fileSystem.getConf();
    AvroReadSupport.setAvroReadSchema(conf, readerSchema);
    reader = new AvroParquetReader<E>(
        conf, fileSystem.makeQualified(path));
  } catch (IOException e) {
    throw new DatasetIOException("Unable to create reader path:" + path, e);
  }

  advance();

  state = ReaderWriterState.OPEN;
}
 
Example #3
Source File: ParquetFileReader.java    From kafka-connect-fs with Apache License 2.0
private ParquetReader<GenericRecord> initReader() throws IOException {
    Configuration configuration = getFs().getConf();
    if (this.schema != null) {
        AvroReadSupport.setAvroReadSchema(configuration, this.schema);
    }
    if (this.projection != null) {
        AvroReadSupport.setRequestedProjection(configuration, this.projection);
    }
    return AvroParquetReader
            .<GenericRecord>builder(HadoopInputFile.fromPath(getFilePath(), configuration))
            .build();
}
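
A hypothetical caller of initReader() might drain and close the reader like this; process() is a placeholder, not part of kafka-connect-fs:

try (ParquetReader<GenericRecord> reader = initReader()) {
    GenericRecord record;
    while ((record = reader.read()) != null) {
        process(record); // placeholder record handler
    }
}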
 
Example #4
Source File: ParquetReadSupport.java    From iceberg with Apache License 2.0
@Override
@SuppressWarnings("deprecation")
public ReadContext init(Configuration configuration, Map<String, String> keyValueMetaData, MessageType fileSchema) {
  // Columns are selected from the Parquet file by taking the read context's message type and
  // matching to the file's columns by full path, so this must select columns by using the path
  // in the file's schema.

  MessageType projection = hasIds(fileSchema) ?
    pruneColumns(fileSchema, expectedSchema) :
    pruneColumnsFallback(fileSchema, expectedSchema);

  // override some known backward-compatibility options
  configuration.set("parquet.strict.typing", "false");
  configuration.set("parquet.avro.add-list-element-records", "false");
  configuration.set("parquet.avro.write-old-list-structure", "false");

  // set Avro schemas in case the reader is Avro
  AvroReadSupport.setRequestedProjection(configuration,
      AvroSchemaUtil.convert(expectedSchema, projection.getName()));
  org.apache.avro.Schema avroReadSchema = AvroSchemaUtil.buildAvroProjection(
      AvroSchemaUtil.convert(ParquetSchemaUtil.convert(projection), projection.getName()),
      expectedSchema, ImmutableMap.of());
  AvroReadSupport.setAvroReadSchema(configuration, ParquetAvro.parquetAvroSchema(avroReadSchema));

  // let the context set up read support metadata, but always use the correct projection
  ReadContext context = null;
  if (callInit) {
    try {
      context = wrapped.init(configuration, keyValueMetaData, projection);
    } catch (UnsupportedOperationException e) {
      // try the InitContext version
      context = wrapped.init(new InitContext(
          configuration, makeMultimap(keyValueMetaData), projection));
    }
  }

  return new ReadContext(projection,
      context != null ? context.getReadSupportMetadata() : ImmutableMap.of());
}
 
Example #5
Source File: ParquetFileReader.java    From streamx with Apache License 2.0
@Override
public Schema getSchema(Configuration conf, Path path) throws IOException {
  AvroReadSupport<GenericRecord> readSupport = new AvroReadSupport<>();
  ParquetReader.Builder<GenericRecord> builder = ParquetReader.builder(readSupport, path);
  // Scan the whole file; the schema returned is the one attached to the last
  // record read (or null for an empty file). try-with-resources closes the
  // reader even if read() throws.
  try (ParquetReader<GenericRecord> parquetReader = builder.withConf(conf).build()) {
    GenericRecord record;
    Schema schema = null;
    while ((record = parquetReader.read()) != null) {
      schema = avroData.toConnectSchema(record.getSchema());
    }
    return schema;
  }
}
 
Example #6
Source File: ParquetFileReader.java    From streamx with Apache License 2.0
@Override
public Collection<Object> readData(Configuration conf, Path path) throws IOException {
  Collection<Object> result = new ArrayList<>();
  AvroReadSupport<GenericRecord> readSupport = new AvroReadSupport<>();
  ParquetReader.Builder<GenericRecord> builder = ParquetReader.builder(readSupport, path);
  // Load every record into memory; try-with-resources guarantees the reader
  // is closed even if read() throws.
  try (ParquetReader<GenericRecord> parquetReader = builder.withConf(conf).build()) {
    GenericRecord record;
    while ((record = parquetReader.read()) != null) {
      result.add(record);
    }
  }
  return result;
}
 
Example #7
Source File: ParquetUtils.java    From hudi with Apache License 2.0
/**
 * Read the row keys matching the given filter from the given parquet file. If the filter is empty, all row
 * keys are returned.
 *
 * @param filePath      the parquet file path
 * @param configuration configuration used to build the fs object
 * @param filter        record keys filter
 * @param readSchema    schema of columns to be read
 * @return set of row keys matching the given filter
 */
private static Set<String> filterParquetRowKeys(Configuration configuration, Path filePath, Set<String> filter,
                                                Schema readSchema) {
  Option<RecordKeysFilterFunction> filterFunction = Option.empty();
  if (filter != null && !filter.isEmpty()) {
    filterFunction = Option.of(new RecordKeysFilterFunction(filter));
  }
  Configuration conf = new Configuration(configuration);
  conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
  AvroReadSupport.setAvroReadSchema(conf, readSchema);
  AvroReadSupport.setRequestedProjection(conf, readSchema);
  Set<String> rowKeys = new HashSet<>();
  try (ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build()) {
    Object obj = reader.read();
    while (obj != null) {
      if (obj instanceof GenericRecord) {
        String recordKey = ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
        if (!filterFunction.isPresent() || filterFunction.get().apply(recordKey)) {
          rowKeys.add(recordKey);
        }
      }
      obj = reader.read();
    }
  } catch (IOException e) {
    throw new HoodieIOException("Failed to read row keys from Parquet " + filePath, e);
  }
  return rowKeys;
}
 
Example #8
Source File: ParquetUtils.java    From hudi with Apache License 2.0
/**
 * Fetch {@link HoodieKey}s from the given parquet file.
 *
 * @param filePath      The parquet file path.
 * @param configuration configuration to build fs object
 * @return {@link List} of {@link HoodieKey}s fetched from the parquet file
 */
public static List<HoodieKey> fetchRecordKeyPartitionPathFromParquet(Configuration configuration, Path filePath) {
  List<HoodieKey> hoodieKeys = new ArrayList<>();
  try {
    if (!filePath.getFileSystem(configuration).exists(filePath)) {
      return new ArrayList<>();
    }

    Configuration conf = new Configuration(configuration);
    conf.addResource(FSUtils.getFs(filePath.toString(), conf).getConf());
    Schema readSchema = HoodieAvroUtils.getRecordKeyPartitionPathSchema();
    AvroReadSupport.setAvroReadSchema(conf, readSchema);
    AvroReadSupport.setRequestedProjection(conf, readSchema);
    try (ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build()) {
      Object obj = reader.read();
      while (obj != null) {
        if (obj instanceof GenericRecord) {
          String recordKey = ((GenericRecord) obj).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
          String partitionPath = ((GenericRecord) obj).get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
          hoodieKeys.add(new HoodieKey(recordKey, partitionPath));
        }
        // Advance outside the instanceof check so an unexpected row type cannot
        // loop forever; try-with-resources closes the reader.
        obj = reader.read();
      }
    }
  } catch (IOException e) {
    throw new HoodieIOException("Failed to read from Parquet file " + filePath, e);
  }
  return hoodieKeys;
}
 
Example #9
Source File: HDFSParquetImporter.java    From hudi with Apache License 2.0
protected JavaRDD<HoodieRecord<HoodieRecordPayload>> buildHoodieRecordsForImport(JavaSparkContext jsc,
    String schemaStr) throws IOException {
  Job job = Job.getInstance(jsc.hadoopConfiguration());
  // Allow recursive directories to be found
  job.getConfiguration().set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
  // To parallelize reading file status.
  job.getConfiguration().set(FileInputFormat.LIST_STATUS_NUM_THREADS, "1024");
  AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), (new Schema.Parser().parse(schemaStr)));
  ParquetInputFormat.setReadSupportClass(job, (AvroReadSupport.class));

  return jsc.newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class,
          job.getConfiguration())
      // To reduce large number of tasks.
      .coalesce(16 * cfg.parallelism).map(entry -> {
        GenericRecord genericRecord = ((Tuple2<Void, GenericRecord>) entry)._2();
        Object partitionField = genericRecord.get(cfg.partitionKey);
        if (partitionField == null) {
          throw new HoodieIOException("partition key is missing. :" + cfg.partitionKey);
        }
        Object rowField = genericRecord.get(cfg.rowKey);
        if (rowField == null) {
          throw new HoodieIOException("row field is missing. :" + cfg.rowKey);
        }
        String partitionPath = partitionField.toString();
        LOG.debug("Row Key : " + rowField + ", Partition Path is (" + partitionPath + ")");
        if (partitionField instanceof Number) {
          try {
            long ts = (long) (Double.parseDouble(partitionField.toString()) * 1000L);
            partitionPath = PARTITION_FORMATTER.format(Instant.ofEpochMilli(ts));
          } catch (NumberFormatException nfe) {
            LOG.warn("Unable to parse date from partition field. Assuming partition as (" + partitionField + ")");
          }
        }
        return new HoodieRecord<>(new HoodieKey(rowField.toString(), partitionPath),
            new HoodieJsonPayload(genericRecord.toString()));
      });
}
 
Example #10
Source File: FileSystemViewKeyInputFormat.java    From kite with Apache License 2.0
private static void setConfigProperties(Configuration conf, Format format,
                                        Schema schema, Class<?> type) {
  GenericData model = DataModelUtil.getDataModelForType(type);
  if (Formats.AVRO.equals(format)) {
    setModel.invoke(conf, model.getClass());
    conf.set(AVRO_SCHEMA_INPUT_KEY, schema.toString());

  } else if (Formats.PARQUET.equals(format)) {
    // TODO: update to a version of Parquet with setAvroDataSupplier
    //AvroReadSupport.setAvroDataSupplier(conf,
    //    DataModelUtil.supplierClassFor(model));
    AvroReadSupport.setAvroReadSchema(conf, schema);
  }
}
 
Example #11
Source File: ParquetReadSupport.java    From iceberg with Apache License 2.0
@Override
@SuppressWarnings("deprecation")
public ReadContext init(Configuration configuration, Map<String, String> keyValueMetaData, MessageType fileSchema) {
  // Columns are selected from the Parquet file by taking the read context's message type and
  // matching to the file's columns by full path, so this must select columns by using the path
  // in the file's schema.

  MessageType projection;
  if (ParquetSchemaUtil.hasIds(fileSchema)) {
    projection = ParquetSchemaUtil.pruneColumns(fileSchema, expectedSchema);
  } else if (nameMapping != null) {
    MessageType typeWithIds = ParquetSchemaUtil.applyNameMapping(fileSchema, nameMapping);
    projection = ParquetSchemaUtil.pruneColumns(typeWithIds, expectedSchema);
  } else {
    projection = ParquetSchemaUtil.pruneColumnsFallback(fileSchema, expectedSchema);
  }

  // override some known backward-compatibility options
  configuration.set("parquet.strict.typing", "false");
  configuration.set("parquet.avro.add-list-element-records", "false");
  configuration.set("parquet.avro.write-old-list-structure", "false");

  // set Avro schemas in case the reader is Avro
  AvroReadSupport.setRequestedProjection(configuration,
      AvroSchemaUtil.convert(expectedSchema, projection.getName()));
  org.apache.avro.Schema avroReadSchema = AvroSchemaUtil.buildAvroProjection(
      AvroSchemaUtil.convert(ParquetSchemaUtil.convert(projection), projection.getName()),
      expectedSchema, ImmutableMap.of());
  AvroReadSupport.setAvroReadSchema(configuration, ParquetAvro.parquetAvroSchema(avroReadSchema));

  // let the context set up read support metadata, but always use the correct projection
  ReadContext context = null;
  if (callInit) {
    try {
      context = wrapped.init(configuration, keyValueMetaData, projection);
    } catch (UnsupportedOperationException e) {
      // try the InitContext version
      context = wrapped.init(new InitContext(
          configuration, makeMultimap(keyValueMetaData), projection));
    }
  }

  return new ReadContext(projection,
      context != null ? context.getReadSupportMetadata() : ImmutableMap.of());
}
 
Example #12
Source File: HoodieParquetReader.java    From hudi with Apache License 2.0
@Override
public Iterator<R> getRecordIterator(Schema schema) throws IOException {
  AvroReadSupport.setAvroReadSchema(conf, schema);
  ParquetReader<IndexedRecord> reader = AvroParquetReader.<IndexedRecord>builder(path).withConf(conf).build();
  return new ParquetReaderIterator(reader);
}