Java Code Examples for org.apache.parquet.io.api.RecordMaterializer

The following examples show how to use org.apache.parquet.io.api.RecordMaterializer. These examples are extracted from open source projects; the source project, file, and license are listed above each example.
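Before the examples, here is a minimal sketch of the contract they all rely on. A RecordMaterializer<T> ties together two things: getRootConverter() returns the GroupConverter tree that receives primitive values as a record is read, and getCurrentRecord() returns the record assembled from those values once the root converter's end() has been called. The class below is an illustrative assumption, not taken from any of the projects listed here; it assumes a schema with a single UTF8 binary column.

import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.Converter;
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.io.api.RecordMaterializer;

public class SingleStringMaterializer extends RecordMaterializer<String> {

  private String current;

  // Converter for the single primitive field; called once per value read.
  private final PrimitiveConverter fieldConverter = new PrimitiveConverter() {
    @Override
    public void addBinary(Binary value) {
      current = value.toStringUsingUTF8();
    }
  };

  // Root converter: hands out one child converter per field of the message type.
  private final GroupConverter rootConverter = new GroupConverter() {
    @Override
    public Converter getConverter(int fieldIndex) {
      return fieldConverter;
    }

    @Override
    public void start() {
      current = null; // a new record begins
    }

    @Override
    public void end() {
      // the record is complete; getCurrentRecord() may now be called
    }
  };

  @Override
  public String getCurrentRecord() {
    return current;
  }

  @Override
  public GroupConverter getRootConverter() {
    return rootConverter;
  }
}

A materializer like this is driven by a RecordReader (see Example 2): columnIO.getRecordReader(pageStore, materializer) wires the root converter to the column readers, and each read() call delivers one materialized record.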
Example 1
Source Project: parquet-mr   Source File: FilteringRecordMaterializer.java    License: Apache License 2.0
public FilteringRecordMaterializer(
    RecordMaterializer<T> delegate,
    List<PrimitiveColumnIO> columnIOs,
    Map<ColumnPath, List<ValueInspector>> valueInspectorsByColumn,
    IncrementallyUpdatedFilterPredicate filterPredicate) {

  Objects.requireNonNull(columnIOs, "columnIOs cannot be null");
  Objects.requireNonNull(valueInspectorsByColumn, "valueInspectorsByColumn cannot be null");
  this.filterPredicate = Objects.requireNonNull(filterPredicate, "filterPredicate cannot be null");
  this.delegate = Objects.requireNonNull(delegate, "delegate cannot be null");

  // keep track of which path of indices leads to which primitive column
  Map<List<Integer>, PrimitiveColumnIO> columnIOsByIndexFieldPath = new HashMap<>();

  for (PrimitiveColumnIO c : columnIOs) {
    List<Integer> indexFieldPath = Arrays.stream(c.getIndexFieldPath())
        .boxed().collect(Collectors.toList());
    columnIOsByIndexFieldPath.put(indexFieldPath, c);
  }

  // create a proxy for the delegate's root converter
  this.rootConverter = new FilteringGroupConverter(
      delegate.getRootConverter(), Collections.emptyList(),
      valueInspectorsByColumn, columnIOsByIndexFieldPath);
}
 
Example 2
Source Project: parquet-mr   Source File: PerfTest.java    License: Apache License 2.0
private static void read(MemPageStore memPageStore, MessageType myschema,
    String message) {
  MessageColumnIO columnIO = newColumnFactory(myschema);
  System.out.println(message);
  RecordMaterializer<Object> recordConsumer = new DummyRecordConverter(myschema);
  RecordReader<Object> recordReader = columnIO.getRecordReader(memPageStore, recordConsumer);

  read(recordReader, 2, myschema);
  read(recordReader, 10000, myschema);
  read(recordReader, 10000, myschema);
  read(recordReader, 10000, myschema);
  read(recordReader, 10000, myschema);
  read(recordReader, 10000, myschema);
  read(recordReader, 100000, myschema);
  read(recordReader, 1000000, myschema);
  System.out.println();
}
 
Example 3
Source Project: parquet-mr   Source File: TestFiltered.java    License: Apache License 2.0
@Test
public void testFilterOnInteger() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // Get first record
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter, FilterCompat.get(column("DocId", equalTo(10L))));

  readOne(recordReader, "r2 filtered out", r1);

  // Get second record
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("DocId", equalTo(20L))));

  readOne(recordReader, "r1 filtered out", r2);
}
 
Example 4
Source Project: parquet-mr   Source File: TestFiltered.java    License: Apache License 2.0
@Test
public void testApplyFunctionFilterOnLong() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // Get first record
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("DocId", equalTo(10L))));

  readOne(recordReader, "r2 filtered out", r1);

  // Get second record
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("DocId", applyFunctionToLong(new LongGreaterThan15Predicate()))));

  readOne(recordReader, "r1 filtered out", r2);
}
 
Example 5
Source Project: parquet-mr   Source File: TestFiltered.java    License: Apache License 2.0
@Test
public void testPaged() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 6);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(page(4, 4)));

  List<Group> all = readAll(recordReader);
  assertEquals("expecting records " + all, 4, all.size());
  for (int i = 0; i < all.size(); i++) {
    assertEquals("expecting record", (i%2 == 0 ? r2 : r1).toString(), all.get(i).toString());
  }
}
 
Example 6
Source Project: parquet-mr   Source File: TestFiltered.java    License: Apache License 2.0
@Test
public void testFilteredAndPaged() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 8);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(and(column("DocId", equalTo(10L)), page(2, 4))));

  List<Group> all = readAll(recordReader);
  assertEquals("expecting 4 records " + all, 4, all.size());
  for (int i = 0; i < all.size(); i++) {
    assertEquals("expecting record1", r1.toString(), all.get(i).toString());
  }
}
 
Example 7
Source Project: parquet-mr   Source File: TestFiltered.java    License: Apache License 2.0
@Test
public void testFilteredOrPaged() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 8);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(or(column("DocId", equalTo(10L)),
              column("DocId", equalTo(20L)))));

  List<Group> all = readAll(recordReader);
  assertEquals("expecting 8 records " + all, 16, all.size());
  for (int i = 0; i < all.size () / 2; i++) {
    assertEquals("expecting record1", r1.toString(), all.get(2 * i).toString());
    assertEquals("expecting record2", r2.toString(), all.get(2 * i + 1).toString());
  }
}
 
Example 8
Source Project: parquet-mr   Source File: TestFiltered.java    License: Apache License 2.0
@Test
public void testFilteredNotPaged() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 8);

  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(not(column("DocId", equalTo(10L)))));

  List<Group> all = readAll(recordReader);
  assertEquals("expecting 8 records " + all, 8, all.size());
  for (int i = 0; i < all.size(); i++) {
    assertEquals("expecting record2", r2.toString(), all.get(i).toString());
  }
}
 
Example 9
Source Project: parquet-mr   Source File: ThriftReadSupport.java    License: Apache License 2.0
@Override
public RecordMaterializer<T> prepareForRead(Configuration configuration,
    Map<String, String> keyValueMetaData, MessageType fileSchema,
    org.apache.parquet.hadoop.api.ReadSupport.ReadContext readContext) {
  ThriftMetaData thriftMetaData = ThriftMetaData.fromExtraMetaData(keyValueMetaData);
  try {
    initThriftClass(thriftMetaData, configuration);
  } catch (ClassNotFoundException e) {
    throw new RuntimeException("Cannot find Thrift object class for metadata: " + thriftMetaData, e);
  }

  // if there was no metadata in the file, get it from the requested class
  if (thriftMetaData == null) {
    thriftMetaData = ThriftMetaData.fromThriftClass(thriftClass);
  }

  String converterClassName = configuration.get(RECORD_CONVERTER_CLASS_KEY, RECORD_CONVERTER_DEFAULT);
  return getRecordConverterInstance(converterClassName, thriftClass,
      readContext.getRequestedSchema(), thriftMetaData.getDescriptor(),
      configuration);
}
 
Example 10
Source Project: parquet-mr   Source File: ProtoReadSupport.java    License: Apache License 2.0
@Override
public RecordMaterializer<T> prepareForRead(Configuration configuration, Map<String, String> keyValueMetaData, MessageType fileSchema, ReadContext readContext) {
  String headerProtoClass = keyValueMetaData.get(PB_CLASS);
  String configuredProtoClass = configuration.get(PB_CLASS);

  if (configuredProtoClass != null) {
    LOG.debug("Replacing class " + headerProtoClass + " by " + configuredProtoClass);
    headerProtoClass = configuredProtoClass;
  }

  if (headerProtoClass == null) {
    throw new RuntimeException("I Need parameter " + PB_CLASS + " with Protocol Buffer class");
  }

  LOG.debug("Reading data with Protocol Buffer class {}", headerProtoClass);

  MessageType requestedSchema = readContext.getRequestedSchema();
  Class<? extends Message> protobufClass = Protobufs.getProtobufClass(headerProtoClass);
  return new ProtoRecordMaterializer(requestedSchema, protobufClass);
}
 
Example 11
Source Project: parquet-mr   Source File: TupleReadSupport.java    License: Apache License 2.0
@Override
public RecordMaterializer<Tuple> prepareForRead(
    Configuration configuration,
    Map<String, String> keyValueMetaData,
    MessageType fileSchema,
    ReadContext readContext) {
  MessageType requestedSchema = readContext.getRequestedSchema();
  Schema requestedPigSchema = getPigSchema(configuration);

  if (requestedPigSchema == null) {
    throw new ParquetDecodingException("Missing Pig schema: ParquetLoader sets the schema in the job conf");
  }
  boolean elephantBirdCompatible = configuration.getBoolean(PARQUET_PIG_ELEPHANT_BIRD_COMPATIBLE, false);
  boolean columnIndexAccess = configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);
  if (elephantBirdCompatible) {
    LOG.info("Numbers will default to 0 instead of NULL; Boolean will be converted to Int");
  }
  return new TupleRecordMaterializer(requestedSchema, requestedPigSchema, elephantBirdCompatible, columnIndexAccess);
}
 
Example 12
Source Project: parquet-mr   Source File: TestTupleRecordConsumer.java    License: Apache License 2.0
private void testFromTuple(String pigSchemaString, List<Tuple> input) throws Exception {
  List<Tuple> tuples = new ArrayList<Tuple>();
  RecordMaterializer<Tuple> recordConsumer = newPigRecordConsumer(pigSchemaString);
  TupleWriteSupport tupleWriter = newTupleWriter(pigSchemaString, recordConsumer);
  for (Tuple tuple : input) {
    LOG.debug("{}", tuple);
    tupleWriter.write(tuple);
    tuples.add(recordConsumer.getCurrentRecord());
  }

  assertEquals(input.size(), tuples.size());
  for (int i = 0; i < input.size(); i++) {
    Tuple in = input.get(i);
    Tuple out = tuples.get(i);
    assertEquals(in.toString(), out.toString());
  }
}
 
Example 13
Source Project: parquet-mr   Source File: TupleConsumerPerfTest.java    License: Apache License 2.0
private static void read(PageReadStore columns, String pigSchemaString, String message) throws ParserException {
    System.out.println(message);
    MessageColumnIO columnIO = newColumnFactory(pigSchemaString);
    TupleReadSupport tupleReadSupport = new TupleReadSupport();
    Map<String, String> pigMetaData = pigMetaData(pigSchemaString);
    MessageType schema = new PigSchemaConverter().convert(Utils.getSchemaFromString(pigSchemaString));
    ReadContext init = tupleReadSupport.init(null, pigMetaData, schema);
    RecordMaterializer<Tuple> recordConsumer = tupleReadSupport.prepareForRead(null, pigMetaData, schema, init);
    RecordReader<Tuple> recordReader = columnIO.getRecordReader(columns, recordConsumer);
    // TODO: put this back
//  if (DEBUG) {
//    recordConsumer = new RecordConsumerLoggingWrapper(recordConsumer);
//  }
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 100000, pigSchemaString);
    read(recordReader, 1000000, pigSchemaString);
    System.out.println();
  }
 
Example 14
Source Project: iceberg   Source File: ParquetReadSupport.java    License: Apache License 2.0
@Override
public RecordMaterializer<T> prepareForRead(Configuration configuration,
                                            Map<String, String> fileMetadata,
                                            MessageType fileMessageType,
                                            ReadContext readContext) {
  // This is the type created in init that was based on the file's schema. The schema that this
  // will pass to the wrapped ReadSupport needs to match the expected schema's names. Rather than
  // renaming the file's schema, convert the expected schema to Parquet. This relies on writing
  // files with the correct schema.
  // TODO: this breaks when columns are reordered.
  MessageType readSchema = ParquetSchemaUtil.convert(expectedSchema, fileMessageType.getName());
  return wrapped.prepareForRead(configuration, fileMetadata, readSchema, readContext);
}
 
Example 15
Source Project: tajo   Source File: TajoReadSupport.java    License: Apache License 2.0
/**
 * Prepares for read.
 *
 * @param configuration The job configuration.
 * @param keyValueMetaData App-specific metadata from the file.
 * @param fileSchema The schema of the Parquet file.
 * @param readContext Returned by the init method.
 */
@Override
public RecordMaterializer<Tuple> prepareForRead(
    Configuration configuration,
    Map<String, String> keyValueMetaData,
    MessageType fileSchema,
    ReadContext readContext) {
  MessageType parquetRequestedSchema = readContext.getRequestedSchema();
  return new TajoRecordMaterializer(parquetRequestedSchema, requestedSchema, readSchema);
}
 
Example 16
Source Project: parquet-mr   Source File: FilteredRecordReader.java    License: Apache License 2.0
/**
 * @param root               the root of the schema
 * @param recordMaterializer materializes records from the converter events
 * @param validating         whether to validate records against the schema
 * @param columnStore        the store to read column data from
 * @param unboundFilter      filter for records; pass in NULL_FILTER to leave unfiltered
 * @param recordCount        the total number of records to read
 */
public FilteredRecordReader(MessageColumnIO root, RecordMaterializer<T> recordMaterializer, boolean validating,
                            ColumnReadStoreImpl columnStore, UnboundRecordFilter unboundFilter, long recordCount) {
  super(root, recordMaterializer, validating, columnStore);
  this.recordCount = recordCount;
  if (unboundFilter != null) {
    recordFilter = unboundFilter.bind(getColumnReaders());
  } else {
    recordFilter = null;
  }
}
 
Example 17
Source Project: parquet-mr   Source File: TestFiltered.java    License: Apache License 2.0
@Test
public void testFilterOnString() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // First try matching against the A url in record 1
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://A"))));

  readOne(recordReader, "r2 filtered out", r1);

  // Second try matching against the B url in record 1 - it should fail, as we only match
  // against the first instance of a repeated column
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://B"))));

  List<Group> all = readAll(recordReader);
  assertEquals("There should be no matching records: " + all , 0, all.size());

  // Finally try matching against the C url in record 2
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://C"))));

  readOne(recordReader, "r1 filtered out", r2);
}
 
Example 18
Source Project: parquet-mr   Source File: TestFiltered.java    License: Apache License 2.0
@Test
public void testApplyFunctionFilterOnString() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // First try matching against the A url in record 1
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", applyFunctionToString(new StringEndsWithAPredicate()))));

  readOne(recordReader, "r2 filtered out", r1);

  // Second try matching against the B url in record 1 - it should fail, as we only match
  // against the first instance of a repeated column
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://B"))));

  List<Group> all = readAll(recordReader);
  assertEquals("There should be no matching records: " + all , 0, all.size());

  // Finally try matching against the C url in record 2
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://C"))));

  readOne(recordReader, "r1 filtered out", r2);
}
 
Example 19
Source Project: parquet-mr   Source File: DelegatingReadSupport.java    License: Apache License 2.0
@Override
public RecordMaterializer<T> prepareForRead(
    Configuration configuration,
    Map<String, String> keyValueMetaData,
    MessageType fileSchema,
    ReadSupport.ReadContext readContext) {
  return delegate.prepareForRead(configuration, keyValueMetaData, fileSchema, readContext);
}
 
Example 20
Source Project: parquet-mr   Source File: TupleReadSupport.java    License: Apache License 2.0
@Override
public RecordMaterializer<Tuple> prepareForRead(
    Configuration configuration,
    Map<String, String> keyValueMetaData,
    MessageType fileSchema,
    ReadContext readContext) {
  MessageType requestedSchema = readContext.getRequestedSchema();
  return new TupleRecordMaterializer(requestedSchema);
}
 
Example 21
Source Project: parquet-mr   Source File: DataWritableReadSupport.java    License: Apache License 2.0
/**
 * Creates the Hive read support used to interpret Parquet data as Hive rows.
 *
 * @param configuration unused
 * @param keyValueMetaData string map of app-specific metadata from the file
 * @param fileSchema unused
 * @param readContext contains the requested schema and the schema of the Hive table
 * @return the record materializer for Hive
 */
@Override
public RecordMaterializer<ArrayWritable> prepareForRead(final Configuration configuration,
    final Map<String, String> keyValueMetaData, final MessageType fileSchema,
        final org.apache.parquet.hadoop.api.ReadSupport.ReadContext readContext) {
  final Map<String, String> metadata = readContext.getReadSupportMetadata();
  if (metadata == null) {
    throw new IllegalStateException("ReadContext not initialized properly. " +
      "Don't know the Hive Schema.");
  }
  final MessageType tableSchema = resolveSchemaAccess(MessageTypeParser.
      parseMessageType(metadata.get(HIVE_SCHEMA_KEY)), fileSchema, configuration);

  return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema);
}
 
Example 22
Source Project: parquet-mr   Source File: TestTupleRecordConsumer.java    License: Apache License 2.0
private void testFromGroups(String pigSchemaString, List<Group> input) throws ParserException {
  List<Tuple> tuples = new ArrayList<Tuple>();
  MessageType schema = getMessageType(pigSchemaString);
  RecordMaterializer<Tuple> pigRecordConsumer = newPigRecordConsumer(pigSchemaString);
  GroupWriter groupWriter = new GroupWriter(new RecordConsumerLoggingWrapper(new ConverterConsumer(pigRecordConsumer.getRootConverter(), schema)), schema);

  for (Group group : input) {
    groupWriter.write(group);
    final Tuple tuple = pigRecordConsumer.getCurrentRecord();
    tuples.add(tuple);
    LOG.debug("in: {}\nout:{}", group, tuple);
  }

  List<Group> groups = new ArrayList<Group>();
  GroupRecordConverter recordConsumer = new GroupRecordConverter(schema);
  TupleWriteSupport tupleWriter = newTupleWriter(pigSchemaString, recordConsumer);
  for (Tuple t : tuples) {
    LOG.debug("{}", t);
    tupleWriter.write(t);
    groups.add(recordConsumer.getCurrentRecord());
  }

  assertEquals(input.size(), groups.size());
  for (int i = 0; i < input.size(); i++) {
    Group in = input.get(i);
    LOG.debug("{}", in);
    Group out = groups.get(i);
    assertEquals(in.toString(), out.toString());
  }
}
 
Example 23
Source Project: parquet-mr   Source File: TestTupleRecordConsumer.java    License: Apache License 2.0
private <T> TupleWriteSupport newTupleWriter(String pigSchemaString, RecordMaterializer<T> recordConsumer) throws ParserException {
  TupleWriteSupport tupleWriter = TupleWriteSupport.fromPigSchema(pigSchemaString);
  tupleWriter.init(null);
  tupleWriter.prepareForWrite(
      new ConverterConsumer(recordConsumer.getRootConverter(), tupleWriter.getParquetSchema()));
  return tupleWriter;
}
 
Example 24
Source Project: parquet-mr   Source File: TestTupleRecordConsumer.java    License: Apache License 2.0
private RecordMaterializer<Tuple> newPigRecordConsumer(String pigSchemaString) throws ParserException {
  TupleReadSupport tupleReadSupport = new TupleReadSupport();
  final Configuration configuration = new Configuration(false);
  MessageType parquetSchema = getMessageType(pigSchemaString);
  final Map<String, String> pigMetaData = pigMetaData(pigSchemaString);
  Map<String, Set<String>> globalMetaData = new HashMap<String, Set<String>>();
  for (Entry<String, String> entry : pigMetaData.entrySet()) {
    globalMetaData.put(entry.getKey(), new HashSet<String>(Arrays.asList(entry.getValue())));
  }
  configuration.set(PARQUET_PIG_SCHEMA, pigSchemaString);
  final ReadContext init = tupleReadSupport.init(new InitContext(configuration, globalMetaData, parquetSchema));
  return tupleReadSupport.prepareForRead(configuration, pigMetaData, parquetSchema, init);
}
 
Example 25
Source Project: parquet-mr   Source File: AvroReadSupport.java    License: Apache License 2.0
@Override
public RecordMaterializer<T> prepareForRead(
    Configuration configuration, Map<String, String> keyValueMetaData,
    MessageType fileSchema, ReadContext readContext) {
  Map<String, String> metadata = readContext.getReadSupportMetadata();
  MessageType parquetSchema = readContext.getRequestedSchema();
  Schema avroSchema;

  if (metadata.get(AVRO_READ_SCHEMA_METADATA_KEY) != null) {
    // use the Avro read schema provided by the user
    avroSchema = new Schema.Parser().parse(metadata.get(AVRO_READ_SCHEMA_METADATA_KEY));
  } else if (keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY) != null) {
    // use the Avro schema from the file metadata if present
    avroSchema = new Schema.Parser().parse(keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY));
  } else if (keyValueMetaData.get(OLD_AVRO_SCHEMA_METADATA_KEY) != null) {
    // use the Avro schema from the file metadata if present
    avroSchema = new Schema.Parser().parse(keyValueMetaData.get(OLD_AVRO_SCHEMA_METADATA_KEY));
  } else {
    // default to converting the Parquet schema into an Avro schema
    avroSchema = new AvroSchemaConverter(configuration).convert(parquetSchema);
  }

  GenericData model = getDataModel(configuration);
  String compatEnabled = metadata.get(AvroReadSupport.AVRO_COMPATIBILITY);
  if (compatEnabled != null && Boolean.valueOf(compatEnabled)) {
    return newCompatMaterializer(parquetSchema, avroSchema, model);
  }
  return new AvroRecordMaterializer<T>(parquetSchema, avroSchema, model);
}
 
Example 26
Source Project: flink   Source File: RowReadSupport.java    License: Apache License 2.0
@Override
public RecordMaterializer<Row> prepareForRead(
	Configuration configuration, Map<String, String> keyValueMetaData,
	MessageType fileSchema, ReadContext readContext) {
	return new RowMaterializer(readContext.getRequestedSchema(), returnTypeInfo);
}
 
Example 27
@Override
public RecordMaterializer<Group> prepareForRead(Configuration conf, Map<String, String> metaData,
    MessageType schema, ReadContext context) {
  return new GroupRecordConverter(schema);
}
 
Example 28
Source Project: parquet-mr   Source File: SimpleReadSupport.java    License: Apache License 2.0
@Override
public RecordMaterializer<SimpleRecord> prepareForRead(Configuration conf, Map<String, String> metaData, MessageType schema, ReadContext context) {
  return new SimpleRecordMaterializer(schema);
}