Java Code Examples for org.apache.parquet.hadoop.api.ReadSupport

The following examples show how to use org.apache.parquet.hadoop.api.ReadSupport. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: tajo   Source File: InternalParquetRecordReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Prepares this record reader for one Parquet file: builds the read context,
 * the record converter and the underlying file reader, and adds the row counts
 * of the given blocks to the running total.
 *
 * @param parquetFileMetadata footer metadata of the file being read
 * @param file path of the file being read
 * @param blocks blocks to read; their row counts are added to {@code total}
 * @param configuration configuration handed to the read support and the file reader
 * @throws IOException declared for failures while setting up the file reader
 */
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // The file schema has to be assigned before the InitContext is created; otherwise the
  // read support is handed whatever value the field held before this call.
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;

  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);

  // only the columns of the requested schema are handed to the file reader
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
 
Example 2
Source Project: tajo   Source File: ParquetReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds a reader over the entries listed under {@code file} (filtered through
 * {@code HiddenFileFilter.INSTANCE}), loads their footers and sums up the row
 * counts of all blocks.
 *
 * @param conf configuration used to resolve the file system and read the footers
 * @param file path whose entries are read
 * @param readSupport read support kept for later use
 * @param filter record filter; must not be null
 * @throws IOException if listing the path or reading the footers fails
 */
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  // list the path and read every footer in one go
  FileSystem fileSystem = file.getFileSystem(conf);
  FileStatus[] listed = fileSystem.listStatus(file, HiddenFileFilter.INSTANCE);
  List<Footer> allFooters =
      ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, Arrays.asList(listed), false);
  this.footersIterator = allFooters.iterator();

  // accumulate the row count of every block of every footer
  for (Footer current : allFooters) {
    for (BlockMetaData blockMeta : current.getParquetMetadata().getBlocks()) {
      totalRowCount += blockMeta.getRowCount();
    }
  }
}
 
Example 3
Source Project: parquet-mr   Source File: InternalParquetRecordReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Binds this record reader to an already opened file reader and derives the
 * read context, requested schema, record converter and record total from it.
 *
 * @param reader opened Parquet file reader
 * @param configuration configuration handed to the read support
 * @throws IOException declared by the signature
 */
public void initialize(ParquetFileReader reader, Configuration configuration)
    throws IOException {
  this.reader = reader;

  // footer information of the file
  FileMetaData footerMetaData = reader.getFooter().getFileMetaData();
  this.fileSchema = footerMetaData.getSchema();
  Map<String, String> keyValues = footerMetaData.getKeyValueMetaData();

  // ask the read support for a ReadContext for this file
  InitContext initContext =
      new InitContext(configuration, toSetMultiMap(keyValues), fileSchema);
  ReadSupport.ReadContext readContext = readSupport.init(initContext);
  this.columnIOFactory = new ColumnIOFactory(footerMetaData.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();

  // The projection must be set on the reader before anything filter-related runs
  // (such as the filtered record count below) because projection impacts filtering.
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter =
      readSupport.prepareForRead(configuration, keyValues, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
 
Example 4
Source Project: parquet-mr   Source File: ThriftReadSupport.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Resolves the Thrift class and metadata for the file and creates the
 * configured record converter.
 *
 * @param configuration source of the converter class name
 * @param keyValueMetaData file metadata from which Thrift metadata is extracted
 * @param fileSchema schema of the file (not used by this method)
 * @param readContext provides the requested schema
 * @return the record converter instance
 */
@Override
public RecordMaterializer<T> prepareForRead(Configuration configuration,
    Map<String, String> keyValueMetaData, MessageType fileSchema,
    org.apache.parquet.hadoop.api.ReadSupport.ReadContext readContext) {
  ThriftMetaData metaFromFile = ThriftMetaData.fromExtraMetaData(keyValueMetaData);
  try {
    initThriftClass(metaFromFile, configuration);
  } catch (ClassNotFoundException e) {
    throw new RuntimeException("Cannot find Thrift object class for metadata: " + metaFromFile, e);
  }

  // when the file carried no metadata, derive it from the requested class instead
  ThriftMetaData effectiveMeta = (metaFromFile != null)
      ? metaFromFile
      : ThriftMetaData.fromThriftClass(thriftClass);

  return getRecordConverterInstance(
      configuration.get(RECORD_CONVERTER_CLASS_KEY, RECORD_CONVERTER_DEFAULT),
      thriftClass,
      readContext.getRequestedSchema(),
      effectiveMeta.getDescriptor(),
      configuration);
}
 
Example 5
Source Project: iow-hadoop-streaming   Source File: GroupReadSupport.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds the read context from a partial (projection) schema.
 *
 * The schema text is read from the file named by {@code PARQUET_READ_SCHEMA_FILE}
 * when that setting is non-empty; otherwise it is taken from
 * {@code ReadSupport.PARQUET_READ_SCHEMA}.
 *
 * @param configuration source of the schema settings
 * @param keyValueMetaData file metadata (not used by this method)
 * @param fileSchema schema of the file being read
 * @return a read context for the schema returned by {@code getSchemaForRead}
 * @throws RuntimeException if the schema file cannot be read
 */
@Override
public ReadContext init(
        Configuration configuration, Map<String, String> keyValueMetaData,
        MessageType fileSchema) {

    String partialSchemaString;
    String partialSchemaFile = configuration.get(PARQUET_READ_SCHEMA_FILE, "");
    if (!partialSchemaFile.isEmpty()) {
        StringBuilder r = new StringBuilder();
        // try-with-resources closes the reader on success and on failure alike
        try (BufferedReader br = new BufferedReader(new FileReader(new File(partialSchemaFile)))) {
            String line;
            // lines are appended without a separator, exactly as before
            while ((line = br.readLine()) != null) {
                r.append(line);
            }
        } catch (Exception e) {
            // keep the original message but chain the cause so the stack trace is not lost
            throw new RuntimeException(
                    "Can't read schema from file " + partialSchemaFile + ": " + e.getMessage(), e);
        }

        partialSchemaString = r.toString();
    } else {
        partialSchemaString = configuration.get(ReadSupport.PARQUET_READ_SCHEMA);
    }

    return new ReadContext(getSchemaForRead(fileSchema, partialSchemaString));
}
 
Example 6
/**
 * Creates a record reader for the given split, running the setup through
 * {@code inClassloader}.
 *
 * @param split split to read; cast to {@code PentahoInputSplitImpl}
 * @return a reader wrapping the initialized native Parquet record reader
 * @throws Exception declared by the signature
 */
@Override
public IPentahoRecordReader createRecordReader( IPentahoInputSplit split ) throws Exception {
  return inClassloader( () -> {
    // unwrap the native split carried by the Pentaho split
    InputSplit hadoopSplit = ( (PentahoInputSplitImpl) split ).getInputSplit();

    ParquetRecordReader<RowMetaAndData> parquetReader = new ParquetRecordReader<>(
      new PentahoParquetReadSupport(), ParquetInputFormat.getFilter( job.getConfiguration() ) );

    // the native reader needs a task context before it can be used
    TaskAttemptContextImpl attemptContext =
      new TaskAttemptContextImpl( job.getConfiguration(), new TaskAttemptID() );
    parquetReader.initialize( hadoopSplit, attemptContext );

    return new PentahoParquetRecordReader( parquetReader );
  } );
}
 
Example 7
Source Project: flink   Source File: ParquetRecordReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Binds this record reader to an opened file reader and prepares the record
 * materializer using the read schema.
 *
 * @param reader opened Parquet file reader
 * @param configuration configuration handed to the read support
 */
public void initialize(ParquetFileReader reader, Configuration configuration) {
	this.reader = reader;

	FileMetaData footerMetadata = reader.getFooter().getFileMetaData();
	// schema as stored in the parquet file itself
	this.fileSchema = footerMetadata.getSchema();
	Map<String, String> keyValueMetadata = footerMetadata.getKeyValueMetaData();

	// note: the read schema, not the file schema, is passed to the read support
	InitContext initContext =
		new InitContext(configuration, toSetMultiMap(keyValueMetadata), readSchema);
	ReadSupport.ReadContext readContext = readSupport.init(initContext);

	this.columnIOFactory = new ColumnIOFactory(footerMetadata.getCreatedBy());
	this.recordMaterializer =
		readSupport.prepareForRead(configuration, keyValueMetadata, readSchema, readContext);
	this.numTotalRecords = reader.getRecordCount();
}
 
Example 8
Source Project: tajo   Source File: TajoReadSupport.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Initializes the ReadSupport by converting the requested schema into a
 * Parquet schema.
 *
 * @param context The InitContext (not consulted by this implementation).
 * @return A ReadContext carrying the converted projection schema.
 * @throws RuntimeException if no requested schema has been set.
 */
@Override
public ReadSupport.ReadContext init(InitContext context) {
  if (requestedSchema == null) {
    throw new RuntimeException("requestedSchema is null.");
  }
  TajoSchemaConverter converter = new TajoSchemaConverter();
  MessageType projection = converter.convert(requestedSchema);
  LOG.debug("Reading data with projection:\n" + projection);
  return new ReadContext(projection);
}
 
Example 9
Source Project: flink   Source File: ParquetRecordReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Attaches the given file reader, records the file's own schema, and sets up
 * the record materializer for the read schema.
 *
 * @param reader opened Parquet file reader
 * @param configuration configuration handed to the read support
 */
public void initialize(ParquetFileReader reader, Configuration configuration) {
	this.reader = reader;

	FileMetaData fileMeta = reader.getFooter().getFileMetaData();
	// the schema physically present in the parquet file
	this.fileSchema = fileMeta.getSchema();
	Map<String, String> extraMetadata = fileMeta.getKeyValueMetaData();

	// the read support is initialized with the read schema
	ReadSupport.ReadContext context = readSupport.init(
		new InitContext(configuration, toSetMultiMap(extraMetadata), readSchema));

	this.columnIOFactory = new ColumnIOFactory(fileMeta.getCreatedBy());
	this.recordMaterializer =
		readSupport.prepareForRead(configuration, extraMetadata, readSchema, context);
	this.numTotalRecords = reader.getRecordCount();
}
 
Example 10
Source Project: parquet-mr   Source File: FilteringBenchmarks.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a reader builder for {@code Group} records over {@code file}, sets
 * the example schema on it and passes it through the read configurator.
 *
 * @return the builder returned by the read configurator
 * @throws IOException if the input file cannot be resolved
 */
public ParquetReader.Builder<Group> createReaderBuilder() throws IOException {
  ReadConfigurator configurator = getReadConfigurator();

  // anonymous builder that supplies a GroupReadSupport
  ParquetReader.Builder<Group> groupBuilder =
      new ParquetReader.Builder<Group>(HadoopInputFile.fromPath(file, new Configuration())) {
        @Override
        protected ReadSupport<Group> getReadSupport() {
          return new GroupReadSupport();
        }
      };

  return configurator.configureBuilder(
      groupBuilder.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, SCHEMA.toString()));
}
 
Example 11
Source Project: parquet-mr   Source File: GroupReadSupport.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds the read context from the partial schema configured under
 * {@code ReadSupport.PARQUET_READ_SCHEMA}.
 *
 * @param configuration source of the partial schema string
 * @param keyValueMetaData file metadata (not used by this method)
 * @param fileSchema schema of the file being read
 * @return a read context for the schema returned by {@code getSchemaForRead}
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(
    Configuration configuration, Map<String, String> keyValueMetaData,
    MessageType fileSchema) {
  return new ReadContext(
      getSchemaForRead(fileSchema, configuration.get(ReadSupport.PARQUET_READ_SCHEMA)));
}
 
Example 12
Source Project: parquet-mr   Source File: InternalParquetRecordReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Binds this record reader to an opened file reader using generic read options.
 *
 * A {@code Configuration} is obtained (the one carried by the options when they
 * are {@code HadoopReadOptions}, a new one otherwise), every option property is
 * copied into it, and it is then passed to the read support.
 *
 * @param reader opened Parquet file reader
 * @param options read options supplying properties and feature switches
 */
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom properties into the Configuration that the ReadSupport will see
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String key : options.getPropertyNames()) {
    conf.set(key, options.getProperty(key));
  }

  // footer information of the file
  this.reader = reader;
  FileMetaData footerMetaData = reader.getFooter().getFileMetaData();
  this.fileSchema = footerMetaData.getSchema();
  Map<String, String> keyValues = footerMetaData.getKeyValueMetaData();

  // ask the read support for a ReadContext for this file
  InitContext initContext = new InitContext(conf, toSetMultiMap(keyValues), fileSchema);
  ReadSupport.ReadContext readContext = readSupport.init(initContext);
  this.columnIOFactory = new ColumnIOFactory(footerMetaData.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();

  // The projection must be set on the reader before anything filter-related runs
  // (such as the filtered record count below) because projection impacts filtering.
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(conf, keyValues, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
 
Example 13
Source Project: parquet-mr   Source File: ParquetReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Convenience constructor for a single Hadoop path: wraps the path in an input
 * file, builds Hadoop read options with the given record filter and delegates
 * to the list-based constructor.
 *
 * @param conf configuration used for the input file and the read options
 * @param file path of the file to read
 * @param readSupport read support passed through to the delegate constructor
 * @param filter record filter; must not be null
 * @throws IOException if the input file cannot be resolved
 */
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      FilterCompat.Filter filter) throws IOException {
  this(
      Collections.<InputFile>singletonList(HadoopInputFile.fromPath(file, conf)),
      HadoopReadOptions
          .builder(conf)
          .withRecordFilter(Objects.requireNonNull(filter, "filter cannot be null"))
          .build(),
      readSupport);
}
 
Example 14
Source Project: parquet-mr   Source File: ParquetReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Stores the read options and read support and positions an iterator at the
 * start of the given input files.
 *
 * @param files input files to iterate over
 * @param options read options kept for later use
 * @param readSupport read support kept for later use
 * @throws IOException declared by the signature
 */
private ParquetReader(List<InputFile> files,
                      ParquetReadOptions options,
                      ReadSupport<T> readSupport) throws IOException {
  this.filesIterator = files.iterator();
  this.options = options;
  this.readSupport = readSupport;
}
 
Example 15
Source Project: parquet-mr   Source File: ParquetReader.java    License: Apache License 2.0 5 votes vote down vote up
@Deprecated
private Builder(ReadSupport<T> readSupport, Path path) {
  this.readSupport = Objects.requireNonNull(readSupport, "readSupport cannot be null");
  this.file = null;
  this.path = Objects.requireNonNull(path, "path cannot be null");
  this.conf = new Configuration();
  this.optionsBuilder = HadoopReadOptions.builder(conf);
}
 
Example 16
Source Project: parquet-mr   Source File: ParquetInputFormat.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * The read support and the filter are both obtained from the configuration of
 * the task attempt context.
 */
@Override
public RecordReader<Void, T> createRecordReader(
    InputSplit inputSplit,
    TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
  final Configuration conf = ContextUtil.getConfiguration(taskAttemptContext);
  return new ParquetRecordReader<T>(getReadSupport(conf), getFilter(conf));
}
 
Example 17
Source Project: parquet-mr   Source File: ParquetInputFormat.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Instantiates the read support, preferring the class held in
 * {@code readSupportClass} and falling back to the class named in the
 * configuration when that field is null.
 *
 * @param configuration to find the configuration for the read support
 * @return the configured read support
 * @deprecated use getReadSupportInstance static methods instead
 */
@Deprecated
@SuppressWarnings("unchecked")
ReadSupport<T> getReadSupport(Configuration configuration){
  Class<? extends ReadSupport<T>> supportClass = readSupportClass;
  if (supportClass == null) {
    // nothing set on this instance: look the class up in the configuration
    supportClass = (Class<? extends ReadSupport<T>>) getReadSupportClass(configuration);
  }
  return getReadSupportInstance(supportClass);
}
 
Example 18
Source Project: parquet-mr   Source File: ParquetInputFormat.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Instantiates the given read support class through its no-argument constructor.
 *
 * @param readSupportClass to instantiate
 * @param <T> the Java type of objects created by the ReadSupport
 * @return the configured read support
 * @throws BadConfigurationException if the class has no usable no-argument
 *         constructor or the constructor itself fails
 */
@SuppressWarnings("unchecked")
static <T> ReadSupport<T> getReadSupportInstance(
    Class<? extends ReadSupport<T>> readSupportClass){
  try {
    // Class.newInstance() is deprecated: it rethrows checked exceptions from the
    // constructor without declaring them. Going through the Constructor wraps them
    // in InvocationTargetException, which is reported like any other failure here.
    return readSupportClass.getDeclaredConstructor().newInstance();
  } catch (ReflectiveOperationException e) {
    throw new BadConfigurationException("could not instantiate read support class", e);
  }
}
 
Example 19
Source Project: parquet-mr   Source File: GroupReadSupportTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Without a configured read schema, the requested schema of the read context
 * is expected to equal the file schema.
 */
@Test
public void testInitWithoutSpecifyingRequestSchema() throws Exception {
  GroupReadSupport s = new GroupReadSupport();
  Configuration configuration = new Configuration();
  Map<String, String> keyValueMetaData = new HashMap<String, String>();
  MessageType fileSchema = MessageTypeParser.parseMessageType(fullSchemaStr);

  ReadSupport.ReadContext context = s.init(configuration, keyValueMetaData, fileSchema);
  // expected value first, actual second, so a failure message reads correctly
  assertEquals(fileSchema, context.getRequestedSchema());
}
 
Example 20
Source Project: parquet-mr   Source File: GroupReadSupportTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * With a partial schema configured under {@code ReadSupport.PARQUET_READ_SCHEMA},
 * the requested schema of the read context is expected to equal that partial schema.
 */
@Test
public void testInitWithPartialSchema() {
  GroupReadSupport s = new GroupReadSupport();
  Configuration configuration = new Configuration();
  Map<String, String> keyValueMetaData = new HashMap<String, String>();
  MessageType fileSchema = MessageTypeParser.parseMessageType(fullSchemaStr);
  MessageType partialSchema = MessageTypeParser.parseMessageType(partialSchemaStr);
  configuration.set(ReadSupport.PARQUET_READ_SCHEMA, partialSchemaStr);

  ReadSupport.ReadContext context = s.init(configuration, keyValueMetaData, fileSchema);
  // expected value first, actual second, so a failure message reads correctly
  assertEquals(partialSchema, context.getRequestedSchema());
}
 
Example 21
Source Project: parquet-mr   Source File: TestColumnIndexFiltering.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads users from {@code file} through a {@code GroupReadSupport}-based reader
 * with the given filter and projection schema.
 *
 * @param filter filter applied to the reader
 * @param schema projection; its string form is set as {@code ReadSupport.PARQUET_READ_SCHEMA}
 * @param useOtherFiltering switches dictionary, stats and record filtering together
 * @param useColumnIndexFilter switches column index filtering
 * @return the users returned by {@code PhoneBookWriter.readUsers}
 * @throws IOException declared by the signature
 */
private List<User> readUsersWithProjection(Filter filter, MessageType schema, boolean useOtherFiltering, boolean useColumnIndexFilter) throws IOException {
  return PhoneBookWriter.readUsers(ParquetReader.builder(new GroupReadSupport(), file)
      .withFilter(filter)
      // the three non-column-index filters share a single flag
      .useDictionaryFilter(useOtherFiltering)
      .useStatsFilter(useOtherFiltering)
      .useRecordFilter(useOtherFiltering)
      .useColumnIndexFilter(useColumnIndexFilter)
      .set(ReadSupport.PARQUET_READ_SCHEMA, schema.toString()));
}
 
Example 22
Source Project: parquet-mr   Source File: DeprecatedInputFormatTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Runs two jobs back to back: a write job (new API) that turns the text input
 * into Parquet using the given codec, then a read job (old {@code mapred} API)
 * that reads the Parquet output back with {@code GroupReadSupport}.
 *
 * @param codec compression codec for the write job
 */
private void runMapReduceJob(CompressionCodecName codec) throws IOException, ClassNotFoundException, InterruptedException {
  // remove whatever is already at the two output locations
  final FileSystem fs = parquetPath.getFileSystem(conf);
  fs.delete(parquetPath, true);
  fs.delete(outputPath, true);

  // first job: text input -> parquet output
  writeJob = new Job(conf, "write");
  TextInputFormat.addInputPath(writeJob, inputPath);
  writeJob.setInputFormatClass(TextInputFormat.class);
  writeJob.setNumReduceTasks(0);
  ExampleOutputFormat.setCompression(writeJob, codec);
  ExampleOutputFormat.setOutputPath(writeJob, parquetPath);
  writeJob.setOutputFormatClass(ExampleOutputFormat.class);
  writeJob.setMapperClass(ReadMapper.class);
  ExampleOutputFormat.setSchema(writeJob, MessageTypeParser.parseMessageType(writeSchema));
  writeJob.submit();
  waitForJob(writeJob);

  // second job: parquet input -> text output
  jobConf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
  jobConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, GroupReadSupport.class.getCanonicalName());
  jobConf.setInputFormat(MyDeprecatedInputFormat.class);
  MyDeprecatedInputFormat.setInputPaths(jobConf, parquetPath);
  jobConf.setOutputFormat(org.apache.hadoop.mapred.TextOutputFormat.class);
  org.apache.hadoop.mapred.TextOutputFormat.setOutputPath(jobConf, outputPath);
  jobConf.setMapperClass(DeprecatedWriteMapper.class);
  jobConf.setNumReduceTasks(0);
  mapRedJob = JobClient.runJob(jobConf);
}
 
Example 23
Source Project: parquet-mr   Source File: ThriftReadSupport.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Determines the schema to read, from either a partial schema string or a
 * field projection filter; supplying both is rejected.
 *
 * @param context provides the configuration, file schema and key/value metadata
 * @return a read context for the schema returned by {@code getSchemaForRead}
 * @throws ThriftProjectionException if both a partial schema and a projection
 *         filter are configured, or the Thrift class cannot be found
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
  final Configuration configuration = context.getConfiguration();
  final MessageType fileMessageType = context.getFileSchema();
  MessageType requestedProjection = fileMessageType;
  String partialSchemaString = configuration.get(ReadSupport.PARQUET_READ_SCHEMA);

  FieldProjectionFilter projectionFilter = getFieldProjectionFilter(configuration);

  if (partialSchemaString != null && projectionFilter != null) {
    // the two sentences are concatenated, so the first needs a trailing space
    throw new ThriftProjectionException(
        String.format("You cannot provide both a partial schema and field projection filter. "
                + "Only one of (%s, %s, %s) should be set.",
            PARQUET_READ_SCHEMA, STRICT_THRIFT_COLUMN_FILTER_KEY, THRIFT_COLUMN_FILTER_KEY));
  }

  // set requestedProjection only when one of the two mechanisms is specified;
  // otherwise the whole file schema is requested
  if (partialSchemaString != null) {
    requestedProjection = getSchemaForRead(fileMessageType, partialSchemaString);
  } else if (projectionFilter != null) {
    try {
      initThriftClassFromMultipleFiles(context.getKeyValueMetadata(), configuration);
      requestedProjection = getProjectedSchema(projectionFilter);
    } catch (ClassNotFoundException e) {
      throw new ThriftProjectionException("can not find thriftClass from configuration", e);
    }
  }

  MessageType schemaForRead = getSchemaForRead(fileMessageType, requestedProjection);
  return new ReadContext(schemaForRead);
}
 
Example 24
Source Project: parquet-mr   Source File: ThriftParquetReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds the reader, passing the Thrift class to the read support only when
 * one was supplied.
 *
 * @return the reader built from the file, configuration and filter
 * @throws IOException if building the reader fails
 */
public ParquetReader<T> build() throws IOException {
  ReadSupport<T> readSupport = (thriftClass == null)
      ? new ThriftReadSupport<T>()
      : new ThriftReadSupport<T>(thriftClass);

  return ParquetReader.builder(readSupport, file)
      .withConf(conf)
      .withFilter(filter)
      .build();
}
 
Example 25
/**
 * Writes an address book with one fully populated person and expects that,
 * under a projection schema listing only name and id, the email and phone
 * fields read back as null.
 */
@Test
public void testThriftOptionalFieldsWithReadProjectionUsingParquetSchema() throws Exception {
  // projection: keep only persons.name.{first_name,last_name} and persons.id
  Configuration conf = new Configuration();
  final String projection = "message AddressBook {\n" +
          "  optional group persons {\n" +
          "    repeated group persons_tuple {\n" +
          "      required group name {\n" +
          "        optional binary first_name;\n" +
          "        optional binary last_name;\n" +
          "      }\n" +
          "      optional int32 id;\n" +
          "    }\n" +
          "  }\n" +
          "}";
  conf.set(ReadSupport.PARQUET_READ_SCHEMA, projection);

  // record as written: every field populated
  Person fullPerson = new Person(
          new Name("Bob", "Roberts"),
          0,
          "[email protected]",
          Arrays.asList(new PhoneNumber("1234567890")));
  TBase written = new AddressBook(Arrays.asList(fullPerson));

  // record as expected back: fields outside the projection are null
  Person projectedPerson = new Person(
          new Name("Bob", "Roberts"),
          0,
          null,
          null);
  TBase expected = new AddressBook(Arrays.asList(projectedPerson));

  shouldDoProjection(conf, written, expected, AddressBook.class);
}
 
Example 26
Source Project: parquet-mr   Source File: DataWritableReadSupport.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates the Parquet-side read context with the requested schema during the
 * init phase.
 *
 * When no column list is configured, the file schema is used as is. Otherwise a
 * table schema is assembled from the configured columns and narrowed to the
 * column indexes that were requested.
 *
 * @param configuration needed to get the wanted columns
 * @param keyValueMetaData // unused
 * @param fileSchema parquet file schema
 * @return the parquet ReadContext
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration,
    final Map<String, String> keyValueMetaData, final MessageType fileSchema) {
  final String columns = configuration.get(IOConstants.COLUMNS);
  final Map<String, String> contextMetadata = new HashMap<String, String>();

  // no column list configured: read the file with its own schema
  if (columns == null) {
    contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString());
    return new ReadContext(fileSchema, contextMetadata);
  }

  final List<String> columnNames = getColumns(columns);

  // table schema: one type per configured column
  final List<Type> tableTypes = new ArrayList<Type>();
  for (final String name : columnNames) {
    // the column list contains partition columns which are metadata only;
    // a column missing from the file becomes an optional binary (schema evolution)
    final Type columnType = fileSchema.containsField(name)
        ? fileSchema.getType(name)
        : new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, name);
    tableTypes.add(columnType);
  }
  final MessageType tableSchema = new MessageType(TABLE_SCHEMA, tableTypes);
  contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString());

  // requested schema: only the column indexes asked for
  final List<Integer> wantedIndexes = ColumnProjectionUtils.getReadColumnIDs(configuration);
  final List<Type> wantedTypes = new ArrayList<Type>();
  for (final Integer idx : wantedIndexes) {
    wantedTypes.add(tableSchema.getType(columnNames.get(idx)));
  }
  final MessageType requestedSchemaByUser = resolveSchemaAccess(
      new MessageType(fileSchema.getName(), wantedTypes), fileSchema, configuration);

  return new ReadContext(requestedSchemaByUser, contextMetadata);
}
 
Example 27
Source Project: parquet-mr   Source File: DataWritableReadSupport.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates the Hive record converter used to interpret Parquet data as Hive data.
 *
 * The Hive table schema is taken from the read support metadata of the read
 * context, parsed and passed through {@code resolveSchemaAccess}.
 *
 * @param configuration passed to {@code resolveSchemaAccess}
 * @param keyValueMetaData string map of metadata (not used by this method)
 * @param fileSchema passed to {@code resolveSchemaAccess}
 * @param readContext containing the requested schema and the schema of the hive table
 * @return Record Materialize for Hive
 * @throws IllegalStateException if the read context carries no read support metadata
 */
@Override
public RecordMaterializer<ArrayWritable> prepareForRead(final Configuration configuration,
    final Map<String, String> keyValueMetaData, final MessageType fileSchema,
        final org.apache.parquet.hadoop.api.ReadSupport.ReadContext readContext) {
  final Map<String, String> supportMetadata = readContext.getReadSupportMetadata();
  if (supportMetadata == null) {
    throw new IllegalStateException("ReadContext not initialized properly. " +
      "Don't know the Hive Schema.");
  }

  final MessageType hiveSchema =
      MessageTypeParser.parseMessageType(supportMetadata.get(HIVE_SCHEMA_KEY));
  final MessageType tableSchema = resolveSchemaAccess(hiveSchema, fileSchema, configuration);

  return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema);
}
 
Example 28
Source Project: parquet-mr   Source File: AvroParquetReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Sets the Avro compatibility flag on the configuration and returns an Avro
 * read support for the model. In reflect mode the flag is always false;
 * otherwise it follows {@code enableCompatibility}.
 *
 * @return a new Avro read support for {@code model}
 */
@Override
protected ReadSupport<T> getReadSupport() {
  conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, !isReflect && enableCompatibility);
  return new AvroReadSupport<T>(model);
}
 
Example 29
/**
 * Wraps a new-API Parquet record reader behind the old-API split and job
 * configuration, and reads the first record eagerly.
 *
 * @param newInputFormat new-API input format (not used by this constructor)
 * @param oldSplit old-API split; cast to {@code StreamingParquetInputSplitWrapper}
 * @param oldJobConf job configuration; the input file path is published into it
 * @param reporter reporter passed on to the real reader
 * @throws IOException if the reader fails or the thread is interrupted
 */
public TextRecordReaderWrapper(ParquetInputFormat<SimpleGroup> newInputFormat,
                           InputSplit oldSplit,
                           JobConf oldJobConf,
                           Reporter reporter) throws IOException {

    splitLen = oldSplit.getLength();

    try {
        ReadSupport<SimpleGroup> rs = ParquetInputFormat.getReadSupportInstance(oldJobConf);
        realReader = new ParquetRecordReader<>(rs);
        realReader.initialize(((StreamingParquetInputSplitWrapper)oldSplit).realSplit, oldJobConf, reporter);

        // publish the input file path under both property names
        String inputFile = ((StreamingParquetInputSplitWrapper)oldSplit).realSplit.getPath().toString();
        oldJobConf.set("map.input.file", inputFile);
        oldJobConf.set("mapreduce.map.input.file", inputFile);

        // read once to gain access to key and value objects
        if (realReader.nextKeyValue()) {

          firstRecord = true;
          valueContainer = new Container<>();
          SimpleGroup v = realReader.getCurrentValue();
          valueContainer.set(v);
          ls = groupToStrings(v);
        } else {

          eof = true;
        }
    } catch (InterruptedException e) {
        // Restore the interrupt status for callers further up the stack;
        // Thread.interrupted() would clear it instead.
        Thread.currentThread().interrupt();
        throw new IOException(e);
    }
}
 
Example 30
Source Project: flink   Source File: ParquetRecordReader.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a record reader from its three mandatory collaborators.
 *
 * @param readSupport read support; must not be null
 * @param readSchema schema to read with; must not be null
 * @param filter filter; must not be null
 */
public ParquetRecordReader(ReadSupport<T> readSupport, MessageType readSchema, Filter filter) {
	// each message now names the argument it actually checks
	this.filter = checkNotNull(filter, "filter");
	this.readSupport = checkNotNull(readSupport, "readSupport");
	this.readSchema = checkNotNull(readSchema, "readSchema");
}