Java Code Examples for parquet.hadoop.ParquetFileReader

The following examples show how to use parquet.hadoop.ParquetFileReader. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
/**
 * Reads the schema stored in the footer of the given Parquet file and
 * converts it into a list of input fields.
 *
 * <p>The work runs inside {@code inClassloader(...)}; S3 credentials are
 * applied to the Hadoop configuration and the path is scrubbed through
 * {@code S3NCredentialUtils} before the file is opened.
 *
 * @param file location of the Parquet file
 * @return the fields built from the first footer's schema, or an empty list
 *         when no footers were read
 * @throws Exception if anything fails while resolving or reading the file
 */
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration hadoopConf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, hadoopConf );
    Path target = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileStatus status = FileSystem.get( target.toUri(), hadoopConf ).getFileStatus( target );
    List<Footer> footers = ParquetFileReader.readFooters( hadoopConf, status, true );
    if ( !footers.isEmpty() ) {
      // Only the first footer is consulted for the schema.
      MessageType schema = footers.get( 0 ).getParquetMetadata().getFileMetaData().getSchema();
      return ParquetConverter.buildInputFields( schema );
    }
    return new ArrayList<>();
  } );
}
 
Example 2
Source Project: parquet-tools   Source File: ShowMetaCommand.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Reads the footer of the Parquet file named by the first positional
 * argument and prints its details to standard output.
 *
 * @param options parsed command line; {@code getArgs()[0]} is the input path
 * @throws Exception if the parent command or the footer read fails
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  // The first positional argument is the file whose metadata is shown.
  final String inputFile = options.getArgs()[0];
  final ParquetMetadata footer =
      ParquetFileReader.readFooter(new Configuration(), new Path(inputFile));

  final PrettyPrintWriter printer =
      PrettyPrintWriter.stdoutPrettyPrinter()
          .withAutoColumn()
          .withAutoCrop()
          .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
          .withColumnPadding(1)
          .build();

  MetadataUtils.showDetails(printer, footer);
  printer.flushColumns();
}
 
Example 3
Source Project: rainbow   Source File: LocalParquetEvaluator.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads the Parquet footer of every given file.
 *
 * @param fileStatuses files whose footers are read
 * @param conf Hadoop configuration passed to the footer reader
 * @return one metadata entry per input file, in the same order as {@code fileStatuses}
 * @throws IOException if any footer cannot be read
 */
public static ParquetMetadata[] getMetadatas (FileStatus[] fileStatuses, Configuration conf) throws IOException
{
    ParquetMetadata[] footers = new ParquetMetadata[fileStatuses.length];
    int slot = 0;
    for (FileStatus status : fileStatuses)
    {
        footers[slot++] = ParquetFileReader.readFooter(conf, status.getPath(), NO_FILTER);
    }
    return footers;
}
 
Example 4
/**
 * Builds a {@link DatasetDescriptor} from the schema of the first file found
 * under the input path.
 *
 * <p>The input path (from {@code getInputPath()}) is glob-expanded; the
 * matched directories are listed in order until one yields at least one file
 * accepted by {@code HIDDEN_FILES_PATH_FILTER}. The footer of that first file
 * supplies the Parquet schema, which is converted to an Avro schema.
 *
 * @param job job whose configuration is used to read the footer and to
 *            determine the compression type
 * @param fs file system used to resolve and list the input path
 * @param uri not used by this method
 * @return a descriptor with the converted Avro schema, Parquet format and the
 *         job's compression type
 * @throws IOException if no file is found under the input path, or if the
 *                     footer of the first file cannot be read
 */
private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
    throws IOException {

  ArrayList<FileStatus> files = new ArrayList<FileStatus>();
  FileStatus[] dirs;
  dirs = fs.globStatus(fs.makeQualified(getInputPath()));
  for (int i = 0; (dirs != null && i < dirs.length); i++) {
    files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
    // We only check one file, so exit the loop when we have at least
    // one.
    if (files.size() > 0) {
      break;
    }
  }

  // Fail with a descriptive error instead of an IndexOutOfBoundsException
  // from files.get(0) when the glob matched nothing or every directory was empty.
  if (files.isEmpty()) {
    throw new IOException("No files found under input path " + getInputPath());
  }

  ParquetMetadata parquetMetadata;
  try {
    parquetMetadata =
        ParquetFileReader.readFooter(job.getConfiguration(),
            fs.makeQualified(files.get(0).getPath()));
  } catch (IOException e) {
    LOG.error("Wrong file format. Please check the export file's format.", e);
    throw e;
  }
  MessageType schema = parquetMetadata.getFileMetaData().getSchema();
  Schema avroSchema = new AvroSchemaConverter().convert(schema);
  DatasetDescriptor descriptor =
      new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
          .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
  return descriptor;
}
 
Example 5
Source Project: parquet-tools   Source File: DumpCommand.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Dumps the Parquet file named by the first positional argument to standard
 * output.
 *
 * <p>Option {@code m} switches off the {@code showmd} flag and option
 * {@code d} switches off the {@code showdt} flag passed to {@code dump};
 * option {@code c} restricts the dump to the listed columns.
 *
 * @param options parsed command line; {@code getArgs()[0]} is the input path
 * @throws Exception if the parent command, the footer read or the dump fails
 */
@Override
public void execute(CommandLine options) throws Exception {
    super.execute(options);

    final String inputFile = options.getArgs()[0];

    Configuration conf = new Configuration();
    Path inpath = new Path(inputFile);

    ParquetMetadata footer = ParquetFileReader.readFooter(conf, inpath);
    MessageType schema = footer.getFileMetaData().getSchema();

    PrettyPrintWriter printer =
        PrettyPrintWriter.stdoutPrettyPrinter()
            .withAutoColumn()
            .withAutoCrop()
            .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
            .withColumnPadding(1)
            .withMaxBufferedLines(1000000)
            .withFlushOnTab()
            .build();

    // Both flags are on by default; the presence of the option turns them off.
    boolean showMetadata = !options.hasOption('m');
    boolean showData = !options.hasOption('d');

    // null means "no column filter was requested".
    Set<String> selectedColumns = options.hasOption('c')
        ? new HashSet<String>(Arrays.asList(options.getOptionValues('c')))
        : null;

    dump(printer, footer, schema, inpath, showMetadata, showData, selectedColumns);
}
 
Example 6
Source Project: rainbow   Source File: ParquetFileMetadata.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates the metadata holder by reading the footer of the given Parquet file.
 *
 * @param conf Hadoop configuration passed to the footer reader
 * @param hdfsFilePath path of the Parquet file whose footer is read
 * @throws IOException if the footer cannot be read
 */
public ParquetFileMetadata(Configuration conf, Path hdfsFilePath) throws IOException
{
    // The footer is read with NO_FILTER and kept for later use by this object.
    this.metaData = ParquetFileReader.readFooter(conf, hdfsFilePath, NO_FILTER);
}
 
Example 7
Source Project: rainbow   Source File: LocalParquetEvaluator.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Opens a reader on every given file for the requested columns, reads all row
 * groups, and reports counts and elapsed time.
 *
 * <p>The column list placed in the result is built from the first file only.
 * The elapsed time starts after all readers are opened and covers reading and
 * closing them.
 *
 * <p>Every opened reader is closed even when opening a later reader or reading
 * a row group fails, so a failure no longer leaks the remaining open readers.
 *
 * @param fileStatuses files to read
 * @param metadatas footer metadata, one entry per file at the same index
 * @param columnNames names of the columns to read; lower-cased before schema lookup
 * @param conf Hadoop configuration passed to each reader
 * @return metrics holding the columns, reader count, row group count, row count
 *         and elapsed milliseconds
 * @throws IOException if a reader cannot be opened, read or closed
 */
public static LocalMetrics execute (FileStatus[] fileStatuses, ParquetMetadata[] metadatas, String[] columnNames, Configuration conf) throws IOException
{
    boolean printColumns = true;
    List<ParquetFileReader> readers = new ArrayList<ParquetFileReader>();
    List<Column> columns = new ArrayList<Column>();
    // Number of leading readers whose close() has already been attempted.
    int closedCount = 0;
    try
    {
        for (int i = 0; i < fileStatuses.length; ++i)
        {
            FileStatus status = fileStatuses[i];
            ParquetMetadata metadata = metadatas[i];

            MessageType schema = metadata.getFileMetaData().getSchema();

            List<ColumnDescriptor> columnDescriptors = new ArrayList<ColumnDescriptor>();

            for (String columnName : columnNames)
            {
                int fieldIndex = schema.getFieldIndex(columnName.toLowerCase());
                ColumnDescriptor descriptor = schema.getColumns().get(fieldIndex);

                columnDescriptors.add(descriptor);

                // Column details are collected once, from the first file only.
                if (printColumns)
                {
                    Column column = new Column();
                    column.setIndex(fieldIndex);
                    column.setName(schema.getFieldName(column.getIndex()));
                    column.setDescriptor(descriptor);
                    columns.add(column);
                }
            }
            printColumns = false;

            readers.add(new ParquetFileReader(conf, status.getPath(), metadata.getBlocks(), columnDescriptors));
        }

        long time  = System.currentTimeMillis();
        long rowCount = 0;
        long rowGroupCount = 0;
        long readerCount = readers.size();
        for (ParquetFileReader reader : readers)
        {
            // Mark before reading so the cleanup below never closes this reader twice.
            closedCount ++;
            try
            {
                PageReadStore pageReadStore;
                while ((pageReadStore = reader.readNextRowGroup()) != null)
                {
                    rowGroupCount ++;
                    rowCount += pageReadStore.getRowCount();
                }
            }
            finally
            {
                reader.close();
            }
        }
        LocalMetrics metrics = new LocalMetrics(columns, readerCount, rowGroupCount, rowCount, System.currentTimeMillis()-time);
        return metrics;
    }
    finally
    {
        // On success closedCount equals readers.size() and this loop does nothing.
        // On failure it releases the readers that were opened but never processed.
        for (int i = closedCount; i < readers.size(); ++i)
        {
            try
            {
                readers.get(i).close();
            }
            catch (IOException ignored)
            {
                // Best-effort cleanup: the exception already in flight is the one to report.
            }
        }
    }
}
 
Example 8
Source Project: parquet-examples   Source File: TestReadWriteParquet.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Configures and runs a map-only job that reads a Parquet input and writes it
 * back out as Parquet, optionally with a different compression codec.
 *
 * <p>The schema is taken from the footer of the first regular file found
 * (recursively) under the input path and is registered with
 * {@code GroupWriteSupport} before the job is created.
 *
 * @param args {@code INPUTFILE OUTPUTFILE [compression]}; compression is
 *             matched case-insensitively against "snappy" and "gzip", and any
 *             other value (or none) selects uncompressed output
 * @return 0 when the job succeeds; 1 on a usage error, when no input file is
 *         found, or when the job fails
 * @throws Exception if the file system, footer read or job submission fails
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        LOG.error("Usage: " + getClass().getName() + " INPUTFILE OUTPUTFILE [compression]");
        return 1;
    }
    String inputFile = args[0];
    String outputFile = args[1];
    String compression = (args.length > 2) ? args[2] : "none";

    Path parquetFilePath = null;
    // Find a file in case a directory was passed
    RemoteIterator<LocatedFileStatus> it = FileSystem.get(getConf()).listFiles(new Path(inputFile), true);
    while (it.hasNext()) {
        FileStatus fs = it.next();
        if (fs.isFile()) {
            parquetFilePath = fs.getPath();
            break;
        }
    }
    if (parquetFilePath == null) {
        LOG.error("No file found for " + inputFile);
        return 1;
    }
    LOG.info("Getting schema from " + parquetFilePath);
    ParquetMetadata readFooter = ParquetFileReader.readFooter(getConf(), parquetFilePath);
    MessageType schema = readFooter.getFileMetaData().getSchema();
    LOG.info(schema);
    // The schema is set on the configuration before the Job is built from it.
    GroupWriteSupport.setSchema(schema, getConf());

    Job job = new Job(getConf());
    job.setJarByClass(getClass());
    job.setJobName(getClass().getName());
    job.setMapperClass(ReadRequestMap.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(ExampleInputFormat.class);
    job.setOutputFormatClass(ExampleOutputFormat.class);

    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    if (compression.equalsIgnoreCase("snappy")) {
        codec = CompressionCodecName.SNAPPY;
    } else if (compression.equalsIgnoreCase("gzip")) {
        codec = CompressionCodecName.GZIP;
    }
    LOG.info("Output compression: " + codec);
    ExampleOutputFormat.setCompression(job, codec);

    FileInputFormat.setInputPaths(job, new Path(inputFile));
    FileOutputFormat.setOutputPath(job, new Path(outputFile));

    // Report job failure to the caller instead of always returning success.
    return job.waitForCompletion(true) ? 0 : 1;
}