parquet.hadoop.ParquetFileReader Java Examples

The following examples show how to use parquet.hadoop.ParquetFileReader. Each example is taken from an open-source project; the source file, originating project, and license are noted above the code.
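Most of the examples below revolve around the same core pattern: open a Parquet file's footer and pull the schema out of its metadata. A minimal, self-contained sketch of that pattern follows (the class name is illustrative and the file path is taken from the command line):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;

public class FooterSchemaSketch {
  public static void main(String[] args) throws Exception {
    // args[0] names a Parquet file on the default Hadoop filesystem.
    Configuration conf = new Configuration();
    Path path = new Path(args[0]);
    // readFooter parses only the file footer; no data pages are touched.
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, path);
    MessageType schema = footer.getFileMetaData().getSchema();
    System.out.println(schema);
  }
}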
Example #1
Source File: PentahoTwitterInputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration conf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, conf );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), conf );
    FileStatus fileStatus = fs.getFileStatus( filePath );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.buildInputFields( schema );
    }
  } );
}
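A note on the readFooters call above: it returns one Footer per Parquet file found under the given FileStatus, and the trailing boolean is the skipRowGroups flag, which skips per-row-group metadata when parsing the footer. That is sufficient here, since only the schema is needed.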
 
Example #2
Source File: LocalParquetEvaluator.java    From rainbow with Apache License 2.0
public static ParquetMetadata[] getMetadatas (FileStatus[] fileStatuses, Configuration conf) throws IOException
{
    ParquetMetadata[] res = new ParquetMetadata[fileStatuses.length];
    for (int i = 0; i < fileStatuses.length; ++i)
    {
        res[i] = ParquetFileReader.readFooter(conf, fileStatuses[i].getPath(), NO_FILTER);
    }
    return res;
}
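NO_FILTER here is the statically imported parquet.format.converter.ParquetMetadataConverter.NO_FILTER, which asks readFooter for the complete footer rather than a filtered subset of its row-group metadata.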
 
Example #3
Source File: ShowMetaCommand.java    From parquet-tools with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  
  Configuration conf = new Configuration();
  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, new Path(input));

  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
                                           .withAutoColumn()
                                           .withAutoCrop()
                                           .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
                                           .withColumnPadding(1)
                                           .build();

  MetadataUtils.showDetails(out, metaData);
  out.flushColumns();
}
 
Example #4
Source File: HdfsOdpsImportJob.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
    throws IOException {

  ArrayList<FileStatus> files = new ArrayList<FileStatus>();
  FileStatus[] dirs;
  dirs = fs.globStatus(fs.makeQualified(getInputPath()));
  for (int i = 0; (dirs != null && i < dirs.length); i++) {
    files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
    // We only check one file, so exit the loop when we have at least
    // one.
    if (files.size() > 0) {
      break;
    }
  }

  ParquetMetadata parquetMetadata;
  try {
    parquetMetadata =
        ParquetFileReader.readFooter(job.getConfiguration(),
            fs.makeQualified(files.get(0).getPath()));
  } catch (IOException e) {
    LOG.error("Wrong file format. Please check the export file's format.", e);
    throw e;
  }
  MessageType schema = parquetMetadata.getFileMetaData().getSchema();
  Schema avroSchema = new AvroSchemaConverter().convert(schema);
  DatasetDescriptor descriptor =
      new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
          .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
  return descriptor;
}
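The footer-to-Avro conversion in the middle of this method is reusable on its own. A minimal sketch, assuming the parquet-avro module is on the classpath (the path literal is a placeholder):

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import parquet.avro.AvroSchemaConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.schema.MessageType;

public class AvroConversionSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Placeholder path; point this at any readable Parquet file.
    MessageType parquetSchema = ParquetFileReader
        .readFooter(conf, new Path("/tmp/example.parquet"))
        .getFileMetaData().getSchema();
    // Mirror the conversion step from the method above.
    Schema avroSchema = new AvroSchemaConverter().convert(parquetSchema);
    System.out.println(avroSchema.toString(true));
  }
}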
 
Example #5
Source File: DumpCommand.java    From parquet-tools with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
    super.execute(options);

    String[] args = options.getArgs();
    String input = args[0];

    Configuration conf = new Configuration();
    Path inpath = new Path(input);

    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath);
    MessageType schema = metaData.getFileMetaData().getSchema();

    PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
                                             .withAutoColumn()
                                             .withAutoCrop()
                                             .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
                                             .withColumnPadding(1)
                                             .withMaxBufferedLines(1000000)
                                             .withFlushOnTab()
                                             .build();

    boolean showmd = !options.hasOption('m');
    boolean showdt = !options.hasOption('d');

    Set<String> showColumns = null;
    if (options.hasOption('c')) {
        String[] cols = options.getOptionValues('c');
        showColumns = new HashSet<String>(Arrays.asList(cols));
    }

    dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
 
Example #6
Source File: ParquetFileMetadata.java    From rainbow with Apache License 2.0
public ParquetFileMetadata(Configuration conf, Path hdfsFilePath) throws IOException
{
    this.metaData = ParquetFileReader.readFooter(conf, hdfsFilePath, NO_FILTER);
}
 
Example #7
Source File: LocalParquetEvaluator.java    From rainbow with Apache License 2.0
public static LocalMetrics execute (FileStatus[] fileStatuses, ParquetMetadata[] metadatas, String[] columnNames, Configuration conf) throws IOException
{
    boolean printColumns = true;
    List<ParquetFileReader> readers = new ArrayList<ParquetFileReader>();
    List<Column> columns = new ArrayList<Column>();
    for (int i = 0; i < fileStatuses.length; ++i)
    {
        FileStatus status = fileStatuses[i];
        ParquetMetadata metadata = metadatas[i];

        MessageType schema = metadata.getFileMetaData().getSchema();

        List<ColumnDescriptor> columnDescriptors = new ArrayList<ColumnDescriptor>();

        for (String columnName : columnNames)
        {
            int fieldIndex = schema.getFieldIndex(columnName.toLowerCase());
            ColumnDescriptor descriptor = schema.getColumns().get(fieldIndex);

            columnDescriptors.add(descriptor);

            if (printColumns)
            {
                Column column = new Column();
                column.setIndex(fieldIndex);
                column.setName(schema.getFieldName(column.getIndex()));
                column.setDescriptor(descriptor);
                columns.add(column);
            }
        }
        printColumns = false;

        readers.add(new ParquetFileReader(conf, status.getPath(), metadata.getBlocks(), columnDescriptors));
    }

    long time = System.currentTimeMillis();
    long rowCount = 0;
    long rowGroupCount = 0;
    long readerCount = readers.size();
    for (ParquetFileReader reader : readers)
    {
        PageReadStore pageReadStore;
        while ((pageReadStore = reader.readNextRowGroup()) != null)
        {
            rowGroupCount++;
            rowCount += pageReadStore.getRowCount();
        }
        reader.close();
    }
    LocalMetrics metrics = new LocalMetrics(columns, readerCount, rowGroupCount, rowCount, System.currentTimeMillis()-time);
    return metrics;
}
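readNextRowGroup() hands back a PageReadStore for the next row group and returns null once the file is exhausted, so the while loop visits each row group exactly once. Because each reader was constructed with only the requested column descriptors, only those columns' pages are actually read.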
 
Example #8
Source File: TestReadWriteParquet.java    From parquet-examples with Apache License 2.0
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        LOG.error("Usage: " + getClass().getName() + " INPUTFILE OUTPUTFILE [compression]");
        return 1;
    }
    String inputFile = args[0];
    String outputFile = args[1];
    String compression = (args.length > 2) ? args[2] : "none";

    Path parquetFilePath = null;
    // Find a file in case a directory was passed
    RemoteIterator<LocatedFileStatus> it = FileSystem.get(getConf()).listFiles(new Path(inputFile), true);
    while (it.hasNext()) {
        FileStatus fs = it.next();
        if (fs.isFile()) {
            parquetFilePath = fs.getPath();
            break;
        }
    }
    if (parquetFilePath == null) {
        LOG.error("No file found for " + inputFile);
        return 1;
    }
    LOG.info("Getting schema from " + parquetFilePath);
    ParquetMetadata readFooter = ParquetFileReader.readFooter(getConf(), parquetFilePath);
    MessageType schema = readFooter.getFileMetaData().getSchema();
    LOG.info(schema);
    GroupWriteSupport.setSchema(schema, getConf());

    Job job = new Job(getConf());
    job.setJarByClass(getClass());
    job.setJobName(getClass().getName());
    job.setMapperClass(ReadRequestMap.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(ExampleInputFormat.class);
    job.setOutputFormatClass(ExampleOutputFormat.class);

    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    if (compression.equalsIgnoreCase("snappy")) {
        codec = CompressionCodecName.SNAPPY;
    } else if (compression.equalsIgnoreCase("gzip")) {
        codec = CompressionCodecName.GZIP;
    }
    LOG.info("Output compression: " + codec);
    ExampleOutputFormat.setCompression(job, codec);

    FileInputFormat.setInputPaths(job, new Path(inputFile));
    FileOutputFormat.setOutputPath(job, new Path(outputFile));

    job.waitForCompletion(true);

    return 0;
}
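The getConf() calls indicate this class is wired through Hadoop's Tool/ToolRunner machinery, so an invocation would presumably look something like the following, where the jar name is a placeholder:

hadoop jar parquet-examples.jar TestReadWriteParquet input.parquet output snappy

The third argument is optional and may be snappy or gzip; anything else leaves the output uncompressed.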