parquet.hadoop.metadata.ParquetMetadata Java Examples

The following examples show how to use parquet.hadoop.metadata.ParquetMetadata. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: PentahoTwitterInputFormat.java From pentaho-hadoop-shims with Apache License 2.0

7 votes

@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration conf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, conf );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), conf );
    FileStatus fileStatus = fs.getFileStatus( filePath );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.buildInputFields( schema );
    }
  } );
}

Example #2

Source File: LocalParquetEvaluator.java From rainbow with Apache License 2.0

6 votes

public static ParquetMetadata[] getMetadatas (FileStatus[] fileStatuses, Configuration conf) throws IOException
{
    ParquetMetadata[] res = new ParquetMetadata[fileStatuses.length];
    for (int i = 0; i < fileStatuses.length; ++i)
    {
        res[i] = ParquetFileReader.readFooter(conf, fileStatuses[i].getPath(), NO_FILTER);
    }
    return res;
}

Example #3

Source File: ShowMetaCommand.java From parquet-tools with Apache License 2.0

6 votes

@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  
  Configuration conf = new Configuration();
  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, new Path(input));

  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
                                           .withAutoColumn()
                                           .withAutoCrop()
                                           .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
                                           .withColumnPadding(1)
                                           .build();

  MetadataUtils.showDetails(out, metaData);
  out.flushColumns();
}

Example #4

Source File: HdfsOdpsImportJob.java From aliyun-maxcompute-data-collectors with Apache License 2.0

5 votes

private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
    throws IOException {

  ArrayList<FileStatus> files = new ArrayList<FileStatus>();
  FileStatus[] dirs;
  dirs = fs.globStatus(fs.makeQualified(getInputPath()));
  for (int i = 0; (dirs != null && i < dirs.length); i++) {
    files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
    // We only check one file, so exit the loop when we have at least
    // one.
    if (files.size() > 0) {
      break;
    }
  }

  ParquetMetadata parquetMetadata;
  try {
    parquetMetadata =
        ParquetFileReader.readFooter(job.getConfiguration(),
            fs.makeQualified(files.get(0).getPath()));
  } catch (IOException e) {
    LOG.error("Wrong file format. Please check the export file's format.", e);
    throw e;
  }
  MessageType schema = parquetMetadata.getFileMetaData().getSchema();
  Schema avroSchema = new AvroSchemaConverter().convert(schema);
  DatasetDescriptor descriptor =
      new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
          .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
  return descriptor;
}

Example #5

Source File: MetadataUtils.java From parquet-tools with Apache License 2.0

5 votes

public static void showDetails(PrettyPrintWriter out, ParquetMetadata meta) {
  showDetails(out, meta.getFileMetaData());

  long i = 1;
  for (BlockMetaData bmeta : meta.getBlocks()) {
    out.println();
    showDetails(out, bmeta, i++);
  }
}

Example #6

Source File: DumpCommand.java From parquet-tools with Apache License 2.0

5 votes

@Override
public void execute(CommandLine options) throws Exception {
    super.execute(options);

    String[] args = options.getArgs();
    String input = args[0];

    Configuration conf = new Configuration();
    Path inpath = new Path(input);

    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath);
    MessageType schema = metaData.getFileMetaData().getSchema();

    PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
                                             .withAutoColumn()
                                             .withAutoCrop()
                                             .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
                                             .withColumnPadding(1)
                                             .withMaxBufferedLines(1000000)
                                             .withFlushOnTab()
                                             .build();

    boolean showmd = !options.hasOption('m');
    boolean showdt = !options.hasOption('d');

    Set<String> showColumns = null;
    if (options.hasOption('c')) {
        String[] cols = options.getOptionValues('c');
        showColumns = new HashSet<String>(Arrays.asList(cols));
    }

    dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}

Example #7

Source File: ParquetMetadataReader.java From paraflow with Apache License 2.0

4 votes

public static ParquetMetadata readFooter(FileSystem fileSystem, Path file)
        throws IOException
{
    FileStatus fileStatus = fileSystem.getFileStatus(file);
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        // Parquet File Layout:
        //
        // MAGIC
        // variable: Data
        // variable: Metadata
        // 4 bytes: MetadataLength
        // MAGIC

        long length = fileStatus.getLen();
        validateParquet(length >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
        long metadataLengthIndex = length - PARQUET_METADATA_LENGTH - MAGIC.length;

        inputStream.seek(metadataLengthIndex);
        int metadataLength = readIntLittleEndian(inputStream);

        byte[] magic = new byte[MAGIC.length];
        inputStream.readFully(magic);
        validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));

        long metadataIndex = metadataLengthIndex - metadataLength;
        validateParquet(
                metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
                "Corrupted Parquet file: %s metadata index: %s out of range",
                file,
                metadataIndex);
        inputStream.seek(metadataIndex);
        FileMetaData fileMetaData = readFileMetaData(inputStream);
        List<SchemaElement> schema = fileMetaData.getSchema();
        validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);

        MessageType messageType = readParquetSchema(schema);
        List<BlockMetaData> blocks = new ArrayList<>();
        List<RowGroup> rowGroups = fileMetaData.getRow_groups();
        if (rowGroups != null) {
            for (RowGroup rowGroup : rowGroups) {
                BlockMetaData blockMetaData = new BlockMetaData();
                blockMetaData.setRowCount(rowGroup.getNum_rows());
                blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
                List<ColumnChunk> columns = rowGroup.getColumns();
                validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
                String filePath = columns.get(0).getFile_path();
                for (ColumnChunk columnChunk : columns) {
                    validateParquet(
                            (filePath == null && columnChunk.getFile_path() == null)
                                    || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                            "all column chunks of the same row group must be in the same file");
                    ColumnMetaData metaData = columnChunk.meta_data;
                    String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
                    ColumnPath columnPath = ColumnPath.get(path);
                    ColumnChunkMetaData column = ColumnChunkMetaData.get(
                            columnPath,
                            messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(),
                            CompressionCodecName.fromParquet(metaData.codec),
                            readEncodings(metaData.encodings),
                            readStats(metaData.statistics, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName()),
                            metaData.data_page_offset,
                            metaData.dictionary_page_offset,
                            metaData.num_values,
                            metaData.total_compressed_size,
                            metaData.total_uncompressed_size);
                    blockMetaData.addColumn(column);
                }
                blockMetaData.setPath(filePath);
                blocks.add(blockMetaData);
            }
        }

        Map<String, String> keyValueMetaData = new HashMap<>();
        List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
        if (keyValueList != null) {
            for (KeyValue keyValue : keyValueList) {
                keyValueMetaData.put(keyValue.key, keyValue.value);
            }
        }
        return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    }
}

Example #8

Source File: ParaflowPageSourceProvider.java From paraflow with Apache License 2.0

4 votes

private Optional<ConnectorPageSource> createParaflowPageSource(
        Path path,
        long start,
        long length,
        List<ParaflowColumnHandle> columns)
{
    Optional<FileSystem> fileSystemOptional = fsFactory.getFileSystem();
    FileSystem fileSystem;
    ParquetDataSource dataSource;
    if (fileSystemOptional.isPresent()) {
        fileSystem = fileSystemOptional.get();
    }
    else {
        throw new RuntimeException("Could not find filesystem for path " + path);
    }
    try {
        dataSource = buildHdfsParquetDataSource(fileSystem, path, start, length);
        // default length is file size, which means whole file is a split
        length = dataSource.getSize();
        ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();

        List<Type> fields = columns.stream()
                .filter(column -> column.getColType() != ParaflowColumnHandle.ColumnType.NOTVALID)
                .map(column -> getParquetType(column, fileSchema))
                .filter(Objects::nonNull)
                .collect(Collectors.toList());
        MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);

        List<BlockMetaData> blocks = new ArrayList<>();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                blocks.add(block);
            }
        }

        ParquetReader parquetReader = new ParquetReader(
                fileSchema,
                requestedSchema,
                blocks,
                dataSource,
                typeManager);
        return Optional.of(new ParaflowPageSource(
                parquetReader,
                dataSource,
                fileSchema,
                requestedSchema,
                length,
                columns,
                typeManager));
    }
    catch (IOException e) {
        log.error(e);
        return Optional.empty();
    }
}

Example #9

Source File: LocalParquetEvaluator.java From rainbow with Apache License 2.0

4 votes

public static LocalMetrics execute (FileStatus[] fileStatuses, ParquetMetadata[] metadatas, String[] columnNames, Configuration conf) throws IOException
{
    boolean printColumns = true;
    List<ParquetFileReader> readers = new ArrayList<ParquetFileReader>();
    List<Column> columns = new ArrayList<Column>();
    for (int i = 0; i < fileStatuses.length; ++i)
    {
        FileStatus status = fileStatuses[i];
        ParquetMetadata metadata = metadatas[i];

        MessageType schema = metadata.getFileMetaData().getSchema();

        List<ColumnDescriptor> columnDescriptors = new ArrayList<ColumnDescriptor>();

        for (String columnName : columnNames)
        {
            int fieldIndex = schema.getFieldIndex(columnName.toLowerCase());
            ColumnDescriptor descriptor = schema.getColumns().get(fieldIndex);

            columnDescriptors.add(descriptor);

            if (printColumns)
            {
                Column column = new Column();
                column.setIndex(fieldIndex);
                column.setName(schema.getFieldName(column.getIndex()));
                column.setDescriptor(descriptor);
                columns.add(column);
            }
        }
        printColumns = false;

        readers.add(new ParquetFileReader(conf, status.getPath(), metadata.getBlocks(), columnDescriptors));
    }

    long time  = System.currentTimeMillis();
    long rowCount = 0;
    long rowGroupCount = 0;
    long readerCount = readers.size();
    for (ParquetFileReader reader : readers)
    {
        PageReadStore pageReadStore;
        while ((pageReadStore = reader.readNextRowGroup()) != null)
        {
            rowGroupCount ++;
            rowCount += pageReadStore.getRowCount();
        }
        reader.close();
    }
    LocalMetrics metrics = new LocalMetrics(columns, readerCount, rowGroupCount, rowCount, System.currentTimeMillis()-time);
    return metrics;
}

Example #10

Source File: TestReadWriteParquet.java From parquet-examples with Apache License 2.0

4 votes

public int run(String[] args) throws Exception {
if(args.length < 2) {
    LOG.error("Usage: " + getClass().getName() + " INPUTFILE OUTPUTFILE [compression]");
    return 1;
}
String inputFile = args[0];
String outputFile = args[1];
String compression = (args.length > 2) ? args[2] : "none";

Path parquetFilePath = null;
// Find a file in case a directory was passed
RemoteIterator<LocatedFileStatus> it = FileSystem.get(getConf()).listFiles(new Path(inputFile), true);
while(it.hasNext()) {
    FileStatus fs = it.next();
    if(fs.isFile()) {
	parquetFilePath = fs.getPath();
	break;
    }
}
if(parquetFilePath == null) {
    LOG.error("No file found for " + inputFile);
    return 1;
}
LOG.info("Getting schema from " + parquetFilePath);
ParquetMetadata readFooter = ParquetFileReader.readFooter(getConf(), parquetFilePath);
MessageType schema = readFooter.getFileMetaData().getSchema();
LOG.info(schema);
GroupWriteSupport.setSchema(schema, getConf());

       Job job = new Job(getConf());
       job.setJarByClass(getClass());
       job.setJobName(getClass().getName());
       job.setMapperClass(ReadRequestMap.class);
job.setNumReduceTasks(0);
job.setInputFormatClass(ExampleInputFormat.class);
job.setOutputFormatClass(ExampleOutputFormat.class);

CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
if(compression.equalsIgnoreCase("snappy")) {
    codec = CompressionCodecName.SNAPPY;
} else if(compression.equalsIgnoreCase("gzip")) {
    codec = CompressionCodecName.GZIP;
}
LOG.info("Output compression: " + codec);
ExampleOutputFormat.setCompression(job, codec);

FileInputFormat.setInputPaths(job, new Path(inputFile));
       FileOutputFormat.setOutputPath(job, new Path(outputFile));

       job.waitForCompletion(true);

       return 0;
   }