Java Code Examples for org.apache.parquet.hadoop.ParquetFileWriter

The following examples show how to use org.apache.parquet.hadoop.ParquetFileWriter. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: parquet-mr   Source File: TransCompressionCommand.java    License: Apache License 2.0 7 votes vote down vote up
/**
 * Validates the command arguments, then copies the input Parquet file to the output path while
 * handing the requested codec to {@code compressionConverter.processBlocks}.
 *
 * @return always 0 when the method completes normally
 * @throws IOException if reading the input footer or writing the output fails
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(input != null && output != null,
    "Both input and output parquet file paths are required.");
  Preconditions.checkArgument(codec != null,
    "The codec cannot be null");

  Path sourcePath = new Path(input);
  Path targetPath = new Path(output);
  CompressionCodecName targetCodec = CompressionCodecName.valueOf(codec);

  // The footer of the input supplies both the schema and the key/value metadata for the output.
  ParquetMetadata footer = ParquetFileReader.readFooter(getConf(), sourcePath, NO_FILTER);
  MessageType fileSchema = footer.getFileMetaData().getSchema();

  ParquetFileWriter fileWriter =
    new ParquetFileWriter(getConf(), fileSchema, targetPath, ParquetFileWriter.Mode.CREATE);
  fileWriter.start();

  try (TransParquetFileReader fileReader = new TransParquetFileReader(
      HadoopInputFile.fromPath(sourcePath, getConf()), HadoopReadOptions.builder(getConf()).build())) {
    String createdBy = footer.getFileMetaData().getCreatedBy();
    compressionConverter.processBlocks(fileReader, fileWriter, footer, fileSchema, createdBy, targetCodec);
  } finally {
    // end() runs on both the success and the failure path.
    fileWriter.end(footer.getFileMetaData().getKeyValueMetaData());
  }
  return 0;
}
 
Example 2
Source Project: Bats   Source File: ParquetReaderUtility.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds a lookup from full schema paths, formatted as `a`.`b`.`c`, to their SchemaElement objects.
 *
 * @param footer Parquet file metadata
 * @return       map keyed by full schema path whose values are the matching SchemaElement objects
 */
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  Map<String, SchemaElement> mapping = new HashMap<>();
  FileMetaData convertedMetaData =
      new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
  Iterator<SchemaElement> elements = convertedMetaData.getSchema().iterator();

  // The first element is the default `root` element. It is skipped so that keys read `a` rather than
  // `root`.`a`, which avoids stripping it again when comparing with the SchemaPath string form.
  if (!elements.hasNext()) {
    return mapping;
  }
  elements.next();

  // Each call consumes one or more elements from the shared iterator.
  while (elements.hasNext()) {
    addSchemaElementMapping(elements, new StringBuilder(), mapping);
  }
  return mapping;
}
 
Example 3
Source Project: Bats   Source File: ParquetFormatPlugin.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Decides whether a directory can be read by this plugin.
 *
 * <p>Returns true when the Parquet summary file exists in the directory or when
 * {@code metaDataFileExists} reports true. Otherwise the answer is whether the first listed file
 * is readable; an empty listing yields false.
 *
 * @param fs  file system used for the existence checks and the listing
 * @param dir directory to inspect
 * @return true if the directory is considered readable; false otherwise, including on IOException
 */
boolean isDirReadable(DrillFileSystem fs, FileStatus dir) {
  Path summaryFile = new Path(dir.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
  try {
    if (fs.exists(summaryFile) || metaDataFileExists(fs, dir)) {
      return true;
    }
    List<FileStatus> children = DrillFileSystemUtil.listFiles(fs, dir.getPath(), false);
    if (children.isEmpty()) {
      return false;
    }
    return super.isFileReadable(fs, children.get(0));
  } catch (IOException e) {
    logger.info("Failure while attempting to check for Parquet metadata file.", e);
    return false;
  }
}
 
Example 4
/**
 * Writes each map in {@code data} as one Avro {@code GenericRecord} to a Parquet file opened in
 * OVERWRITE mode.
 *
 * @param file           destination file
 * @param avroSchemaPath resource path passed to {@code ResourceUtils.loadResource} to obtain the Avro schema
 * @param data           rows to write; each entry becomes a field of the record
 * @throws IOException if the writer cannot be built or a record cannot be written
 */
private void write(File file, String avroSchemaPath, List<Map<String, String>> data) throws IOException
{
    Schema schema = new Parser().parse(ResourceUtils.loadResource(avroSchemaPath));
    try (ParquetWriter<GenericRecord> writer = AvroParquetWriter
            .<GenericRecord>builder(new Path(file.toURI()))
            .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
            .withDataModel(GenericData.get())
            .withSchema(schema)
            .build())
    {
        for (Map<String, String> row : data)
        {
            GenericRecord record = new GenericData.Record(schema);
            for (Map.Entry<String, String> field : row.entrySet())
            {
                record.put(field.getKey(), field.getValue());
            }
            writer.write(record);
        }
    }
}
 
Example 5
Source Project: kafka-connect-fs   Source File: ParquetFileReaderTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Writes {@code NUM_RECORDS} generated Avro records to a local temporary Parquet file, records the
 * offset of each index in {@code fsConfig.offsetsByIndex()}, and moves the file to the test file system.
 *
 * @param fsConfig test configuration providing the target file system, its URI and the offsets map
 * @param args     unused by this implementation
 * @return the path of the file on the target file system
 * @throws IOException if the temporary file cannot be created, written or moved
 */
@Override
protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException {
    FileSystem fs = fsConfig.getFs();
    File parquetFile = File.createTempFile("test-", "." + getFileExtension());

    // The writer is typed (it was a raw ParquetWriter) so that write(datum) is checked by the compiler.
    try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(new Path(parquetFile.toURI()))
            .withConf(fs.getConf()).withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withSchema(readerSchema).build()) {
        IntStream.range(0, NUM_RECORDS).forEach(index -> {
            GenericRecord datum = new GenericData.Record(readerSchema);
            datum.put(FIELD_INDEX, index);
            String uuid = UUID.randomUUID().toString();
            datum.put(FIELD_NAME, String.format("%d_name_%s", index, uuid));
            datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, uuid));
            try {
                fsConfig.offsetsByIndex().put(index, (long) index);
                writer.write(datum);
            } catch (IOException ioe) {
                // The lambda cannot throw a checked exception, so wrap it and keep the cause.
                throw new RuntimeException(ioe);
            }
        });
    }
    Path path = new Path(new Path(fsConfig.getFsUri()), parquetFile.getName());
    fs.moveFromLocalFile(new Path(parquetFile.getAbsolutePath()), path);
    return path;
}
 
Example 6
Source Project: hudi   Source File: HoodieParquetWriter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates a writer in CREATE mode for {@code file}, configured from {@code parquetConfig}.
 *
 * <p>The file path is converted with {@code HoodieWrapperFileSystem.convertToHoodiePath} and the
 * Hadoop configuration is passed through {@code FSUtils.registerFileSystem} before use.
 *
 * @param instantTime              stored on this writer as {@code instantTime}
 * @param file                     path of the file to write
 * @param parquetConfig            source of the write support, codec, block/page sizes, max file size,
 *                                 compression ratio and Hadoop configuration
 * @param schema                   not referenced in this constructor body
 * @param sparkTaskContextSupplier stored on this writer as {@code sparkTaskContextSupplier}
 * @throws IOException declared by this constructor; presumably raised by the superclass constructor
 *                     or the file system lookup — TODO confirm
 */
public HoodieParquetWriter(String instantTime, Path file, HoodieParquetConfig parquetConfig,
    Schema schema, SparkTaskContextSupplier sparkTaskContextSupplier) throws IOException {
  // NOTE(review): getPageSize() is passed twice; presumably the second one is the dictionary page
  // size expected by the superclass constructor — confirm against its signature.
  super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
      ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
      parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(),
      ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
      ParquetWriter.DEFAULT_WRITER_VERSION, FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
  // The path conversion and file-system registration are repeated here because they could not be
  // stored in locals before the super(...) call above.
  this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
  this.fs =
      (HoodieWrapperFileSystem) this.file.getFileSystem(FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
  // We cannot accurately measure the snappy compressed output file size. We are choosing a
  // conservative 10%
  // NOTE(review): the code below uses parquetConfig.getCompressionRatio(), not a fixed 10%;
  // the figure in the comment above may be stale — confirm.
  // TODO - compute this compression ratio dynamically by looking at the bytes written to the
  // stream and the actual file size reported by HDFS
  this.maxFileSize = parquetConfig.getMaxFileSize()
      + Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
  this.writeSupport = parquetConfig.getWriteSupport();
  this.instantTime = instantTime;
  this.sparkTaskContextSupplier = sparkTaskContextSupplier;
}
 
Example 7
Source Project: components   Source File: ParquetHdfsFileSink.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Merges every file listed under {@code sourceFolder} into a single Parquet file at {@code targetFile}.
 *
 * <p>The schema and key/value metadata of the target come from
 * {@code ParquetFileWriter.mergeMetadataFiles}; each source file is then appended to the target.
 *
 * @param fs           file system used for listing and as the source of the Hadoop configuration
 * @param sourceFolder folder whose sub-files are merged
 * @param targetFile   path of the merged output file, created in CREATE mode
 * @throws IOException if listing, merging metadata, or writing fails
 */
@Override
protected void mergeOutput(FileSystem fs, String sourceFolder, String targetFile) throws IOException {
    FileStatus[] listed = FileSystemUtil.listSubFiles(fs, sourceFolder);
    List<Path> inputs = new ArrayList<>();
    for (FileStatus status : listed) {
        inputs.add(status.getPath());
    }

    FileMetaData mergedMeta = ParquetFileWriter.mergeMetadataFiles(inputs, fs.getConf()).getFileMetaData();

    ParquetFileWriter mergedWriter = new ParquetFileWriter(
            fs.getConf(), mergedMeta.getSchema(), new Path(targetFile), ParquetFileWriter.Mode.CREATE);
    mergedWriter.start();
    for (Path part : inputs) {
        mergedWriter.appendFile(fs.getConf(), part);
    }
    mergedWriter.end(mergedMeta.getKeyValueMetaData());
}
 
Example 8
Source Project: parquet-mr   Source File: TransCompressionCommand.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Copies a Parquet file to a new path, handing the requested codec to
 * {@code compressionConverter.processBlocks}.
 *
 * <p>Positional arguments: 0 = input path, 1 = output path, 2 = {@code CompressionCodecName} constant name.
 *
 * @param options parsed command line
 * @throws Exception if the superclass handling, the footer read, or the rewrite fails
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  List<String> positional = options.getArgList();
  Path sourcePath = new Path(positional.get(0));
  Path targetPath = new Path(positional.get(1));
  CompressionCodecName targetCodec = CompressionCodecName.valueOf(positional.get(2));

  // The footer of the input supplies both the schema and the key/value metadata for the output.
  ParquetMetadata footer = ParquetFileReader.readFooter(conf, sourcePath, NO_FILTER);
  MessageType fileSchema = footer.getFileMetaData().getSchema();

  ParquetFileWriter fileWriter =
    new ParquetFileWriter(conf, fileSchema, targetPath, ParquetFileWriter.Mode.CREATE);
  fileWriter.start();

  try (TransParquetFileReader fileReader = new TransParquetFileReader(
      HadoopInputFile.fromPath(sourcePath, conf), HadoopReadOptions.builder(conf).build())) {
    String createdBy = footer.getFileMetaData().getCreatedBy();
    compressionConverter.processBlocks(fileReader, fileWriter, footer, fileSchema, createdBy, targetCodec);
  } finally {
    // end() runs on both the success and the failure path.
    fileWriter.end(footer.getFileMetaData().getKeyValueMetaData());
  }
}
 
Example 9
Source Project: parquet-mr   Source File: CompressionConverter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Copies every row group from {@code reader} to {@code writer}, column chunk by column chunk,
 * delegating the page-level work to {@code processChunk}.
 *
 * @param reader    source of row groups; read until {@code readNextRowGroup()} returns null
 * @param writer    destination writer; blocks and columns are started and ended here
 * @param meta      footer of the input file; its block list is indexed in step with the row groups read
 * @param schema    schema used to resolve column descriptors and to build the column read store
 * @param createdBy passed to the column read store and to {@code processChunk}
 * @param codecName codec passed to {@code writer.startColumn} and {@code processChunk}
 * @throws IOException if reading or writing fails
 */
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta, MessageType schema,
                           String createdBy, CompressionCodecName codecName) throws IOException {
  // The path-to-descriptor lookup depends only on the schema, so it is built once instead of once
  // per row group.
  Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(
    Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));

  int blockIndex = 0;
  PageReadStore store = reader.readNextRowGroup();
  while (store != null) {
    writer.startBlock(store.getRowCount());
    BlockMetaData blockMetaData = meta.getBlocks().get(blockIndex);
    // Columns are processed in the order recorded in the block metadata.
    for (ColumnChunkMetaData chunk : blockMetaData.getColumns()) {
      ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
      ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
      writer.startColumn(columnDescriptor, crstore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
      processChunk(reader, writer, chunk, createdBy, codecName);
      writer.endColumn();
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
  }
}
 
Example 10
Source Project: parquet-mr   Source File: CompressionConveterTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Copies {@code inputFile} to {@code outputFile}, handing the named codec to
 * {@code compressionConverter.processBlocks}.
 *
 * @param conf       Hadoop configuration used for reading and writing
 * @param inputFile  path of the Parquet file to read
 * @param outputFile path of the Parquet file to create
 * @param codec      name of a {@code CompressionCodecName} constant
 * @throws IOException if reading or writing fails
 */
private void convertCompression(Configuration conf, String inputFile, String outputFile, String codec) throws IOException {
  Path sourcePath = new Path(inputFile);
  Path targetPath = new Path(outputFile);
  CompressionCodecName targetCodec = CompressionCodecName.valueOf(codec);

  // The footer of the input supplies both the schema and the key/value metadata for the output.
  ParquetMetadata footer = ParquetFileReader.readFooter(conf, sourcePath, NO_FILTER);
  MessageType fileSchema = footer.getFileMetaData().getSchema();

  ParquetFileWriter fileWriter =
    new ParquetFileWriter(conf, fileSchema, targetPath, ParquetFileWriter.Mode.CREATE);
  fileWriter.start();

  try (TransParquetFileReader fileReader = new TransParquetFileReader(
      HadoopInputFile.fromPath(sourcePath, conf), HadoopReadOptions.builder(conf).build())) {
    String createdBy = footer.getFileMetaData().getCreatedBy();
    compressionConverter.processBlocks(fileReader, fileWriter, footer, fileSchema, createdBy, targetCodec);
  } finally {
    // end() runs on both the success and the failure path.
    fileWriter.end(footer.getFileMetaData().getKeyValueMetaData());
  }
}
 
Example 11
Source Project: Bats   Source File: FooterGatherer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that {@code data}, starting at {@code offset}, matches {@code ParquetFileWriter.MAGIC}
 * for {@code MAGIC_LENGTH} bytes.
 *
 * @param status file being checked; only its path is used, in the error message
 * @param data   buffer holding the bytes to compare
 * @param offset index in {@code data} where the magic bytes are expected to start
 * @throws IOException on the first byte that differs from the expected magic number
 */
private static void checkMagicBytes(FileStatus status, byte[] data, int offset) throws IOException {
  for (int idx = 0; idx < MAGIC_LENGTH; idx++) {
    if (ParquetFileWriter.MAGIC[idx] == data[offset + idx]) {
      continue;
    }
    byte[] found = ArrayUtils.subarray(data, offset, offset + MAGIC_LENGTH);
    throw new IOException(status.getPath() + " is not a Parquet file. expected magic number at tail " + Arrays.toString(ParquetFileWriter.MAGIC) + " but found " + Arrays.toString(found));
  }
}
 
Example 12
Source Project: Bats   Source File: FooterGatherer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Collects Parquet footers for the given files and directories.
 *
 * <p>A directory that contains the Parquet summary file contributes the footers read from that
 * summary. Any other directory contributes one footer read per listed file. Plain files are read
 * directly. All individual footer reads are executed together through {@code TimedCallable.run}.
 *
 * @param conf        Hadoop configuration
 * @param statuses    files and directories to inspect
 * @param parallelism passed to {@code TimedCallable.run}
 * @return footers from summary files followed by the individually read footers
 * @throws IOException if a file system call or a footer read fails
 */
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException {
  final List<TimedCallable<Footer>> pendingReads = new ArrayList<>();
  final List<Footer> footers = new ArrayList<>();

  for (FileStatus status : statuses) {
    if (!status.isDirectory()) {
      pendingReads.add(new FooterReader(conf, status));
      continue;
    }

    // Directory: prefer the summary file when one exists.
    FileSystem fs = status.getPath().getFileSystem(conf);
    final Path summaryPath = new Path(status.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
    if (fs.exists(summaryPath)) {
      FileStatus summaryStatus = fs.getFileStatus(summaryPath);
      footers.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus));
      continue;
    }

    // No summary file: every listed file is read like a normal file.
    for (FileStatus child : DrillFileSystemUtil.listFiles(fs, status.getPath(), false)) {
      pendingReads.add(new FooterReader(conf, child));
    }
  }

  if (!pendingReads.isEmpty()) {
    footers.addAll(TimedCallable.run("Fetch Parquet Footers", logger, pendingReads, parallelism));
  }
  return footers;
}
 
Example 13
Source Project: Bats   Source File: FooterGatherer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * An updated footer reader that tries to read the entire footer without knowing the length.
 * This should reduce the amount of seek/read roundtrips in most workloads.
 *
 * <p>The tail of the file (at most {@code DEFAULT_READ_SIZE} bytes) is read in one call. If the
 * footer length stored in that tail is larger than what was read, one more read fetches the
 * missing leading part.
 *
 * @param config Hadoop configuration used to resolve the file system of {@code status}
 * @param status status of the Parquet file; supplies the path and the file length
 * @return the footer parsed from the file, paired with the file's path
 * @throws IOException if the file cannot be opened or read, or if the magic bytes do not match
 */
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
  final FileSystem fs = status.getPath().getFileSystem(config);
  try(FSDataInputStream file = fs.open(status.getPath())) {

    final long fileLength = status.getLen();
    Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());

    // Read the last `len` bytes of the file in a single call.
    int len = (int) Math.min( fileLength, (long) DEFAULT_READ_SIZE);
    byte[] footerBytes = new byte[len];
    readFully(file, fileLength - len, footerBytes, 0, len);

    // The magic number is expected in the final bytes; the footer length is read from the last
    // FOOTER_METADATA_SIZE bytes (presumably a 4-byte length followed by the magic — TODO confirm).
    checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
    final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);

    if(size > footerBytes.length - FOOTER_METADATA_SIZE){
      // if the footer is larger than our initial read, we need to read the rest.
      byte[] origFooterBytes = footerBytes;
      int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;

      footerBytes = new byte[size];

      // Fetch only the leading part that is missing, then append the part already in memory.
      readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
      System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
    }else{
      // The whole footer is already in the buffer; cut out exactly `size` bytes before the trailer.
      int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
      footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
    }

    final ByteArrayInputStream from = new ByteArrayInputStream(footerBytes);
    ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(from, NO_FILTER);
    Footer footer = new Footer(status.getPath(), metadata);
    return footer;
  }
}
 
Example 14
Source Project: garmadon   Source File: ProtoParquetWriterWithOffset.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Merges the temporary file into the last available final file, or moves the temporary file to
 * {@code finalPath} when the schemas differ.
 *
 * <p>When the schemas match, both files are appended into a {@code .merged} sibling of the
 * temporary file, which then replaces {@code lastAvailableFinalPath}. The merged file carries the
 * existing key/value metadata plus {@code LATEST_TIMESTAMP_META_KEY}.
 *
 * @param lastAvailableFinalPath existing final file to merge into
 * @param finalPath              destination used only when the schemas are not identical
 * @throws IOException if reading, writing, or moving a file fails
 */
protected void mergeToFinalPath(Path lastAvailableFinalPath, Path finalPath) throws IOException {
    try (ParquetFileReader reader = ParquetFileReader.open(fs.getConf(), lastAvailableFinalPath)) {
        MessageType schema = reader.getFileMetaData().getSchema();

        if (!checkSchemaEquality(schema)) {
            LOGGER.warn("Schema between last available final file ({}) and temp file ({}) are not identical. We can't merge them",
                lastAvailableFinalPath, temporaryHdfsPath);
            moveToFinalPath(temporaryHdfsPath, finalPath);
            return;
        }

        Path mergedPath = new Path(temporaryHdfsPath.toString() + ".merged");
        // Remove a merged file that may already exist at this path.
        if (fs.isFile(mergedPath)) {
            fs.delete(mergedPath, false);
        }

        Map<String, String> mergedMetadata = new HashMap<>(reader.getFileMetaData().getKeyValueMetaData());
        mergedMetadata.put(LATEST_TIMESTAMP_META_KEY, String.valueOf(latestTimestamp));

        ParquetFileWriter mergedWriter = new ParquetFileWriter(fs.getConf(), schema, mergedPath);
        mergedWriter.start();
        try (
            ParquetFileReader finalReader = ParquetFileReader.open(fs.getConf(), lastAvailableFinalPath);
            ParquetFileReader tempReader = ParquetFileReader.open(fs.getConf(), temporaryHdfsPath)
        ) {
            finalReader.appendTo(mergedWriter);
            tempReader.appendTo(mergedWriter);
            mergedWriter.end(mergedMetadata);
        }

        moveToFinalPath(mergedPath, lastAvailableFinalPath);
        try {
            fs.delete(temporaryHdfsPath, false);
        } catch (IOException ignored) {
            // This file is in a temp folder that should be deleted at exit so we should not throw exception here
        }
    }
}
 
Example 15
Source Project: arvo2parquet   Source File: DataLoad.java    License: MIT License 5 votes vote down vote up
/**
 * Drains {@code sink} into a Parquet file and then serializes the writer's footer to a separate
 * file named {@code ParquetFileWriter.PARQUET_METADATA_FILE}.
 *
 * @param schema      schema used to create the writer
 * @param fileToWrite destination of the Parquet data file
 * @param sink        record source; {@code accept} is called repeatedly until it returns false
 * @throws IOException if writing the data file or the footer file fails
 */
private static void writeToParquet(@Nonnull final Schema schema,
                                   @Nonnull final Path fileToWrite,
                                   @Nonnull final GenericDataRecordSink sink) throws IOException
{
  try (final ParquetWriter<GenericData.Record> writer = createParquetWriterInstance(schema, fileToWrite)) {
    boolean more;
    do {
      more = sink.accept(writer::write);
    } while (more);

    // Closed explicitly before getFooter() is called; presumably the footer is only available
    // after close — TODO confirm. The try-with-resources closes the writer again on exit.
    writer.close();

    final Path footerFile = Paths.get(ParquetFileWriter.PARQUET_METADATA_FILE);
    Files.deleteIfExists(footerFile);
    try (final PositionOutputStream footerOut = nioPathToOutputFile(footerFile).createOrOverwrite(0)) {
      serializeFooter(writer.getFooter(), footerOut);
    }
  }
}
 
Example 16
Source Project: arvo2parquet   Source File: DataLoad.java    License: MIT License 5 votes vote down vote up
/**
 * Reads the footer of a Parquet file and serializes it to a file named
 * {@code ParquetFileWriter.PARQUET_METADATA_FILE} followed by {@code "_dup.parquet"}.
 *
 * @param parquetFilePath Parquet file whose footer is extracted
 * @throws IOException if the file cannot be read or the footer file cannot be written
 */
private static void extractMetaDataFooter(final Path parquetFilePath) throws IOException {
  try (final ParquetFileReader source = ParquetFileReader.open(nioPathToInputFile(parquetFilePath))) {
    final ParquetMetadata footer = source.getFooter();

    final String footerFileName = ParquetFileWriter.PARQUET_METADATA_FILE + "_dup.parquet";
    final Path footerFile = Paths.get(footerFileName);
    // Remove any file already at the target before writing the new one.
    Files.deleteIfExists(footerFile);

    try (final PositionOutputStream footerOut = nioPathToOutputFile(footerFile).createOrOverwrite(0)) {
      serializeFooter(footer, footerOut);
    }
  }
}
 
Example 17
Source Project: dremio-oss   Source File: ParquetReaderUtility.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Maps each schema element name in the footer to its SchemaElement.
 *
 * <p>Keys are the plain element names as returned by {@code getName()}; if two elements share a
 * name, the later one replaces the earlier one.
 *
 * @param footer Parquet file metadata
 * @return map from element name to SchemaElement
 */
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  FileMetaData convertedMetaData =
      new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);

  Map<String, SchemaElement> byName = new HashMap<>();
  for (SchemaElement element : convertedMetaData.getSchema()) {
    byName.put(element.getName(), element);
  }
  return byName;
}
 
Example 18
Source Project: dremio-oss   Source File: SingletonParquetFooterCache.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that {@code data}, starting at {@code offset}, matches {@code ParquetFileWriter.MAGIC}
 * for {@code MAGIC_LENGTH} bytes.
 *
 * @param path   path of the file being checked; used only in the error message
 * @param data   buffer holding the bytes to compare
 * @param offset index in {@code data} where the magic bytes are expected to start
 * @throws IOException on the first byte that differs from the expected magic number
 */
private static void checkMagicBytes(String path, byte[] data, int offset) throws IOException {
  for (int idx = 0; idx < MAGIC_LENGTH; idx++) {
    if (ParquetFileWriter.MAGIC[idx] == data[offset + idx]) {
      continue;
    }
    byte[] found = ArrayUtils.subarray(data, offset, offset + MAGIC_LENGTH);
    throw new IOException(path + " is not a Parquet file. expected magic number at tail " + Arrays.toString(ParquetFileWriter.MAGIC) + " but found " + Arrays.toString(found));
  }
}
 
Example 19
Source Project: dremio-oss   Source File: ParquetRecordWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates and starts a new {@link ParquetFileWriter} for the current partition file.
 *
 * <p>The file name is {@code prefix_index.extension}, qualified against {@code location} by the
 * current partition and canonicalized by the file system.
 * NOTE(review): the previous comment said this runs as the impersonated user; nothing in this
 * method establishes that — presumably it is handled by {@code fs}; confirm.
 *
 * @throws IOException if the path cannot be canonicalized or the writer cannot be created or started
 */
private void initRecordWriter() throws IOException {
  final String fileName = prefix + "_" + index + "." + extension;
  this.path = fs.canonicalizePath(partition.qualified(location, fileName));

  parquetFileWriter = new ParquetFileWriter(
      OutputFile.of(fs, path),
      checkNotNull(schema),
      ParquetFileWriter.Mode.CREATE,
      DEFAULT_BLOCK_SIZE,
      MAX_PADDING_SIZE_DEFAULT,
      DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
      true);
  parquetFileWriter.start();
}
 
Example 20
Source Project: embulk-output-parquet   Source File: ParquetOutputPlugin.java    License: MIT License 5 votes vote down vote up
/**
 * Builds the Parquet writer for one processor, configured from the plugin task.
 *
 * @param task           plugin task supplying codec, block/page sizes, extra configuration and the overwrite flag
 * @param schema         schema of the pages to be written
 * @param processorIndex index passed to {@code buildPath} to derive the output path
 * @return the configured writer; never null
 * @throws RuntimeException wrapping the {@link IOException} raised while building the writer
 */
private ParquetWriter<PageReader> createWriter(PluginTask task, Schema schema, int processorIndex)
{
    //In case of using Frankfurt (eu-central-1) with Signature Version 4 Signing Process
    System.setProperty(SDKGlobalConfiguration.ENABLE_S3_SIGV4_SYSTEM_PROPERTY, task.getSignature());

    final TimestampFormatter[] timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions());
    final boolean addUTF8 = task.getAddUTF8();

    final Path path = new Path(buildPath(task, processorIndex));
    final CompressionCodecName codec = CompressionCodecName.valueOf(task.getCompressionCodec());
    final int blockSize = task.getBlockSize();
    final int pageSize = task.getPageSize();
    final Configuration conf = createConfiguration(task.getExtraConfigurations(), task.getConfigFiles());
    final boolean overwrite = task.getOverwrite();

    try {
        EmbulkWriterBuilder builder = new EmbulkWriterBuilder(path, schema, timestampFormatters, addUTF8)
                .withCompressionCodec(codec)
                .withRowGroupSize(blockSize)
                .withPageSize(pageSize)
                .withDictionaryPageSize(pageSize)
                .withConf(conf);

        // Without the overwrite flag the builder's default write mode is kept.
        if (overwrite) {
            builder.withWriteMode(ParquetFileWriter.Mode.OVERWRITE);
        }

        return builder.build();
    } catch (IOException e) {
        // Explicit throw replaces the deprecated Throwables.propagate(e): same RuntimeException
        // wrapping, but the compiler now sees that this path never returns.
        throw new RuntimeException(e);
    }
}
 
Example 21
Source Project: parquet-mr   Source File: PruneColumnsCommand.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Copies a Parquet file to a new path using a schema from which the requested columns are pruned.
 *
 * <p>Positional arguments: 0 = input file, 1 = output file, 2.. = column names to prune. A column
 * name that is not among the schema paths only produces a warning.
 *
 * @param options parsed command line
 * @throws Exception if the footer cannot be read or the output cannot be written
 */
@Override
public void execute(CommandLine options) throws Exception {
  List<String> args = options.getArgList();
  Path source = new Path(args.get(0));
  Path target = new Path(args.get(1));
  List<String> requestedColumns = args.subList(2, args.size());

  Set<ColumnPath> prunePaths = convertToColumnPaths(requestedColumns);

  ParquetMetadata footer = ParquetFileReader.readFooter(conf, source, ParquetMetadataConverter.NO_FILTER);
  FileMetaData fileMetaData = footer.getFileMetaData();
  MessageType schema = fileMetaData.getSchema();

  List<String> schemaPaths = new ArrayList<>();
  getPaths(schema, schemaPaths, null);

  for (String column : requestedColumns) {
    if (schemaPaths.contains(column)) {
      continue;
    }
    LOG.warn("Input column name {} doesn't show up in the schema of file {}", column, source.getName());
  }

  MessageType prunedSchema = pruneColumnsInSchema(schema, prunePaths);
  ParquetFileWriter writer = new ParquetFileWriter(conf, prunedSchema, target, ParquetFileWriter.Mode.CREATE);

  writer.start();
  writer.appendFile(HadoopInputFile.fromPath(source, conf));
  writer.end(fileMetaData.getKeyValueMetaData());
}
 
Example 22
Source Project: parquet-mr   Source File: MergeCommand.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Merges several Parquet files into one output file by appending each input to a single writer.
 *
 * <p>Positional arguments: all but the last are inputs, the last is the output file. Inputs
 * shorter than {@code TOO_SMALL_FILE_THRESHOLD} are still merged but trigger warnings.
 *
 * @param options parsed command line
 * @throws Exception if metadata merging, a file status lookup, or writing fails
 */
@Override
public void execute(CommandLine options) throws Exception {
  // Prepare arguments
  List<String> args = options.getArgList();
  List<Path> inputFiles = getInputFiles(args.subList(0, args.size() - 1));
  Path outputFile = new Path(args.get(args.size() - 1));

  // Merge schema and extraMeta
  FileMetaData mergedMeta = mergedMetadata(inputFiles);
  PrintWriter out = new PrintWriter(Main.out, true);

  // Merge data
  ParquetFileWriter writer = new ParquetFileWriter(conf,
          mergedMeta.getSchema(), outputFile, ParquetFileWriter.Mode.CREATE);
  writer.start();
  boolean tooSmallFilesMerged = false;
  for (Path input: inputFiles) {
    // Look the length up once; it was previously fetched a second time for the warning message.
    long length = input.getFileSystem(conf).getFileStatus(input).getLen();
    if (length < TOO_SMALL_FILE_THRESHOLD) {
      out.format("Warning: file %s is too small, length: %d\n",
        input,
        length);
      tooSmallFilesMerged = true;
    }

    writer.appendFile(HadoopInputFile.fromPath(input, conf));
  }

  if (tooSmallFilesMerged) {
    out.println("Warning: you merged too small files. " +
      "Although the size of the merged file is bigger, it STILL contains small row groups, thus you don't have the advantage of big row groups, " +
      "which usually leads to bad query performance!");
  }
  writer.end(mergedMeta.getKeyValueMetaData());
}
 
Example 23
Source Project: parquet-mr   Source File: PageChecksumDataGenerator.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Generates {@code nRows} rows of test data into {@code outFile}, unless the file already exists.
 *
 * <p>Random values come from a generator seeded with 42, except the {@code binary_field}, which is
 * a fresh random UUID per row.
 *
 * @param outFile        destination file; if it exists, a message is printed and nothing is written
 * @param nRows          number of rows to write
 * @param writeChecksums passed to {@code withPageWriteChecksumEnabled}
 * @param compression    codec passed to the writer builder
 * @throws IOException if the writer cannot be built, a row cannot be written, or close fails
 */
public void generateData(Path outFile, int nRows, boolean writeChecksums,
                         CompressionCodecName compression) throws IOException {
  if (exists(configuration, outFile)) {
    System.out.println("File already exists " + outFile);
    return;
  }

  // try-with-resources closes the writer even when a write fails; previously it leaked on error.
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(outFile)
    .withConf(configuration)
    .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
    .withCompressionCodec(compression)
    .withDictionaryEncoding(true)
    .withType(SCHEMA)
    .withPageWriteChecksumEnabled(writeChecksums)
    .build()) {

    GroupFactory groupFactory = new SimpleGroupFactory(SCHEMA);
    Random rand = new Random(42);
    for (int i = 0; i < nRows; i++) {
      Group group = groupFactory.newGroup();
      group
        .append("long_field", (long) i)
        .append("binary_field", randomUUID().toString())
        .addGroup("group")
        // Force dictionary encoding by performing modulo
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100);
      writer.write(group);
    }
  }
}
 
Example 24
Source Project: parquet-mr   Source File: TestFiltersWithMissingColumns.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates {@code test.parquet} in the temporary folder and fills it with 1000 rows of
 * {@code (id, "data-" + id)} for ids 0 through 999.
 *
 * @throws Exception if the file cannot be created or written
 */
@Before
public void createDataFile() throws Exception {
  File dataFile = temp.newFile("test.parquet");
  this.path = new Path(dataFile.toString());

  MessageType messageType = Types.buildMessage()
      .required(INT64).named("id")
      .required(BINARY).as(UTF8).named("data")
      .named("test");

  SimpleGroupFactory groups = new SimpleGroupFactory(messageType);

  ParquetWriter<Group> out = ExampleParquetWriter.builder(path)
      .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
      .withType(messageType)
      .build();

  try {
    long id = 0;
    while (id < 1000) {
      Group row = groups.newGroup();
      row.add(0, id);
      row.add(1, "data-" + id);
      out.write(row);
      id += 1;
    }
  } finally {
    out.close();
  }
}
 
Example 25
Source Project: presto   Source File: ParquetRecordWriterUtil.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a Parquet record writer that also reports how many bytes have been written.
 *
 * <p>Block and page sizes from the session are stored in {@code conf} first. The underlying
 * {@code ParquetFileWriter} is then obtained reflectively from the created writer so its position
 * can be tracked.
 *
 * @param target     destination path
 * @param conf       job configuration; updated with the block and page size
 * @param properties passed through to the three-argument {@code createParquetWriter}
 * @param session    source of the block and page size settings
 * @return a writer that delegates to the created writer and tracks the file position
 * @throws IOException                  if the underlying writer cannot be created
 * @throws ReflectiveOperationException if a reflective field read fails
 */
public static RecordWriter createParquetWriter(Path target, JobConf conf, Properties properties, ConnectorSession session)
        throws IOException, ReflectiveOperationException
{
    conf.setLong(ParquetOutputFormat.BLOCK_SIZE, getParquetWriterBlockSize(session).toBytes());
    conf.setLong(ParquetOutputFormat.PAGE_SIZE, getParquetWriterPageSize(session).toBytes());

    RecordWriter delegate = createParquetWriter(target, conf, properties);

    // Walk the wrapped writers through the reflective field handles to reach the file writer.
    Object wrapped = REAL_WRITER_FIELD.get(delegate);
    Object internal = INTERNAL_WRITER_FIELD.get(wrapped);
    ParquetFileWriter parquetFileWriter = (ParquetFileWriter) FILE_WRITER_FIELD.get(internal);

    return new ExtendedRecordWriter()
    {
        private long writtenBytes;

        @Override
        public long getWrittenBytes()
        {
            return writtenBytes;
        }

        @Override
        public void write(Writable value)
                throws IOException
        {
            delegate.write(value);
            writtenBytes = parquetFileWriter.getPos();
        }

        @Override
        public void close(boolean abort)
                throws IOException
        {
            delegate.close(abort);
            if (abort) {
                // On abort the last recorded position is kept.
                return;
            }
            writtenBytes = parquetFileWriter.getPos();
        }
    };
}
 
Example 26
Source Project: iceberg   Source File: Parquet.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Selects the write mode: OVERWRITE when {@code enabled} is true, CREATE otherwise.
 *
 * @param enabled whether the write mode should be OVERWRITE
 * @return this builder
 */
public WriteBuilder overwrite(boolean enabled) {
  if (enabled) {
    this.writeMode = ParquetFileWriter.Mode.OVERWRITE;
  } else {
    this.writeMode = ParquetFileWriter.Mode.CREATE;
  }
  return this;
}
 
Example 27
Source Project: parquet-mr   Source File: MergeCommand.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the file metadata produced by {@code ParquetFileWriter.mergeMetadataFiles} for the inputs.
 *
 * @param inputFiles files whose metadata is merged
 * @return the merged file metadata
 * @throws IOException if the metadata of an input cannot be read or merged
 */
private FileMetaData mergedMetadata(List<Path> inputFiles) throws IOException {
  return ParquetFileWriter
      .mergeMetadataFiles(inputFiles, conf)
      .getFileMetaData();
}
 
Example 28
Source Project: parquet-mr   Source File: CompressionConverter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Copies every page of one column chunk from {@code reader} to {@code writer}, passing each page
 * payload through {@code translatePageLoad} with a decompressor for the chunk's current codec and
 * a compressor for {@code codecName}.
 *
 * <p>Pages are read starting at {@code chunk.getStartingPos()} until the number of values seen in
 * data pages reaches {@code chunk.getValueCount()}. Dictionary pages, V1 data pages and V2 data
 * pages are rewritten; any other page type is only logged.
 *
 * @param reader    source file reader; its stream position is moved by this method
 * @param writer    destination writer that receives the rewritten pages
 * @param chunk     metadata of the column chunk to process
 * @param createdBy creator string forwarded to {@code convertStatistics}
 * @param codecName codec used to obtain the compressor for the rewritten pages
 * @throws IOException if reading or writing fails, or if the chunk contains more than one
 *                     dictionary page
 */
private void processChunk(TransParquetFileReader reader, ParquetFileWriter writer, ColumnChunkMetaData chunk,
                          String createdBy, CompressionCodecName codecName) throws IOException {
  CompressionCodecFactory codecFactory = HadoopCodecs.newFactory(0);
  CompressionCodecFactory.BytesInputDecompressor decompressor = codecFactory.getDecompressor(chunk.getCodec());
  CompressionCodecFactory.BytesInputCompressor compressor = codecFactory.getCompressor(codecName);
  ColumnIndex columnIndex = reader.readColumnIndex(chunk);
  OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);

  reader.setStreamPosition(chunk.getStartingPos());
  DictionaryPage dictionaryPage = null;
  long readValues = 0;
  Statistics statistics = null;
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  int pageIndex = 0;
  long totalChunkValues = chunk.getValueCount();
  while (readValues < totalChunkValues) {
    PageHeader pageHeader = reader.readPageHeader();
    int compressedPageSize = pageHeader.getCompressed_page_size();
    byte[] pageLoad;
    switch (pageHeader.type) {
      case DICTIONARY_PAGE:
        if (dictionaryPage != null) {
          throw new IOException("has more than one dictionary page in column chunk");
        }
        DictionaryPageHeader dictPageHeader = pageHeader.dictionary_page_header;
        pageLoad = translatePageLoad(reader, true, compressor, decompressor, compressedPageSize, pageHeader.getUncompressed_page_size());
        // Remember the page so the duplicate-dictionary guard above can actually fire;
        // previously this variable was never assigned and the guard was dead code.
        dictionaryPage = new DictionaryPage(BytesInput.from(pageLoad),
                                            pageHeader.getUncompressed_page_size(),
                                            dictPageHeader.getNum_values(),
                                            converter.getEncoding(dictPageHeader.getEncoding()));
        writer.writeDictionaryPage(dictionaryPage);
        break;
      case DATA_PAGE:
        DataPageHeader headerV1 = pageHeader.data_page_header;
        pageLoad = translatePageLoad(reader, true, compressor, decompressor, compressedPageSize, pageHeader.getUncompressed_page_size());
        statistics = convertStatistics(createdBy, chunk.getPrimitiveType(), headerV1.getStatistics(), columnIndex, pageIndex, converter);
        readValues += headerV1.getNum_values();
        if (offsetIndex != null) {
          // With an offset index available, also pass the page's row count to the writer.
          long rowCount = 1 + offsetIndex.getLastRowIndex(pageIndex, totalChunkValues) - offsetIndex.getFirstRowIndex(pageIndex);
          writer.writeDataPage(toIntWithCheck(headerV1.getNum_values()),
            pageHeader.getUncompressed_page_size(),
            BytesInput.from(pageLoad),
            statistics,
            toIntWithCheck(rowCount),
            converter.getEncoding(headerV1.getRepetition_level_encoding()),
            converter.getEncoding(headerV1.getDefinition_level_encoding()),
            converter.getEncoding(headerV1.getEncoding()));
        } else {
          writer.writeDataPage(toIntWithCheck(headerV1.getNum_values()),
            pageHeader.getUncompressed_page_size(),
            BytesInput.from(pageLoad),
            statistics,
            converter.getEncoding(headerV1.getRepetition_level_encoding()),
            converter.getEncoding(headerV1.getDefinition_level_encoding()),
            converter.getEncoding(headerV1.getEncoding()));
        }
        pageIndex++;
        break;
      case DATA_PAGE_V2:
        DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
        // Repetition and definition levels are read as-is and excluded from the translated payload.
        int rlLength = headerV2.getRepetition_levels_byte_length();
        BytesInput rlLevels = readBlockAllocate(rlLength, reader);
        int dlLength = headerV2.getDefinition_levels_byte_length();
        BytesInput dlLevels = readBlockAllocate(dlLength, reader);
        int payLoadLength = compressedPageSize - rlLength - dlLength;
        int rawDataLength = pageHeader.getUncompressed_page_size() - rlLength - dlLength;
        pageLoad = translatePageLoad(reader, headerV2.is_compressed, compressor, decompressor, payLoadLength, rawDataLength);
        statistics = convertStatistics(createdBy, chunk.getPrimitiveType(), headerV2.getStatistics(), columnIndex, pageIndex, converter);
        readValues += headerV2.getNum_values();
        writer.writeDataPageV2(headerV2.getNum_rows(),
          headerV2.getNum_nulls(),
          headerV2.getNum_values(),
          rlLevels,
          dlLevels,
          converter.getEncoding(headerV2.getEncoding()),
          BytesInput.from(pageLoad),
          rawDataLength,
          statistics);
        pageIndex++;
        break;
      default:
        // NOTE(review): only the header has been consumed here; the page body is not read or
        // skipped in this branch. Confirm whether the stream must be advanced by
        // compressedPageSize before the next header is read.
        LOG.debug("skipping page of type {} of size {}", pageHeader.getType(), compressedPageSize);
        break;
    }
  }
}
 
Example 29
Source Project: parquet-mr   Source File: ConvertCSVCommand.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Converts the single CSV file named in {@code targets} to a Parquet file at {@code outputPath}.
 *
 * <p>The Avro schema is loaded from {@code avroSchemaFile} when one is set. Otherwise it is
 * inferred from the CSV data, using the source file name up to its first {@code '.'} as the
 * record name. Each record read from the CSV is then written with an {@link AvroParquetWriter}
 * configured from this command's options.
 *
 * @return 0 when the conversion completes
 * @throws IOException      if reading the CSV or writing the Parquet file fails
 * @throws RuntimeException wrapping any runtime failure raised while copying records; the
 *                          message carries the number of records written before the failure
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() == 1,
      "CSV path is required.");

  if (header != null) {
    // if a header is given on the command line, don't assume one is in the file
    noHeader = true;
  }

  CSVProperties props = new CSVProperties.Builder()
      .delimiter(delimiter)
      .escape(escape)
      .quote(quote)
      .header(header)
      .hasHeader(!noHeader)
      .linesToSkip(linesToSkip)
      .charset(charsetName)
      .build();

  String source = targets.get(0);

  Schema csvSchema;
  if (avroSchemaFile != null) {
    csvSchema = Schemas.fromAvsc(open(avroSchemaFile));
  } else {
    Set<String> required = ImmutableSet.of();
    if (requiredFields != null) {
      required = ImmutableSet.copyOf(requiredFields);
    }

    // Record name is the file name up to (not including) its first dot.
    String filename = new File(source).getName();
    String recordName;
    if (filename.contains(".")) {
      recordName = filename.substring(0, filename.indexOf("."));
    } else {
      recordName = filename;
    }

    csvSchema = AvroCSV.inferNullableSchema(
        recordName, open(source), props, required);
  }

  // Number of records written so far; reported if the copy fails part-way.
  long count = 0;
  try (AvroCSVReader<Record> reader = new AvroCSVReader<>(
      open(source), props, csvSchema, Record.class, true)) {
    CompressionCodecName codec = Codecs.parquetCodec(compressionCodecName);
    try (ParquetWriter<Record> writer = AvroParquetWriter
        .<Record>builder(qualifiedPath(outputPath))
        .withWriterVersion(v2 ? PARQUET_2_0 : PARQUET_1_0)
        .withWriteMode(overwrite ?
            ParquetFileWriter.Mode.OVERWRITE : ParquetFileWriter.Mode.CREATE)
        .withCompressionCodec(codec)
        .withDictionaryEncoding(true)
        .withDictionaryPageSize(dictionaryPageSize)
        .withPageSize(pageSize)
        .withRowGroupSize(rowGroupSize)
        .withDataModel(GenericData.get())
        .withConf(getConf())
        .withSchema(csvSchema)
        .build()) {
      for (Record record : reader) {
        writer.write(record);
        // Previously never incremented, so failures were always reported as record 0.
        count++;
      }
    } catch (RuntimeException e) {
      throw new RuntimeException("Failed on record " + count, e);
    }
  }

  return 0;
}
 
Example 30
Source Project: nifi   Source File: ParquetConfig.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the configured Parquet writer mode.
 *
 * @return the current value of {@code writerMode}
 */
public ParquetFileWriter.Mode getWriterMode() {
    return this.writerMode;
}