parquet.hadoop.metadata.CompressionCodecName Java Examples

The following examples show how to use parquet.hadoop.metadata.CompressionCodecName. Each example is taken from an open-source project; the source file and license are noted above the code.
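Before the project examples, here is a minimal, self-contained sketch of the CompressionCodecName API itself: the enum constants, resolving a codec from a configuration string with fromConf, and querying a codec's file extension and Hadoop codec class. It assumes the pre-org.apache parquet-mr 1.x package named in the title; the class and method names introduced in the sketch (CompressionCodecNameDemo, main) are illustrative only.

import parquet.hadoop.metadata.CompressionCodecName;

public class CompressionCodecNameDemo {
    public static void main(String[] args) {
        // The enum constants name the codecs Parquet supports out of the box.
        CompressionCodecName snappy = CompressionCodecName.SNAPPY;
        CompressionCodecName none = CompressionCodecName.UNCOMPRESSED;

        // fromConf(...) maps a configuration string (e.g. "GZIP") to a codec;
        // a null value resolves to UNCOMPRESSED.
        CompressionCodecName gzip = CompressionCodecName.fromConf("GZIP");

        // Each codec knows the file-name suffix Parquet uses for it and the Hadoop
        // CompressionCodec class that implements it (null for UNCOMPRESSED).
        System.out.println(snappy.getExtension());                       // e.g. ".snappy"
        System.out.println(gzip.getHadoopCompressionCodecClassName());   // e.g. a GzipCodec class name
        System.out.println(none == CompressionCodecName.fromConf(null)); // true
    }
}

Most of the examples that follow simply pass one of these constants (typically SNAPPY) to a writer or output format, or resolve one from a configuration string with fromConf.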
Example #1
Source File: ExaParquetWriterImpl.java    From hadoop-etl-udfs with MIT License
private ExaParquetWriterImpl(final MessageType schema,
                             final int numColumns,
                             final Configuration conf,
                             final Path path,
                             final String compressionType,
                             final ExaIterator exa,
                             final int firstColumnIndex,
                             final List<Integer> dynamicPartitionExaColNums) throws Exception {
    System.out.println("Path: " + path.toString());
    System.out.println("Parquet schema:\n" + schema);

    TupleWriteSupport.setSchema(schema, conf);
    this.writer = new ParquetWriter<>(path,
            new TupleWriteSupport(),
            CompressionCodecName.fromConf(compressionType),
            ParquetWriter.DEFAULT_BLOCK_SIZE,
            ParquetWriter.DEFAULT_PAGE_SIZE,
            ParquetWriter.DEFAULT_PAGE_SIZE,
            ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
            ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
            conf);

    // Create Tuple object with ExaIterator reference.
    this.row = new Tuple(exa, numColumns, firstColumnIndex, dynamicPartitionExaColNums);
}
 
Example #2
Source File: ParquetAvroExample.java    From parquet-flinktacular with Apache License 2.0
public static void writeAvro(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException {
	// Set up the Hadoop job
	Job job = Job.getInstance();

	// Set up Hadoop Output Format
	HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new AvroParquetOutputFormat(), job);

	FileOutputFormat.setOutputPath(job, new Path(outputPath));

	AvroParquetOutputFormat.setSchema(job, Person.getClassSchema());
	ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
	ParquetOutputFormat.setEnableDictionary(job, true);

	// Output & Execute
	data.output(hadoopOutputFormat);
}
 
Example #3
Source File: ParquetThriftExample.java    From parquet-flinktacular with Apache License 2.0
public static void writeThrift(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException {
	// Set up the Hadoop job
	Job job = Job.getInstance();

	// Set up Hadoop Output Format
	HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);

	FileOutputFormat.setOutputPath(job, new Path(outputPath));

	ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
	ParquetOutputFormat.setEnableDictionary(job, true);

	ParquetThriftOutputFormat.setThriftClass(job, Person.class);

	// Output & Execute
	data.output(hadoopOutputFormat);
}
 
Example #4
Source File: PentahoTwitterOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public void setCompression( COMPRESSION comp ) throws Exception {
  inClassloader( () -> {
    CompressionCodecName codec;
    switch ( comp ) {
      case SNAPPY:
        codec = CompressionCodecName.SNAPPY;
        break;
      case GZIP:
        codec = CompressionCodecName.GZIP;
        break;
      case LZO:
        codec = CompressionCodecName.LZO;
        break;
      default:
        codec = CompressionCodecName.UNCOMPRESSED;
        break;
    }
    ParquetOutputFormat.setCompression( job, codec );
  } );
}
 
Example #5
Source File: ParquetProtobufExample.java    From parquet-flinktacular with Apache License 2.0
public static void writeProtobuf(DataSet<Tuple2<Void, Person>> data, String outputPath) throws IOException {
	Job job = Job.getInstance();

	// Set up Hadoop Output Format
	HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ProtoParquetOutputFormat(), job);

	FileOutputFormat.setOutputPath(job, new Path(outputPath));

	ProtoParquetOutputFormat.setProtobufClass(job, Person.class);
	ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
	ParquetOutputFormat.setEnableDictionary(job, true);

	// Output & Execute
	data.output(hadoopOutputFormat);
}
 
Example #6
Source File: CSVToParquet.java    From parquet-flinktacular with Apache License 2.0
private static void createLineitems(ExecutionEnvironment env) throws IOException {
    DataSet<Tuple2<Void,LineitemTable>> lineitems = getLineitemDataSet(env).map(new LineitemToParquet());

    Job job = Job.getInstance();
    
    HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);

    ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/lineitems"));
    ParquetThriftOutputFormat.setThriftClass(job, LineitemTable.class);
    ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(job, true);
    ParquetThriftOutputFormat.setEnableDictionary(job, true);

    lineitems.output(hadoopOutputFormat);
}
 
Example #7
Source File: CSVToParquet.java    From parquet-flinktacular with Apache License 2.0
private static void createOrders(ExecutionEnvironment env) throws IOException {
    DataSet<Tuple2<Void,OrderTable>> orders = getOrdersDataSet(env).map(new OrdersToParquet());

    Job job = Job.getInstance();

    HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);

    ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/orders"));
    ParquetThriftOutputFormat.setThriftClass(job, OrderTable.class);
    ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(job, true);
    ParquetThriftOutputFormat.setEnableDictionary(job, true);

    orders.output(hadoopOutputFormat);
}
 
Example #8
Source File: CSVToParquet.java    From parquet-flinktacular with Apache License 2.0
private static void createCustomers(ExecutionEnvironment env) throws IOException {
    DataSet<Tuple2<Void,CustomerTable>> customers = getCustomerDataSet(env).map(new CustomerToParquet());

    Job job = Job.getInstance();

    HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);

    ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/cust"));
    ParquetThriftOutputFormat.setThriftClass(job, CustomerTable.class);
    ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetThriftOutputFormat.setCompressOutput(job, true);
    ParquetThriftOutputFormat.setEnableDictionary(job, true);

    customers.output(hadoopOutputFormat);
}
 
Example #9
Source File: CSVToParquet.java    From parquet-flinktacular with Apache License 2.0
private static void createDateDim(ExecutionEnvironment env) throws IOException {
	DataSet<Tuple2<Void, DateDimTable>> datedims = getDateDimDataSet(env).map(new DateDimToParquet());

	Job job = Job.getInstance();

	HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);

	ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/datedim"));
	ParquetThriftOutputFormat.setThriftClass(job, DateDimTable.class);
	ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
	ParquetThriftOutputFormat.setCompressOutput(job, true);
	ParquetThriftOutputFormat.setEnableDictionary(job, false);

	datedims.output(hadoopOutputFormat);
}
 
Example #10
Source File: CSVToParquet.java    From parquet-flinktacular with Apache License 2.0
private static void createItem(ExecutionEnvironment env) throws IOException {
	DataSet<Tuple2<Void, ItemTable>> items = getItemDataSet(env).map(new ItemToParquet());

	Job job = Job.getInstance();

	HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);

	ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/item"));
	ParquetThriftOutputFormat.setThriftClass(job, ItemTable.class);
	ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
	ParquetThriftOutputFormat.setCompressOutput(job, true);
	ParquetThriftOutputFormat.setEnableDictionary(job, false);

	items.output(hadoopOutputFormat);
}
 
Example #11
Source File: CSVToParquet.java    From parquet-flinktacular with Apache License 2.0
private static void createStoreSales(ExecutionEnvironment env) throws IOException {
	DataSet<Tuple2<Void, StoreSalesTable>> storeSales = getStoreSalesDataSet(env).map(new StoreSalesToParquet());

	Job job = Job.getInstance();

	HadoopOutputFormat hadoopOutputFormat = new HadoopOutputFormat(new ParquetThriftOutputFormat(), job);

	ParquetThriftOutputFormat.setOutputPath(job, new Path(outputPath + "/storesales"));
	ParquetThriftOutputFormat.setThriftClass(job, StoreSalesTable.class);
	ParquetThriftOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
	ParquetThriftOutputFormat.setCompressOutput(job, true);
	ParquetThriftOutputFormat.setEnableDictionary(job, false);

	storeSales.output(hadoopOutputFormat);
}
 
Example #12
Source File: ParquetAvroStockWriter.java    From hiped2 with Apache License 2.0
/**
 * Write the file.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.IOFileOpts.values()).build();
  int result = cli.runCmd();

  if (result != 0) {
    return result;
  }

  File inputFile = new File(cli.getArgValueAsString(CliCommonOpts.IOFileOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.IOFileOpts.OUTPUT));

  AvroParquetWriter<Stock> writer =
      new AvroParquetWriter<Stock>(outputPath, Stock.SCHEMA$,
          CompressionCodecName.SNAPPY,
          ParquetWriter.DEFAULT_BLOCK_SIZE,
          ParquetWriter.DEFAULT_PAGE_SIZE,
          true);

  for (Stock stock : AvroStockUtils.fromCsvFile(inputFile)) {
    writer.write(stock);
  }

  writer.close();

  return 0;
}
 
Example #13
Source File: ParquetMetadataReader.java    From paraflow with Apache License 2.0
public static ParquetMetadata readFooter(FileSystem fileSystem, Path file)
        throws IOException
{
    FileStatus fileStatus = fileSystem.getFileStatus(file);
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        // Parquet File Layout:
        //
        // MAGIC
        // variable: Data
        // variable: Metadata
        // 4 bytes: MetadataLength
        // MAGIC

        long length = fileStatus.getLen();
        validateParquet(length >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
        long metadataLengthIndex = length - PARQUET_METADATA_LENGTH - MAGIC.length;

        inputStream.seek(metadataLengthIndex);
        int metadataLength = readIntLittleEndian(inputStream);

        byte[] magic = new byte[MAGIC.length];
        inputStream.readFully(magic);
        validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));

        long metadataIndex = metadataLengthIndex - metadataLength;
        validateParquet(
                metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
                "Corrupted Parquet file: %s metadata index: %s out of range",
                file,
                metadataIndex);
        inputStream.seek(metadataIndex);
        FileMetaData fileMetaData = readFileMetaData(inputStream);
        List<SchemaElement> schema = fileMetaData.getSchema();
        validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);

        MessageType messageType = readParquetSchema(schema);
        List<BlockMetaData> blocks = new ArrayList<>();
        List<RowGroup> rowGroups = fileMetaData.getRow_groups();
        if (rowGroups != null) {
            for (RowGroup rowGroup : rowGroups) {
                BlockMetaData blockMetaData = new BlockMetaData();
                blockMetaData.setRowCount(rowGroup.getNum_rows());
                blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
                List<ColumnChunk> columns = rowGroup.getColumns();
                validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
                String filePath = columns.get(0).getFile_path();
                for (ColumnChunk columnChunk : columns) {
                    validateParquet(
                            (filePath == null && columnChunk.getFile_path() == null)
                                    || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                            "all column chunks of the same row group must be in the same file");
                    ColumnMetaData metaData = columnChunk.meta_data;
                    String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
                    ColumnPath columnPath = ColumnPath.get(path);
                    ColumnChunkMetaData column = ColumnChunkMetaData.get(
                            columnPath,
                            messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(),
                            CompressionCodecName.fromParquet(metaData.codec),
                            readEncodings(metaData.encodings),
                            readStats(metaData.statistics, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName()),
                            metaData.data_page_offset,
                            metaData.dictionary_page_offset,
                            metaData.num_values,
                            metaData.total_compressed_size,
                            metaData.total_uncompressed_size);
                    blockMetaData.addColumn(column);
                }
                blockMetaData.setPath(filePath);
                blocks.add(blockMetaData);
            }
        }

        Map<String, String> keyValueMetaData = new HashMap<>();
        List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
        if (keyValueList != null) {
            for (KeyValue keyValue : keyValueList) {
                keyValueMetaData.put(keyValue.key, keyValue.value);
            }
        }
        return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    }
}
 
Example #14
Source File: ParquetDataWriterBuilder.java    From incubator-gobblin with Apache License 2.0
/**
 * Build a version-specific {@link ParquetWriter} for the given {@link ParquetWriterConfiguration}.
 *
 * @param writerConfiguration the configuration describing the staging file, codec, sizes, and record format
 * @return a {@link ParquetWriterShim} that delegates write and close calls to the version-specific writer
 * @throws IOException if the underlying writer cannot be created
 */
@Override
public ParquetWriterShim getVersionSpecificWriter(ParquetWriterConfiguration writerConfiguration)
    throws IOException {

  CompressionCodecName codecName = CompressionCodecName.fromConf(writerConfiguration.getCodecName());
  ParquetProperties.WriterVersion writerVersion = ParquetProperties.WriterVersion
      .fromString(writerConfiguration.getWriterVersion());

  Configuration conf = new Configuration();
  ParquetWriter versionSpecificWriter = null;
  switch (writerConfiguration.getRecordFormat()) {
    case GROUP: {
      GroupWriteSupport.setSchema((MessageType) this.schema, conf);
      WriteSupport support = new GroupWriteSupport();
      versionSpecificWriter = new ParquetWriter<Group>(
          writerConfiguration.getAbsoluteStagingFile(),
          support,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.getDictPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate(),
          writerVersion,
          conf);
      break;
    }
    case AVRO:  {
      versionSpecificWriter = new AvroParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Schema) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          conf);
      break;
    }
    case PROTOBUF: {
      versionSpecificWriter = new ProtoParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Class<? extends Message>) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate());
      break;
    }
    default: throw new RuntimeException("Record format not supported");
  }
  ParquetWriter finalVersionSpecificWriter = versionSpecificWriter;

  return new ParquetWriterShim() {
    @Override
    public void write(Object record)
        throws IOException {
      finalVersionSpecificWriter.write(record);
    }

    @Override
    public void close()
        throws IOException {
      finalVersionSpecificWriter.close();
    }
  };
}
 
Example #15
Source File: TestReadWriteParquet.java    From parquet-examples with Apache License 2.0
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        LOG.error("Usage: " + getClass().getName() + " INPUTFILE OUTPUTFILE [compression]");
        return 1;
    }
    String inputFile = args[0];
    String outputFile = args[1];
    String compression = (args.length > 2) ? args[2] : "none";

    Path parquetFilePath = null;
    // Find a file in case a directory was passed
    RemoteIterator<LocatedFileStatus> it = FileSystem.get(getConf()).listFiles(new Path(inputFile), true);
    while (it.hasNext()) {
        FileStatus fs = it.next();
        if (fs.isFile()) {
            parquetFilePath = fs.getPath();
            break;
        }
    }
    if (parquetFilePath == null) {
        LOG.error("No file found for " + inputFile);
        return 1;
    }
    LOG.info("Getting schema from " + parquetFilePath);
    ParquetMetadata readFooter = ParquetFileReader.readFooter(getConf(), parquetFilePath);
    MessageType schema = readFooter.getFileMetaData().getSchema();
    LOG.info(schema);
    GroupWriteSupport.setSchema(schema, getConf());

    Job job = new Job(getConf());
    job.setJarByClass(getClass());
    job.setJobName(getClass().getName());
    job.setMapperClass(ReadRequestMap.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(ExampleInputFormat.class);
    job.setOutputFormatClass(ExampleOutputFormat.class);

    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    if (compression.equalsIgnoreCase("snappy")) {
        codec = CompressionCodecName.SNAPPY;
    } else if (compression.equalsIgnoreCase("gzip")) {
        codec = CompressionCodecName.GZIP;
    }
    LOG.info("Output compression: " + codec);
    ExampleOutputFormat.setCompression(job, codec);

    FileInputFormat.setInputPaths(job, new Path(inputFile));
    FileOutputFormat.setOutputPath(job, new Path(outputFile));

    job.waitForCompletion(true);

    return 0;
}