org.apache.parquet.hadoop.ParquetOutputFormat Java Examples

The following examples show how to use org.apache.parquet.hadoop.ParquetOutputFormat. The examples are drawn from open source projects; each one notes its source file, originating project, and license.
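
Before the individual examples, here is a minimal, self-contained driver sketch showing how these pieces typically fit together in a MapReduce job. It assumes parquet-hadoop's example GroupWriteSupport; the class name, schema, output path, and sizes are illustrative only, not taken from any of the projects below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageTypeParser;

public class ParquetJobSetup {
    public static Job configureJob(Configuration conf) throws Exception {
        // GroupWriteSupport needs the write schema in the configuration
        // before the job is submitted.
        GroupWriteSupport.setSchema(
                MessageTypeParser.parseMessageType(
                        "message example { required binary name (UTF8); required int32 age; }"),
                conf);

        Job job = Job.getInstance(conf, "parquet-write-example");
        job.setOutputFormatClass(ParquetOutputFormat.class);

        // ParquetOutputFormat exposes static setters for its main knobs;
        // the constants they write appear throughout the examples below.
        ParquetOutputFormat.setWriteSupportClass(job, GroupWriteSupport.class);
        ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
        ParquetOutputFormat.setBlockSize(job, 128 * 1024 * 1024); // row group size
        ParquetOutputFormat.setPageSize(job, 1024 * 1024);
        ParquetOutputFormat.setEnableDictionary(job, true);

        FileOutputFormat.setOutputPath(job, new Path("/tmp/parquet-out"));
        return job;
    }
}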
Example #1
Source File: CompressionConfigUtil.java    From presto with Apache License 2.0
public static void configureCompression(Configuration config, HiveCompressionCodec compressionCodec)
{
    boolean compression = compressionCodec != HiveCompressionCodec.NONE;
    config.setBoolean(COMPRESSRESULT.varname, compression);
    config.setBoolean("mapred.output.compress", compression);
    config.setBoolean(FileOutputFormat.COMPRESS, compression);

    // For ORC
    OrcConf.COMPRESS.setString(config, compressionCodec.getOrcCompressionKind().name());

    // For RCFile and Text
    if (compressionCodec.getCodec().isPresent()) {
        config.set("mapred.output.compression.codec", compressionCodec.getCodec().get().getName());
        config.set(FileOutputFormat.COMPRESS_CODEC, compressionCodec.getCodec().get().getName());
    }
    else {
        config.unset("mapred.output.compression.codec");
        config.unset(FileOutputFormat.COMPRESS_CODEC);
    }

    // For Parquet
    config.set(ParquetOutputFormat.COMPRESSION, compressionCodec.getParquetCompressionCodec().name());

    // For SequenceFile
    config.set(FileOutputFormat.COMPRESS_TYPE, BLOCK.toString());
}
 
Example #2
Source File: TestColumnIndexes.java    From parquet-mr with Apache License 2.0
public Path write(Path directory) throws IOException {
  Path file = new Path(directory, "testColumnIndexes_" + this + ".parquet");
  Random random = new Random(seed);
  int recordCount = random.nextInt(MAX_TOTAL_ROWS) + 1;
  List<Supplier<?>> generators = buildGenerators(recordCount, random);
  Configuration conf = new Configuration();
  ParquetOutputFormat.setColumnIndexTruncateLength(conf, columnIndexTruncateLength);
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
      .withType(SCHEMA)
      .withPageRowCountLimit(pageRowCountLimit)
      .withConf(conf)
      .build()) {
    for (int i = 0; i < recordCount; i++) {
      writer.write(createGroup(generators, random));
    }
  }
  return file;
}
 
Example #3
Source File: ParquetAsTextOutputFormat.java    From iow-hadoop-streaming with Apache License 2.0
private static CompressionCodecName getCodec(JobConf conf) {
    CompressionCodecName codec;

    if (ParquetOutputFormat.isCompressionSet(conf)) { // explicit parquet config
        codec = ParquetOutputFormat.getCompression(conf);
    } else if (getCompressOutput(conf)) { // from hadoop config
        // find the right codec
        Class<?> codecClass = getOutputCompressorClass(conf, DefaultCodec.class);
        LOG.info("Compression set through hadoop codec: " + codecClass.getName());
        codec = CompressionCodecName.fromCompressionCodec(codecClass);
    } else {
        codec = CompressionCodecName.UNCOMPRESSED;
    }

    LOG.info("Compression: " + codec.name());
    return codec;
}
 
Example #4
Source File: PentahoApacheOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
public PentahoApacheOutputFormat( NamedCluster namedCluster ) {
  logger.info( "We are initializing parquet output format" );

  inClassloader( () -> {
    ConfigurationProxy conf = new ConfigurationProxy();

    // if the named cluster is not defined, there are no cluster resource configs to add
    if ( namedCluster != null ) {
      BiConsumer<InputStream, String> consumer = ( is, filename ) -> conf.addResource( is, filename );
      ShimConfigsLoader.addConfigsAsResources( namedCluster, consumer );
    }

    job = Job.getInstance( conf );

    job.getConfiguration().set( ParquetOutputFormat.ENABLE_JOB_SUMMARY, "false" );
    ParquetOutputFormat.setEnableDictionary( job, false );
  } );
}
 
Example #5
Source File: PentahoApacheOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public void setVersion( VERSION version ) throws Exception {
  inClassloader( () -> {
    ParquetProperties.WriterVersion writerVersion;
    switch ( version ) {
      case VERSION_1_0:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_1_0;
        break;
      case VERSION_2_0:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_2_0;
        break;
      default:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_2_0;
        break;
    }
    job.getConfiguration().set( ParquetOutputFormat.WRITER_VERSION, writerVersion.toString() );
  } );
}
 
Example #6
Source File: PentahoApacheOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public void setCompression( COMPRESSION comp ) throws Exception {
  inClassloader( () -> {
    CompressionCodecName codec;
    switch ( comp ) {
      case SNAPPY:
        codec = CompressionCodecName.SNAPPY;
        break;
      case GZIP:
        codec = CompressionCodecName.GZIP;
        break;
      case LZO:
        codec = CompressionCodecName.LZO;
        break;
      default:
        codec = CompressionCodecName.UNCOMPRESSED;
        break;
    }
    ParquetOutputFormat.setCompression( job, codec );
  } );
}
 
Example #7
Source File: IcebergFileWriterFactory.java    From presto with Apache License 2.0
private IcebergFileWriter createParquetWriter(
        Path outputPath,
        Schema icebergSchema,
        List<IcebergColumnHandle> columns,
        JobConf jobConf,
        ConnectorSession session)
{
    Properties properties = new Properties();
    properties.setProperty(IOConstants.COLUMNS, columns.stream()
            .map(IcebergColumnHandle::getName)
            .collect(joining(",")));
    properties.setProperty(IOConstants.COLUMNS_TYPES, columns.stream()
            .map(column -> toHiveType(column.getType()).getHiveTypeName().toString())
            .collect(joining(":")));

    setParquetSchema(jobConf, convert(icebergSchema, "table"));
    jobConf.set(ParquetOutputFormat.COMPRESSION, getCompressionCodec(session).getParquetCompressionCodec().name());

    return new IcebergRecordFileWriter(
            outputPath,
            columns.stream()
                    .map(IcebergColumnHandle::getName)
                    .collect(toImmutableList()),
            fromHiveStorageFormat(HiveStorageFormat.PARQUET),
            properties,
            HiveStorageFormat.PARQUET.getEstimatedWriterSystemMemoryUsage(),
            jobConf,
            typeManager,
            session);
}
 
Example #8
Source File: MapredParquetOutputFormat.java    From parquet-mr with Apache License 2.0
protected ParquetRecordWriterWrapper getParquerRecordWriterWrapper(
    ParquetOutputFormat<ArrayWritable> realOutputFormat,
    JobConf jobConf,
    String finalOutPath,
    Progressable progress
    ) throws IOException {
  return new ParquetRecordWriterWrapper(realOutputFormat, jobConf, finalOutPath.toString(), progress);
}
 
Example #9
Source File: ParquetTupleScheme.java    From parquet-mr with Apache License 2.0
@SuppressWarnings("rawtypes")
@Override
public void sinkConfInit(FlowProcess<? extends JobConf> fp,
        Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf);
  jobConf.set(TupleWriteSupport.PARQUET_CASCADING_SCHEMA, parquetSchema);
  ParquetOutputFormat.setWriteSupportClass(jobConf, TupleWriteSupport.class);
}
 
Example #10
Source File: ParquetScroogeScheme.java    From parquet-mr with Apache License 2.0
@Override
public void sinkConfInit(FlowProcess<JobConf> fp,
    Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf);
  ParquetOutputFormat.setWriteSupportClass(jobConf, ScroogeWriteSupport.class);
  ScroogeWriteSupport.setScroogeClass(jobConf, this.config.getKlass());
}
 
Example #11
Source File: CodecConfigTest.java    From parquet-mr with Apache License 2.0
public void shouldUseParquetFlagToSetCodec(String codecNameStr, CompressionCodecName expectedCodec) throws IOException {

    //Test mapreduce API
    Job job = new Job();
    Configuration conf = job.getConfiguration();
    conf.set(ParquetOutputFormat.COMPRESSION, codecNameStr);
    TaskAttemptContext task = ContextUtil.newTaskAttemptContext(conf, new TaskAttemptID(new TaskID(new JobID("test", 1), false, 1), 1));
    Assert.assertEquals(CodecConfig.from(task).getCodec(), expectedCodec);

    //Test mapred API
    JobConf jobConf = new JobConf();
    jobConf.set(ParquetOutputFormat.COMPRESSION, codecNameStr);
    Assert.assertEquals(CodecConfig.from(jobConf).getCodec(), expectedCodec);
}
 
Example #12
Source File: DeprecatedParquetOutputFormat.java    From parquet-mr with Apache License 2.0
public RecordWriterWrapper(ParquetOutputFormat<V> realOutputFormat,
    FileSystem fs, JobConf conf, String name, Progressable progress) throws IOException {

  CompressionCodecName codec = getCodec(conf);
  String extension = codec.getExtension() + ".parquet";
  Path file = getDefaultWorkFile(conf, name, extension);

  try {
    realWriter = (ParquetRecordWriter<V>) realOutputFormat.getRecordWriter(conf, file, codec);
  } catch (InterruptedException e) {
    // clear the interrupted status and surface the failure as an IOException
    Thread.interrupted();
    throw new IOException(e);
  }
}
 
Example #13
Source File: ParquetTupleScheme.java    From parquet-mr with Apache License 2.0
@SuppressWarnings("rawtypes")
@Override
public void sinkConfInit(FlowProcess<JobConf> fp,
        Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf);
  jobConf.set(TupleWriteSupport.PARQUET_CASCADING_SCHEMA, parquetSchema);
  ParquetOutputFormat.setWriteSupportClass(jobConf, TupleWriteSupport.class);
}
 
Example #14
Source File: ParquetRowDataWriterTest.java    From flink with Apache License 2.0
@Test
public void testCompression() throws IOException {
	Configuration conf = new Configuration();
	conf.set(ParquetOutputFormat.COMPRESSION, "GZIP");
	innerTest(conf, true);
	innerTest(conf, false);
}
 
Example #15
Source File: ParquetAppender.java    From tajo with Apache License 2.0
/**
 * Creates a new ParquetAppender.
 *
 * @param conf Configuration properties.
 * @param taskAttemptId The task attempt id.
 * @param schema The table schema.
 * @param meta The table metadata.
 * @param workDir The path of the Parquet file to write to.
 */
public ParquetAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema, TableMeta meta,
                       Path workDir) throws IOException {
  super(conf, taskAttemptId, schema, meta, workDir);
  this.blockSize = Integer.parseInt(
      meta.getProperty(ParquetOutputFormat.BLOCK_SIZE, StorageConstants.PARQUET_DEFAULT_BLOCK_SIZE));
  this.pageSize = Integer.parseInt(
      meta.getProperty(ParquetOutputFormat.PAGE_SIZE, StorageConstants.PARQUET_DEFAULT_PAGE_SIZE));
  this.compressionCodecName = CompressionCodecName.fromConf(
      meta.getProperty(ParquetOutputFormat.COMPRESSION, StorageConstants.PARQUET_DEFAULT_COMPRESSION_CODEC_NAME));
  this.enableDictionary = Boolean.parseBoolean(
      meta.getProperty(ParquetOutputFormat.ENABLE_DICTIONARY, StorageConstants.PARQUET_DEFAULT_IS_DICTIONARY_ENABLED));
  this.validating = Boolean.parseBoolean(
      meta.getProperty(ParquetOutputFormat.VALIDATION, StorageConstants.PARQUET_DEFAULT_IS_VALIDATION_ENABLED));
}
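
The ParquetOutputFormat constants read from the table metadata above are ordinary configuration keys, so the same settings can also be applied directly to a Hadoop Configuration. A short sketch (the strings in the comments are the key values these constants resolve to in parquet-hadoop; the numbers are illustrative defaults):

Configuration conf = new Configuration();
conf.setInt(ParquetOutputFormat.BLOCK_SIZE, 134217728);        // "parquet.block.size"
conf.setInt(ParquetOutputFormat.PAGE_SIZE, 1048576);           // "parquet.page.size"
conf.set(ParquetOutputFormat.COMPRESSION, "SNAPPY");           // "parquet.compression"
conf.setBoolean(ParquetOutputFormat.ENABLE_DICTIONARY, true);  // "parquet.enable.dictionary"
conf.setBoolean(ParquetOutputFormat.VALIDATION, false);        // "parquet.validation"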
 
Example #16
Source File: ParquetHdfsFileSink.java    From components with Apache License 2.0
@Override
protected void configure(Job job, KV<Void, IndexedRecord> sample) {
    super.configure(job, sample);
    IndexedRecord record = (IndexedRecord) sample.getValue();
    AvroWriteSupport.setSchema(job.getConfiguration(), record.getSchema());
    ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
}
 
Example #17
Source File: ParquetFileWriterFactory.java    From presto with Apache License 2.0
private static CompressionCodecName getCompression(JobConf configuration)
{
    String compressionName = configuration.get(ParquetOutputFormat.COMPRESSION);
    if (compressionName == null) {
        return CompressionCodecName.GZIP;
    }
    return CompressionCodecName.valueOf(compressionName);
}
 
Example #18
Source File: ParquetRecordWriterUtil.java    From presto with Apache License 2.0
private static RecordWriter createParquetWriter(Path target, JobConf conf, Properties properties)
        throws IOException
{
    if (conf.get(DataWritableWriteSupport.PARQUET_HIVE_SCHEMA) == null) {
        List<String> columnNames = Splitter.on(',').splitToList(properties.getProperty(IOConstants.COLUMNS));
        List<TypeInfo> columnTypes = getTypeInfosFromTypeString(properties.getProperty(IOConstants.COLUMNS_TYPES));
        MessageType schema = HiveSchemaConverter.convert(columnNames, columnTypes);
        setParquetSchema(conf, schema);
    }

    ParquetOutputFormat<ParquetHiveRecord> outputFormat = new ParquetOutputFormat<>(new DataWritableWriteSupport());

    return new ParquetRecordWriterWrapper(outputFormat, conf, target.toString(), Reporter.NULL, properties);
}
 
Example #19
Source File: PentahoApacheOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public void enableDictionary( boolean useDictionary ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setEnableDictionary( job, useDictionary ) );
}
 
Example #20
Source File: PentahoApacheOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public void setRowGroupSize( int size ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setBlockSize( job, size ) );
}
 
Example #21
Source File: ParquetAsTextOutputFormat.java    From iow-hadoop-streaming with Apache License 2.0
public static void setEnableDictionary(Configuration configuration, boolean enableDictionary) {
    configuration.setBoolean(ParquetOutputFormat.ENABLE_DICTIONARY, enableDictionary);
}
 
Example #22
Source File: ParquetAsTextOutputFormat.java    From iow-hadoop-streaming with Apache License 2.0
public static void setCompression(Configuration configuration, CompressionCodecName compression) {
    configuration.set(ParquetOutputFormat.COMPRESSION, compression.name());
}
 
Example #23
Source File: PentahoApacheOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public void setDataPageSize( int size ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setPageSize( job, size ) );
}
 
Example #24
Source File: ParquetAsTextOutputFormat.java    From iow-hadoop-streaming with Apache License 2.0
public static void setPageSize(Configuration configuration, int pageSize) {
    configuration.setInt(ParquetOutputFormat.PAGE_SIZE, pageSize);
}
 
Example #25
Source File: ParquetAsTextOutputFormat.java    From iow-hadoop-streaming with Apache License 2.0
public static void setBlockSize(Configuration configuration, int blockSize) {
    configuration.setInt(ParquetOutputFormat.BLOCK_SIZE, blockSize);
}
 
Example #26
Source File: ParquetAsTextOutputFormat.java    From iow-hadoop-streaming with Apache License 2.0
public static void setWriteSupportClass(Configuration configuration,  Class<?> writeSupportClass) {
    configuration.set(ParquetOutputFormat.WRITE_SUPPORT_CLASS, writeSupportClass.getName());
}
 
Example #27
Source File: PentahoApacheOutputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public void setDictionaryPageSize( int size ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setDictionaryPageSize( job, size ) );
}
 
Example #28
Source File: ParquetStorer.java    From parquet-mr with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public OutputFormat<Void, Tuple> getOutputFormat() throws IOException {
  Schema pigSchema = getSchema();
  return new ParquetOutputFormat<Tuple>(new TupleWriteSupport(pigSchema));
}
 
Example #29
Source File: TestMapredParquetOutputFormat.java    From parquet-mr with Apache License 2.0
@SuppressWarnings("unchecked")
@Test
public void testConstructorWithFormat() {
  new MapredParquetOutputFormat((ParquetOutputFormat<ArrayWritable>) mock(ParquetOutputFormat.class));
}
 
Example #30
Source File: MapredParquetOutputFormat.java    From parquet-mr with Apache License 2.0
public MapredParquetOutputFormat(final OutputFormat<Void, ArrayWritable> mapreduceOutputFormat) {
  realOutputFormat = (ParquetOutputFormat<ArrayWritable>) mapreduceOutputFormat;
}