Java Code Examples for org.apache.parquet.hadoop.ParquetOutputFormat

The following examples show how to use org.apache.parquet.hadoop.ParquetOutputFormat. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: presto   Source File: CompressionConfigUtil.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Writes the compression settings implied by {@code compressionCodec} into
 * {@code config} for each output format handled below (ORC, RCFile/Text,
 * Parquet and SequenceFile).
 *
 * @param config the configuration that is updated in place
 * @param compressionCodec the codec whose per-format settings are applied
 */
public static void configureCompression(Configuration config, HiveCompressionCodec compressionCodec)
{
    // Every codec except NONE turns the generic "compress output" switches on.
    boolean compressed = compressionCodec != HiveCompressionCodec.NONE;
    config.setBoolean(COMPRESSRESULT.varname, compressed);
    config.setBoolean("mapred.output.compress", compressed);
    config.setBoolean(FileOutputFormat.COMPRESS, compressed);

    // ORC
    String orcKind = compressionCodec.getOrcCompressionKind().name();
    OrcConf.COMPRESS.setString(config, orcKind);

    // RCFile and Text: remove the codec entries when the codec has no codec class
    if (!compressionCodec.getCodec().isPresent()) {
        config.unset("mapred.output.compression.codec");
        config.unset(FileOutputFormat.COMPRESS_CODEC);
    }
    else {
        String codecClassName = compressionCodec.getCodec().get().getName();
        config.set("mapred.output.compression.codec", codecClassName);
        config.set(FileOutputFormat.COMPRESS_CODEC, codecClassName);
    }

    // Parquet
    String parquetCodec = compressionCodec.getParquetCompressionCodec().name();
    config.set(ParquetOutputFormat.COMPRESSION, parquetCodec);

    // SequenceFile
    config.set(FileOutputFormat.COMPRESS_TYPE, BLOCK.toString());
}
 
Example 2
Source Project: parquet-mr   Source File: TestColumnIndexes.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Writes a Parquet file of generated groups into {@code directory} and
 * returns its path.
 *
 * <p>The record count and the generated values are drawn from a
 * {@link Random} created from {@code seed}.
 *
 * @param directory the directory the file is created in
 * @return the path of the written file
 * @throws IOException if writing fails
 */
public Path write(Path directory) throws IOException {
  Path file = new Path(directory, "testColumnIndexes_" + this + ".parquet");

  // The order of the random draws is significant: count first, then generators.
  Random random = new Random(seed);
  int recordCount = 1 + random.nextInt(MAX_TOTAL_ROWS);
  List<Supplier<?>> generators = buildGenerators(recordCount, random);

  Configuration conf = new Configuration();
  ParquetOutputFormat.setColumnIndexTruncateLength(conf, columnIndexTruncateLength);

  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
      .withType(SCHEMA)
      .withPageRowCountLimit(pageRowCountLimit)
      .withConf(conf)
      .build()) {
    int written = 0;
    while (written < recordCount) {
      writer.write(createGroup(generators, random));
      written++;
    }
  }
  return file;
}
 
Example 3
/**
 * Resolves the compression codec to use for the given job configuration.
 *
 * <p>An explicit Parquet compression setting takes precedence; otherwise the
 * Hadoop output-compression setting is consulted; if neither is set the
 * result is {@code UNCOMPRESSED}. The chosen codec is logged.
 *
 * @param conf the job configuration to inspect
 * @return the resolved codec
 */
private static CompressionCodecName getCodec(JobConf conf) {
    // Fallback used when neither Parquet nor Hadoop compression is configured.
    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;

    if (ParquetOutputFormat.isCompressionSet(conf)) {
        // explicit parquet config
        codec = ParquetOutputFormat.getCompression(conf);
    } else if (getCompressOutput(conf)) {
        // from hadoop config: map the configured compressor class to a Parquet codec
        Class<?> hadoopCodecClass = getOutputCompressorClass(conf, DefaultCodec.class);
        LOG.info("Compression set through hadoop codec: " + hadoopCodecClass.getName());
        codec = CompressionCodecName.fromCompressionCodec(hadoopCodecClass);
    }

    LOG.info("Compression: " + codec.name());
    return codec;
}
 
Example 4
/**
 * Creates the output format and its backing {@code job}.
 *
 * <p>The job's configuration is built inside {@code inClassloader}; when a
 * named cluster is supplied its configs are added as resources first. The
 * Parquet job summary and dictionary encoding are both switched off.
 *
 * @param namedCluster the cluster whose configs are loaded, or {@code null} to skip loading
 */
public PentahoApacheOutputFormat( NamedCluster namedCluster ) {
  logger.info( "We are initializing parquet output format" );

  inClassloader( () -> {
    ConfigurationProxy conf = new ConfigurationProxy();

    // Cluster resource configs are only added when a named cluster is defined.
    if ( namedCluster != null ) {
      BiConsumer<InputStream, String> resourceAdder =
        ( in, resourceName ) -> conf.addResource( in, resourceName );
      ShimConfigsLoader.addConfigsAsResources( namedCluster, resourceAdder );
    }

    job = Job.getInstance( conf );

    job.getConfiguration().set( ParquetOutputFormat.ENABLE_JOB_SUMMARY, "false" );
    ParquetOutputFormat.setEnableDictionary( job, false );
  } );
}
 
Example 5
/**
 * Stores the Parquet writer version matching {@code version} in the job
 * configuration.
 *
 * @param version the requested version; anything other than
 *                {@code VERSION_1_0} is written as Parquet 2.0
 */
@Override
public void setVersion( VERSION version ) throws Exception {
  inClassloader( () -> {
    ParquetProperties.WriterVersion writerVersion;
    switch ( version ) {
      case VERSION_1_0:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_1_0;
        break;
      case VERSION_2_0:
      default:
        // 2.0 is both the explicit choice and the fallback.
        writerVersion = ParquetProperties.WriterVersion.PARQUET_2_0;
        break;
    }
    job.getConfiguration().set( ParquetOutputFormat.WRITER_VERSION, writerVersion.toString() );
  } );
}
 
Example 6
/**
 * Applies the Parquet compression codec matching {@code comp} to the job.
 *
 * @param comp the requested compression; values other than SNAPPY, GZIP and
 *             LZO result in {@code UNCOMPRESSED}
 */
@Override
public void setCompression( COMPRESSION comp ) throws Exception {
  inClassloader( () -> {
    // Start from "no compression"; only the codecs listed below override it.
    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    switch ( comp ) {
      case SNAPPY:
        codec = CompressionCodecName.SNAPPY;
        break;
      case GZIP:
        codec = CompressionCodecName.GZIP;
        break;
      case LZO:
        codec = CompressionCodecName.LZO;
        break;
      default:
        break;
    }
    ParquetOutputFormat.setCompression( job, codec );
  } );
}
 
Example 7
Source Project: presto   Source File: IcebergFileWriterFactory.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds a Parquet-backed {@code IcebergRecordFileWriter} for the given columns.
 *
 * <p>Column names and Hive type names are passed through {@link Properties};
 * the Parquet schema and compression codec are written into {@code jobConf}.
 *
 * @param outputPath the path handed to the writer
 * @param icebergSchema the schema converted and stored as the Parquet schema
 * @param columns the columns to write
 * @param jobConf the job configuration, updated in place
 * @param session the session the compression codec is read from
 * @return the configured writer
 */
private IcebergFileWriter createParquetWriter(
        Path outputPath,
        Schema icebergSchema,
        List<IcebergColumnHandle> columns,
        JobConf jobConf,
        ConnectorSession session)
{
    // Names are joined with ',' and Hive type names with ':'.
    String columnNames = columns.stream()
            .map(IcebergColumnHandle::getName)
            .collect(joining(","));
    String columnTypes = columns.stream()
            .map(column -> toHiveType(column.getType()).getHiveTypeName().toString())
            .collect(joining(":"));

    Properties properties = new Properties();
    properties.setProperty(IOConstants.COLUMNS, columnNames);
    properties.setProperty(IOConstants.COLUMNS_TYPES, columnTypes);

    setParquetSchema(jobConf, convert(icebergSchema, "table"));
    String parquetCodec = getCompressionCodec(session).getParquetCompressionCodec().name();
    jobConf.set(ParquetOutputFormat.COMPRESSION, parquetCodec);

    return new IcebergRecordFileWriter(
            outputPath,
            columns.stream()
                    .map(IcebergColumnHandle::getName)
                    .collect(toImmutableList()),
            fromHiveStorageFormat(HiveStorageFormat.PARQUET),
            properties,
            HiveStorageFormat.PARQUET.getEstimatedWriterSystemMemoryUsage(),
            jobConf,
            typeManager,
            session);
}
 
Example 8
Source Project: presto   Source File: ParquetRecordWriterUtil.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a Parquet record writer for {@code target}.
 *
 * <p>If {@code conf} carries no Parquet Hive schema yet, one is derived from
 * the column names and types in {@code properties} and stored in {@code conf}.
 *
 * @param target the output path
 * @param conf the job configuration; may be updated with the schema
 * @param properties table properties holding the column names and types
 * @return the record writer
 * @throws IOException declared for writer creation
 */
private static RecordWriter createParquetWriter(Path target, JobConf conf, Properties properties)
        throws IOException
{
    boolean schemaMissing = conf.get(DataWritableWriteSupport.PARQUET_HIVE_SCHEMA) == null;
    if (schemaMissing) {
        String columnNameList = properties.getProperty(IOConstants.COLUMNS);
        String columnTypeList = properties.getProperty(IOConstants.COLUMNS_TYPES);
        List<String> columnNames = Splitter.on(',').splitToList(columnNameList);
        List<TypeInfo> columnTypes = getTypeInfosFromTypeString(columnTypeList);
        setParquetSchema(conf, HiveSchemaConverter.convert(columnNames, columnTypes));
    }

    DataWritableWriteSupport writeSupport = new DataWritableWriteSupport();
    ParquetOutputFormat<ParquetHiveRecord> outputFormat = new ParquetOutputFormat<>(writeSupport);

    return new ParquetRecordWriterWrapper(outputFormat, conf, target.toString(), Reporter.NULL, properties);
}
 
Example 9
Source Project: presto   Source File: ParquetFileWriterFactory.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Reads the Parquet compression codec from the configuration.
 *
 * @param configuration the job configuration to read
 * @return the configured codec, or {@code GZIP} when the setting is absent
 */
private static CompressionCodecName getCompression(JobConf configuration)
{
    String configured = configuration.get(ParquetOutputFormat.COMPRESSION);
    return configured == null
            ? CompressionCodecName.GZIP
            : CompressionCodecName.valueOf(configured);
}
 
Example 10
Source Project: components   Source File: ParquetHdfsFileSink.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Extends the superclass configuration with the Avro schema taken from the
 * sample record and selects SNAPPY compression for the Parquet output.
 */
@Override
protected void configure(Job job, KV<Void, IndexedRecord> sample) {
    super.configure(job, sample);
    // The KV value type is already IndexedRecord, so no cast is needed.
    IndexedRecord sampleRecord = sample.getValue();
    AvroWriteSupport.setSchema(job.getConfiguration(), sampleRecord.getSchema());
    ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
}
 
Example 11
Source Project: tajo   Source File: ParquetAppender.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new ParquetAppender.
 *
 * @param conf Configuration properties.
 * @param schema The table schema.
 * @param meta The table metadata.
 * @param workDir The path of the Parquet file to write to.
 */
public ParquetAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema, TableMeta meta,
                       Path workDir) throws IOException {
  super(conf, taskAttemptId, schema, meta, workDir);
  this.blockSize = Integer.parseInt(
      meta.getProperty(ParquetOutputFormat.BLOCK_SIZE, StorageConstants.PARQUET_DEFAULT_BLOCK_SIZE));
  this.pageSize = Integer.parseInt(
      meta.getProperty(ParquetOutputFormat.PAGE_SIZE, StorageConstants.PARQUET_DEFAULT_PAGE_SIZE));
  this.compressionCodecName = CompressionCodecName.fromConf(
      meta.getProperty(ParquetOutputFormat.COMPRESSION, StorageConstants.PARQUET_DEFAULT_COMPRESSION_CODEC_NAME));
  this.enableDictionary = Boolean.parseBoolean(
      meta.getProperty(ParquetOutputFormat.ENABLE_DICTIONARY, StorageConstants.PARQUET_DEFAULT_IS_DICTIONARY_ENABLED));
  this.validating = Boolean.parseBoolean(
      meta.getProperty(ParquetOutputFormat.VALIDATION, StorageConstants.PARQUET_DEFAULT_IS_VALIDATION_ENABLED));
}
 
Example 12
Source Project: flink   Source File: ParquetRowDataWriterTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code innerTest} twice against a configuration that requests GZIP
 * compression through the Parquet output-format setting.
 */
@Test
public void testCompression() throws IOException {
	Configuration conf = new Configuration();
	// Select GZIP via the ParquetOutputFormat compression key.
	conf.set(ParquetOutputFormat.COMPRESSION, "GZIP");
	// NOTE(review): the meaning of the boolean flag is defined by innerTest, which is not visible here.
	innerTest(conf, true);
	innerTest(conf, false);
}
 
Example 13
Source Project: parquet-mr   Source File: ParquetTupleScheme.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Prepares {@code jobConf} for use as a sink: applies
 * {@code DeprecatedParquetOutputFormat.setAsOutputFormat}, stores the
 * Parquet schema under the cascading schema key, and registers
 * {@code TupleWriteSupport} as the write-support class.
 *
 * <p>{@code fp} and {@code tap} are not used by this implementation.
 */
@SuppressWarnings("rawtypes")
@Override
public void sinkConfInit(FlowProcess<JobConf> fp,
        Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf);
  // parquetSchema is a member of the enclosing class (declaration not visible here).
  jobConf.set(TupleWriteSupport.PARQUET_CASCADING_SCHEMA, parquetSchema);
  ParquetOutputFormat.setWriteSupportClass(jobConf, TupleWriteSupport.class);
}
 
Example 14
Source Project: parquet-mr   Source File: DeprecatedParquetOutputFormat.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a wrapper around the record writer obtained from
 * {@code realOutputFormat}.
 *
 * <p>The output file is the default work file for {@code name}, with an
 * extension made of the codec's extension followed by {@code ".parquet"}.
 *
 * @param realOutputFormat the output format that supplies the underlying writer
 * @param fs not used by this constructor
 * @param conf the job configuration the codec and work file are derived from
 * @param name the base name of the output file
 * @param progress not used by this constructor
 * @throws IOException if the writer cannot be created, or if the thread is
 *                     interrupted while creating it (the interrupt status is
 *                     restored before the exception is thrown)
 */
public RecordWriterWrapper(ParquetOutputFormat<V> realOutputFormat,
    FileSystem fs, JobConf conf, String name, Progressable progress) throws IOException {

  CompressionCodecName codec = getCodec(conf);
  String extension = codec.getExtension() + ".parquet";
  Path file = getDefaultWorkFile(conf, name, extension);

  try {
    realWriter = (ParquetRecordWriter<V>) realOutputFormat.getRecordWriter(conf, file, codec);
  } catch (InterruptedException e) {
    // Restore the interrupt status so callers further up can still observe it.
    // Thread.interrupted() would clear the flag and lose the interruption.
    Thread.currentThread().interrupt();
    throw new IOException(e);
  }
}
 
Example 15
Source Project: parquet-mr   Source File: CodecConfigTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Checks that the Parquet compression setting determines the codec reported
 * by {@code CodecConfig}, through both the mapreduce and the mapred API.
 *
 * @param codecNameStr the value stored under the Parquet compression key
 * @param expectedCodec the codec {@code CodecConfig} is expected to report
 */
public void shouldUseParquetFlagToSetCodec(String codecNameStr, CompressionCodecName expectedCodec) throws IOException {
    // Test mapreduce API: the codec is resolved from a task attempt context.
    Job job = new Job();
    Configuration mapreduceConf = job.getConfiguration();
    mapreduceConf.set(ParquetOutputFormat.COMPRESSION, codecNameStr);
    JobID jobId = new JobID("test", 1);
    TaskID taskId = new TaskID(jobId, false, 1);
    TaskAttemptID attemptId = new TaskAttemptID(taskId, 1);
    TaskAttemptContext task = ContextUtil.newTaskAttemptContext(mapreduceConf, attemptId);
    Assert.assertEquals(CodecConfig.from(task).getCodec(), expectedCodec);

    // Test mapred API: the codec is resolved directly from a JobConf.
    JobConf mapredConf = new JobConf();
    mapredConf.set(ParquetOutputFormat.COMPRESSION, codecNameStr);
    Assert.assertEquals(CodecConfig.from(mapredConf).getCodec(), expectedCodec);
}
 
Example 16
Source Project: parquet-mr   Source File: ParquetScroogeScheme.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Prepares {@code jobConf} for use as a sink: applies
 * {@code DeprecatedParquetOutputFormat.setAsOutputFormat}, registers
 * {@code ScroogeWriteSupport} as the write-support class, and passes it the
 * class obtained from {@code this.config.getKlass()}.
 *
 * <p>{@code fp} and {@code tap} are not used by this implementation.
 */
@Override
public void sinkConfInit(FlowProcess<JobConf> fp,
    Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf);
  ParquetOutputFormat.setWriteSupportClass(jobConf, ScroogeWriteSupport.class);
  // The Scrooge class comes from this scheme's config object.
  ScroogeWriteSupport.setScroogeClass(jobConf, this.config.getKlass());
}
 
Example 17
Source Project: parquet-mr   Source File: ParquetTupleScheme.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Prepares {@code jobConf} for use as a sink: applies
 * {@code DeprecatedParquetOutputFormat.setAsOutputFormat}, stores the
 * Parquet schema under the cascading schema key, and registers
 * {@code TupleWriteSupport} as the write-support class.
 *
 * <p>{@code fp} and {@code tap} are not used by this implementation.
 */
@SuppressWarnings("rawtypes")
@Override
public void sinkConfInit(FlowProcess<? extends JobConf> fp,
        Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf);
  // parquetSchema is a member of the enclosing class (declaration not visible here).
  jobConf.set(TupleWriteSupport.PARQUET_CASCADING_SCHEMA, parquetSchema);
  ParquetOutputFormat.setWriteSupportClass(jobConf, TupleWriteSupport.class);
}
 
Example 18
Source Project: parquet-mr   Source File: MapredParquetOutputFormat.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates the {@code ParquetRecordWriterWrapper} for the given output path.
 *
 * @param realOutputFormat the Parquet output format to wrap
 * @param jobConf the job configuration
 * @param finalOutPath the output path; must not be {@code null}
 * @param progress the progress reporter handed to the wrapper
 * @return the new wrapper
 * @throws IOException declared for wrapper construction
 */
protected ParquetRecordWriterWrapper getParquerRecordWriterWrapper(
    ParquetOutputFormat<ArrayWritable> realOutputFormat,
    JobConf jobConf,
    String finalOutPath,
    Progressable progress
    ) throws IOException {
  // toString() on a String yields the same value; it also fails fast on a null path.
  String outputPath = finalOutPath.toString();
  return new ParquetRecordWriterWrapper(realOutputFormat, jobConf, outputPath, progress);
}
 
Example 19
Source Project: presto   Source File: ParquetRecordWriterUtil.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates a Parquet record writer that also reports how many bytes it has written.
 *
 * <p>The block and page sizes from {@code session} are stored in {@code conf}
 * (in bytes), the underlying writer is created through the three-argument
 * overload, and the result is wrapped in an {@code ExtendedRecordWriter}
 * whose {@code getWrittenBytes()} returns the last observed file position.
 *
 * @param target the output path
 * @param conf the job configuration, updated in place
 * @param properties table properties forwarded to the underlying writer
 * @param session the session the block and page sizes are read from
 * @return a writer that tracks the written byte count
 * @throws IOException if the underlying writer cannot be created
 * @throws ReflectiveOperationException if the field lookups below fail
 */
public static RecordWriter createParquetWriter(Path target, JobConf conf, Properties properties, ConnectorSession session)
        throws IOException, ReflectiveOperationException
{
    conf.setLong(ParquetOutputFormat.BLOCK_SIZE, getParquetWriterBlockSize(session).toBytes());
    conf.setLong(ParquetOutputFormat.PAGE_SIZE, getParquetWriterPageSize(session).toBytes());

    RecordWriter recordWriter = createParquetWriter(target, conf, properties);

    // Walk recordWriter -> real writer -> internal writer -> file writer.
    // NOTE(review): the *_FIELD constants are presumably java.lang.reflect.Field handles
    // for non-public fields; confirm against their declarations.
    Object realWriter = REAL_WRITER_FIELD.get(recordWriter);
    Object internalWriter = INTERNAL_WRITER_FIELD.get(realWriter);
    ParquetFileWriter fileWriter = (ParquetFileWriter) FILE_WRITER_FIELD.get(internalWriter);

    return new ExtendedRecordWriter()
    {
        // Last file position read from fileWriter; 0 until the first write or close.
        private long length;

        @Override
        public long getWrittenBytes()
        {
            return length;
        }

        @Override
        public void write(Writable value)
                throws IOException
        {
            recordWriter.write(value);
            // Refresh after every write so getWrittenBytes() follows the file position.
            length = fileWriter.getPos();
        }

        @Override
        public void close(boolean abort)
                throws IOException
        {
            recordWriter.close(abort);
            // On abort the previously recorded length is kept as-is.
            if (!abort) {
                length = fileWriter.getPos();
            }
        }
    };
}
 
Example 20
Source Project: presto   Source File: TestMapredParquetOutputFormat.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates the test output format on top of a {@code ParquetOutputFormat}
 * that uses {@code TestDataWritableWriteSupport}.
 *
 * @param schema the optional schema to keep; the {@code Optional} itself must not be {@code null}
 * @param singleLevelArray forwarded to {@code TestDataWritableWriteSupport}
 */
public TestMapredParquetOutputFormat(Optional<MessageType> schema, boolean singleLevelArray)
{
    super(new ParquetOutputFormat<>(new TestDataWritableWriteSupport(singleLevelArray)));
    // Fails with "schema is null" when no Optional is supplied.
    this.schema = requireNonNull(schema, "schema is null");
}
 
Example 21
Source Project: parquet-mr   Source File: DeprecatedParquetOutputFormat.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Stores the name of the write-support class in the configuration under
 * {@code ParquetOutputFormat.WRITE_SUPPORT_CLASS}.
 *
 * @param configuration the configuration to update
 * @param writeSupportClass the class whose name is stored
 */
public static void setWriteSupportClass(Configuration configuration,  Class<?> writeSupportClass) {
  String className = writeSupportClass.getName();
  configuration.set(ParquetOutputFormat.WRITE_SUPPORT_CLASS, className);
}
 
Example 22
Source Project: parquet-mr   Source File: DeprecatedParquetOutputFormat.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Stores the block size in the configuration under
 * {@code ParquetOutputFormat.BLOCK_SIZE}.
 *
 * @param configuration the configuration to update
 * @param blockSize the block size value to store
 */
public static void setBlockSize(Configuration configuration, int blockSize) {
  configuration.setInt(ParquetOutputFormat.BLOCK_SIZE, blockSize);
}
 
Example 23
Source Project: parquet-mr   Source File: DeprecatedParquetOutputFormat.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Stores the page size in the configuration under
 * {@code ParquetOutputFormat.PAGE_SIZE}.
 *
 * @param configuration the configuration to update
 * @param pageSize the page size value to store
 */
public static void setPageSize(Configuration configuration, int pageSize) {
  configuration.setInt(ParquetOutputFormat.PAGE_SIZE, pageSize);
}
 
Example 24
Source Project: parquet-mr   Source File: DeprecatedParquetOutputFormat.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Stores the name of the compression codec in the configuration under
 * {@code ParquetOutputFormat.COMPRESSION}.
 *
 * @param configuration the configuration to update
 * @param compression the codec whose {@code name()} is stored
 */
public static void setCompression(Configuration configuration, CompressionCodecName compression) {
  String codecName = compression.name();
  configuration.set(ParquetOutputFormat.COMPRESSION, codecName);
}
 
Example 25
Source Project: parquet-mr   Source File: DeprecatedParquetOutputFormat.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Stores the dictionary flag in the configuration under
 * {@code ParquetOutputFormat.ENABLE_DICTIONARY}.
 *
 * @param configuration the configuration to update
 * @param enableDictionary the flag value to store
 */
public static void setEnableDictionary(Configuration configuration, boolean enableDictionary) {
  configuration.setBoolean(ParquetOutputFormat.ENABLE_DICTIONARY, enableDictionary);
}
 
Example 26
Source Project: parquet-mr   Source File: CodecConfig.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Tells whether the configuration holds a value for
 * {@code ParquetOutputFormat.COMPRESSION}.
 *
 * @param conf the configuration to inspect
 * @return {@code true} if a value is present
 */
public static boolean isParquetCompressionSet(Configuration conf) {
  String configured = conf.get(ParquetOutputFormat.COMPRESSION);
  return configured != null;
}
 
Example 27
Source Project: parquet-mr   Source File: CodecConfig.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Reads the Parquet compression codec from the configuration, using
 * {@code UNCOMPRESSED} when no value is set.
 *
 * @param configuration the configuration to read
 * @return the codec parsed from the configured (or default) name
 */
public static CompressionCodecName getParquetCompressionCodec(Configuration configuration) {
  String configured = configuration.get(ParquetOutputFormat.COMPRESSION, UNCOMPRESSED.name());
  return CompressionCodecName.fromConf(configured);
}
 
Example 28
Source Project: parquet-mr   Source File: TestInputOutputFormat.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Runs a write job followed by a read job, both map-only.
 *
 * <p>The write job reads text from {@code inputPath} and writes Parquet to
 * {@code parquetPath} with the given codec; the read job reads that Parquet
 * output and writes text to {@code outputPath}. Both output locations are
 * deleted before the jobs start.
 *
 * @param codec the compression codec used by the write job
 * @param extraConf additional configuration entries applied on top of {@code this.conf}
 */
private void runMapReduceJob(CompressionCodecName codec, Map<String, String> extraConf) throws IOException, ClassNotFoundException, InterruptedException {
  // Work on a copy so the extra entries do not leak into this.conf.
  Configuration conf = new Configuration(this.conf);
  for (Map.Entry<String, String> entry : extraConf.entrySet()) {
    conf.set(entry.getKey(), entry.getValue());
  }
  final FileSystem fileSystem = parquetPath.getFileSystem(conf);
  // Clear the output of any previous run (second argument: recursive delete).
  fileSystem.delete(parquetPath, true);
  fileSystem.delete(outputPath, true);
  {
    // Write phase: text input -> Parquet output, no reducers.
    writeJob = new Job(conf, "write");
    TextInputFormat.addInputPath(writeJob, inputPath);
    writeJob.setInputFormatClass(TextInputFormat.class);
    writeJob.setNumReduceTasks(0);
    ParquetOutputFormat.setCompression(writeJob, codec);
    ParquetOutputFormat.setOutputPath(writeJob, parquetPath);
    writeJob.setOutputFormatClass(ParquetOutputFormat.class);
    writeJob.setMapperClass(readMapperClass);

    ParquetOutputFormat.setWriteSupportClass(writeJob, MyWriteSupport.class);
    GroupWriteSupport.setSchema(
            MessageTypeParser.parseMessageType(writeSchema),
            writeJob.getConfiguration());
    writeJob.submit();
    waitForJob(writeJob);
  }
  {
    // Read phase: Parquet input -> text output, no reducers.
    // The read schema is set on conf before the Job is created from it.
    conf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
    readJob = new Job(conf, "read");

    readJob.setInputFormatClass(ParquetInputFormat.class);
    ParquetInputFormat.setReadSupportClass(readJob, MyReadSupport.class);

    ParquetInputFormat.setInputPaths(readJob, parquetPath);
    readJob.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(readJob, outputPath);
    readJob.setMapperClass(writeMapperClass);
    readJob.setNumReduceTasks(0);
    readJob.submit();
    waitForJob(readJob);
  }
}
 
Example 29
Source Project: parquet-mr   Source File: ParquetThriftStorer.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * {@inheritDoc}
 *
 * <p>This implementation returns a {@code ParquetOutputFormat} backed by a
 * {@code TupleToThriftWriteSupport} created from {@code className}.
 */
@Override
public OutputFormat<Void, Tuple> getOutputFormat() throws IOException {
  TupleToThriftWriteSupport writeSupport = new TupleToThriftWriteSupport(className);
  return new ParquetOutputFormat<Tuple>(writeSupport);
}
 
Example 30
Source Project: parquet-mr   Source File: MapredParquetOutputFormat.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Creates the output format with a {@code ParquetOutputFormat} that uses
 * {@code DataWritableWriteSupport} as its write support.
 */
public MapredParquetOutputFormat() {
  DataWritableWriteSupport writeSupport = new DataWritableWriteSupport();
  realOutputFormat = new ParquetOutputFormat<ArrayWritable>(writeSupport);
}