parquet.schema.MessageType Java Examples

The following examples show how to use parquet.schema.MessageType. They are extracted from open-source projects; the source file, originating project, and license are noted above each example.
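Before the project examples, a quick orientation: a MessageType is the root of a Parquet schema and is typically either parsed from its textual form with MessageTypeParser (as Example #30 does) or assembled with the Types builder (as Example #21 does). The sketch below shows both; the schema and field names are illustrative only and not taken from any of the projects.

// Minimal sketch: two ways to obtain the same MessageType (illustrative fields only)
MessageType parsed = MessageTypeParser.parseMessageType(
    "message example { required int32 id; optional binary name (UTF8); }");

MessageType built = Types.buildMessage()
    .required(PrimitiveType.PrimitiveTypeName.INT32).named("id")
    .optional(PrimitiveType.PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("name")
    .named("example");
// MessageType.toString() renders the schema back in the same textual form.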
Example #1
Source File: PentahoTwitterInputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration conf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, conf );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), conf );
    FileStatus fileStatus = fs.getFileStatus( filePath );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.buildInputFields( schema );
    }
  } );
}
 
Example #2
Source File: ExaParquetWriterImpl.java    From hadoop-etl-udfs with MIT License
private ExaParquetWriterImpl(final MessageType schema,
                             final int numColumns,
                             final Configuration conf,
                             final Path path,
                             final String compressionType,
                             final ExaIterator exa,
                             final int firstColumnIndex,
                             final List<Integer> dynamicPartitionExaColNums) throws Exception {
    System.out.println("Path: " + path.toString());
    System.out.println("Parquet schema:\n" + schema);

    TupleWriteSupport.setSchema(schema, conf);
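    // Argument order of this (deprecated) ParquetWriter constructor: path, write support, codec,
    // block size, page size, dictionary page size, dictionary enabled, validation enabled, conf.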
    this.writer = new ParquetWriter<>(path,
            new TupleWriteSupport(),
            CompressionCodecName.fromConf(compressionType),
            ParquetWriter.DEFAULT_BLOCK_SIZE,
            ParquetWriter.DEFAULT_PAGE_SIZE,
            ParquetWriter.DEFAULT_PAGE_SIZE,
            ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
            ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
            conf);

    // Create Tuple object with ExaIterator reference.
    this.row = new Tuple(exa, numColumns, firstColumnIndex, dynamicPartitionExaColNums);
}
 
Example #3
Source File: PentahoParquetReadSupport.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public ReadContext init( InitContext context ) {
  String schemaStr = context.getConfiguration().get( ParquetConverter.PARQUET_SCHEMA_CONF_KEY );
  if ( schemaStr == null ) {
    throw new RuntimeException( "Schema not defined in the PentahoParquetSchema key" );
  }

  ParquetInputFieldList schema = ParquetInputFieldList.unmarshall( schemaStr );
  converter = new ParquetConverter( schema.getFields() );

  // get all fields from file's schema
  MessageType fileSchema = context.getFileSchema();
  List<Type> newFields = new ArrayList<>();
  // use only required fields
  for ( IParquetInputField f : schema ) {
    Type origField = fileSchema.getFields().get( fileSchema.getFieldIndex( f.getFormatFieldName() ) );
    newFields.add( origField );
  }
  if ( newFields.isEmpty() ) {
    throw new RuntimeException( "Fields should be declared" );
  }
  MessageType newSchema = new MessageType( fileSchema.getName(), newFields );

  return new ReadContext( newSchema, new HashMap<>() );
}
 
Example #4
Source File: MetadataUtils.java    From parquet-tools with Apache License 2.0
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath) {
  String name = Strings.repeat(".", depth) + type.getName();
  OriginalType otype = type.getOriginalType();
  Repetition rep = type.getRepetition();
  PrimitiveTypeName ptype = type.getPrimitiveTypeName();

  out.format("%s: %s %s", name, rep, ptype);
  if (otype != null) out.format(" O:%s", otype);

  if (container != null) {
    cpath.add(type.getName());
    String[] paths = cpath.toArray(new String[cpath.size()]);
    cpath.remove(cpath.size() - 1);

    ColumnDescriptor desc = container.getColumnDescription(paths);

    int defl = desc.getMaxDefinitionLevel();
    int repl = desc.getMaxRepetitionLevel();
    out.format(" R:%d D:%d", repl, defl);
  }
  out.println();
}
 
Example #5
Source File: JsonElementConversionFactory.java    From incubator-gobblin with Apache License 2.0
private Type buildSchema() {
  JsonArray inputSchema = this.jsonSchema.getDataTypeValues();
  List<Type> parquetTypes = new ArrayList<>();
  for (JsonElement element : inputSchema) {
    JsonObject map = (JsonObject) element;
    JsonSchema elementSchema = new JsonSchema(map);
    String columnName = elementSchema.getColumnName();
    JsonElementConverter converter = JsonElementConversionFactory.getConverter(elementSchema, false);
    Type schemaType = converter.schema();
    this.converters.put(columnName, converter);
    parquetTypes.add(schemaType);
  }
  String docName = this.jsonSchema.getColumnName();
  switch (recordType) {
    case ROOT:
      return new MessageType(docName, parquetTypes);
    case CHILD:
      return new GroupType(optionalOrRequired(this.jsonSchema), docName, parquetTypes);
    default:
      throw new RuntimeException("Unsupported Record type");
  }
}
 
Example #6
Source File: ExaParquetWriterImpl.java    From hadoop-etl-udfs with MIT License
public ExaParquetWriterImpl(final List<ExaParquetTypeInfo> schemaTypes,
                            final Configuration conf,
                            final Path path,
                            final String compressionType,
                            final ExaIterator exa,
                            final int firstColumnIndex,
                            final List<Integer> dynamicPartitionExaColNums) throws Exception {
    // Use the schemaTypes provided since HCat table metadata isn't available.
    // This should normally only be used for testing.
    this(new MessageType("hive_schema", ExaParquetWriterImpl.typeInfoToParquetTypes(schemaTypes)),
            schemaTypes.size(),
            conf,
            path,
            compressionType,
            exa,
            firstColumnIndex,
            dynamicPartitionExaColNums);
}
 
Example #7
Source File: PentahoParquetWriteSupport.java    From pentaho-hadoop-shims with Apache License 2.0
private MessageType createParquetSchema() {
  List<Type> types = new ArrayList<>();

  for ( IParquetOutputField outputField : outputFields ) {
    types.add( convertToPrimitiveType( outputField ) );
  }

  if ( types.isEmpty() ) {
    throw new IllegalArgumentException( "Schema should contain at least one field" );
  }

  return new MessageType( "parquet-schema", types );
}
 
Example #8
Source File: MetadataUtils.java    From parquet-tools with Apache License 2.0
private static void showDetails(PrettyPrintWriter out, GroupType type, int depth, MessageType container, List<String> cpath) {
  String name = Strings.repeat(".", depth) + type.getName();
  Repetition rep = type.getRepetition();
  int fcount = type.getFieldCount();
  out.format("%s: %s F:%d%n", name, rep, fcount);

  cpath.add(type.getName());
  for (Type ftype : type.getFields()) {
    showDetails(out, ftype, depth + 1, container, cpath);
  }
  cpath.remove(cpath.size() - 1);
}
 
Example #9
Source File: JsonIntermediateToParquetGroupConverterTest.java    From incubator-gobblin with Apache License 2.0
@Test(expectedExceptions = RuntimeException.class, expectedExceptionsMessageRegExp = "Symbol .* does not belong to set \\[.*?\\]")
public void testEnumTypeBelongsToEnumSet()
    throws Exception {
  JsonObject test = deepCopy(testCases.get("enum").getAsJsonObject(), JsonObject.class);
  parquetConverter = new JsonIntermediateToParquetGroupConverter();

  MessageType schema = parquetConverter.convertSchema(test.get("schema").getAsJsonArray(), workUnit);
  JsonObject jsonRecord = test.get("record").getAsJsonObject();
  jsonRecord.addProperty("some_enum", "HELL");

  parquetConverter.convertRecord(schema, jsonRecord, workUnit).iterator().next();
}
 
Example #10
Source File: JsonIntermediateToParquetGroupConverterTest.java    From incubator-gobblin with Apache License 2.0
private void testCase(String testCaseName)
    throws SchemaConversionException, DataConversionException {
  JsonObject test = testCases.get(testCaseName).getAsJsonObject();
  parquetConverter = new JsonIntermediateToParquetGroupConverter();

  MessageType schema = parquetConverter.convertSchema(test.get("schema").getAsJsonArray(), workUnit);
  Group record =
      parquetConverter.convertRecord(schema, test.get("record").getAsJsonObject(), workUnit).iterator().next();
  assertEqualsIgnoreSpaces(schema.toString(), test.get("expectedSchema").getAsString());
  assertEqualsIgnoreSpaces(record.toString(), test.get("expectedRecord").getAsString());
}
 
Example #11
Source File: ParquetReader.java    From paraflow with Apache License 2.0
public ParquetReader(MessageType fileSchema,
                     MessageType requestedSchema,
                     List<BlockMetaData> blocks,
                     ParquetDataSource dataSource,
                     TypeManager typeManager)
{
    this.fileSchema = fileSchema;
    this.requestedSchema = requestedSchema;
    this.blocks = blocks;
    this.dataSource = dataSource;
    this.typeManager = typeManager;
    initializeColumnReaders();
}
 
Example #12
Source File: JsonIntermediateToParquetGroupConverter.java    From incubator-gobblin with Apache License 2.0
@Override
public MessageType convertSchema(JsonArray inputSchema, WorkUnitState workUnit)
    throws SchemaConversionException {
  String fieldName = workUnit.getExtract().getTable();
  JsonSchema jsonSchema = new JsonSchema(inputSchema);
  jsonSchema.setColumnName(fieldName);
  recordConverter = new RecordConverter(jsonSchema, ROOT);
  return (MessageType) recordConverter.schema();
}
 
Example #13
Source File: MetadataUtils.java    From parquet-tools with Apache License 2.0
private static void showDetails(PrettyPrintWriter out, Type type, int depth, MessageType container, List<String> cpath) {
  if (type instanceof GroupType) {
    showDetails(out, type.asGroupType(), depth, container, cpath);
    return;
  } else if (type instanceof PrimitiveType) {
    showDetails(out, type.asPrimitiveType(), depth, container, cpath);
    return;
  }
}
 
Example #14
Source File: DumpCommand.java    From parquet-tools with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
    super.execute(options);

    String[] args = options.getArgs();
    String input = args[0];

    Configuration conf = new Configuration();
    Path inpath = new Path(input);

    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath);
    MessageType schema = metaData.getFileMetaData().getSchema();

    PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
                                             .withAutoColumn()
                                             .withAutoCrop()
                                             .withWhitespaceHandler(WhiteSpaceHandler.ELIMINATE_NEWLINES)
                                             .withColumnPadding(1)
                                             .withMaxBufferedLines(1000000)
                                             .withFlushOnTab()
                                             .build();

    boolean showmd = !options.hasOption('m');
    boolean showdt = !options.hasOption('d');

    Set<String> showColumns = null;
    if (options.hasOption('c')) {
        String[] cols = options.getOptionValues('c');
        showColumns = new HashSet<String>(Arrays.asList(cols));
    }

    dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
 
Example #15
Source File: HdfsOdpsImportJob.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
    throws IOException {

  ArrayList<FileStatus> files = new ArrayList<FileStatus>();
  FileStatus[] dirs;
  dirs = fs.globStatus(fs.makeQualified(getInputPath()));
  for (int i = 0; (dirs != null && i < dirs.length); i++) {
    files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
    // We only check one file, so exit the loop when we have at least
    // one.
    if (files.size() > 0) {
      break;
    }
  }

  ParquetMetadata parquetMetadata;
  try {
    parquetMetadata =
        ParquetFileReader.readFooter(job.getConfiguration(),
            fs.makeQualified(files.get(0).getPath()));
  } catch (IOException e) {
    LOG.error("Wrong file format. Please check the export file's format.", e);
    throw e;
  }
  MessageType schema = parquetMetadata.getFileMetaData().getSchema();
  Schema avroSchema = new AvroSchemaConverter().convert(schema);
  DatasetDescriptor descriptor =
      new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
          .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
  return descriptor;
}
 
Example #16
Source File: ParquetConverter.java    From pentaho-hadoop-shims with Apache License 2.0
public static List<IParquetInputField> buildInputFields( MessageType schema ) {
  List<IParquetInputField> inputFields = new ArrayList<>();

  for ( Type type : schema.getFields() ) {
    if ( type.isPrimitive() ) {
      inputFields.add( convertField( type ) );
    }
  }

  return inputFields;
}
 
Example #17
Source File: ParaflowPageSource.java    From paraflow with Apache License 2.0
public int getFieldIndex(MessageType fileSchema, String name)
{
    try {
        return fileSchema.getFieldIndex(name);
    }
    catch (InvalidRecordException e) {
        for (parquet.schema.Type type : fileSchema.getFields()) {
            if (type.getName().equalsIgnoreCase(name)) {
                return fileSchema.getFieldIndex(type.getName());
            }
        }
        return -1;
    }
}
 
Example #18
Source File: ParaflowPageSource.java    From paraflow with Apache License 2.0
private parquet.schema.Type getParquetType(ParaflowColumnHandle column, MessageType messageType)
{
    if (messageType.containsField(column.getName())) {
        return messageType.getType(column.getName());
    }
    // parquet is case-insensitive, all hdfs-columns get converted to lowercase
    for (parquet.schema.Type type : messageType.getFields()) {
        if (type.getName().equalsIgnoreCase(column.getName())) {
            return type;
        }
    }
    return null;
}
 
Example #19
Source File: ParaflowPageSource.java    From paraflow with Apache License 2.0
public ParaflowPageSource(
        ParquetReader parquetReader,
        ParquetDataSource dataSource,
        MessageType fileSchema,
        MessageType requestedSchema,
        long totalBytes,
        List<ParaflowColumnHandle> columns,
        TypeManager typeManager)
{
    checkArgument(totalBytes >= 0, "totalBytes is negative");

    this.parquetReader = requireNonNull(parquetReader, "parquetReader is null");
    this.dataSource = requireNonNull(dataSource, "dataSource is null");
    this.fileSchema = requireNonNull(fileSchema, "fileSchema is null");
    this.requestedSchema = requireNonNull(requestedSchema, "requestedSchema is null");
    this.totalBytes = totalBytes;

    this.columnSize = columns.size();
    this.constantBlocks = new Block[columnSize];
    ImmutableList.Builder<String> namesBuilder = ImmutableList.builder();
    ImmutableList.Builder<Type> typesBuilder = ImmutableList.builder();
    for (int columnIndex = 0; columnIndex < columnSize; columnIndex++) {
        ParaflowColumnHandle column = columns.get(columnIndex);
        String name = column.getName();
        Type type = typeManager.getType(column.getType().getTypeSignature());

        namesBuilder.add(name);
        typesBuilder.add(type);

        if (getParquetType(column, fileSchema) == null) {
            constantBlocks[columnIndex] = RunLengthEncodedBlock.create(type, null, MAX_VECTOR_LENGTH);
        }
    }
    columnNames = namesBuilder.build();
    types = typesBuilder.build();
}
 
Example #20
Source File: ParaflowPageSourceProvider.java    From paraflow with Apache License 2.0
private Type getParquetType(ParaflowColumnHandle column, MessageType messageType)
{
    if (messageType.containsField(column.getName())) {
        return messageType.getType(column.getName());
    }
    // parquet is case-insensitive, all hdfs-columns get converted to lowercase
    for (Type type : messageType.getFields()) {
        if (type.getName().equalsIgnoreCase(column.getName())) {
            return type;
        }
    }
    return null;
}
 
Example #21
Source File: ParquetMetadataReader.java    From paraflow with Apache License 2.0
private static MessageType readParquetSchema(List<SchemaElement> schema)
{
    Iterator<SchemaElement> schemaIterator = schema.iterator();
    SchemaElement rootSchema = schemaIterator.next();
    Types.MessageTypeBuilder builder = Types.buildMessage();
    readTypeSchema(builder, schemaIterator, rootSchema.getNum_children());
    return builder.named(rootSchema.name);
}
 
Example #22
Source File: ParquetDataWriterBuilder.java    From incubator-gobblin with Apache License 2.0
/**
 * Build a version-specific {@link ParquetWriter} for a given {@link ParquetWriterConfiguration}
 * @param writerConfiguration
 * @return
 * @throws IOException
 */
@Override
public ParquetWriterShim getVersionSpecificWriter(ParquetWriterConfiguration writerConfiguration)
    throws IOException {

  CompressionCodecName codecName = CompressionCodecName.fromConf(writerConfiguration.getCodecName());
  ParquetProperties.WriterVersion writerVersion = ParquetProperties.WriterVersion
      .fromString(writerConfiguration.getWriterVersion());

  Configuration conf = new Configuration();
  ParquetWriter versionSpecificWriter = null;
  switch (writerConfiguration.getRecordFormat()) {
    case GROUP: {
      GroupWriteSupport.setSchema((MessageType) this.schema, conf);
      WriteSupport support = new GroupWriteSupport();
      versionSpecificWriter = new ParquetWriter<Group>(
          writerConfiguration.getAbsoluteStagingFile(),
          support,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.getDictPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate(),
          writerVersion,
          conf);
      break;
    }
    case AVRO:  {
      versionSpecificWriter = new AvroParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Schema) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          conf);
      break;
    }
    case PROTOBUF: {
      versionSpecificWriter = new ProtoParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Class<? extends Message>) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate());
      break;
    }
    default: throw new RuntimeException("Record format not supported");
  }
  ParquetWriter finalVersionSpecificWriter = versionSpecificWriter;

  return new ParquetWriterShim() {
    @Override
    public void write(Object record)
        throws IOException {
      finalVersionSpecificWriter.write(record);
    }

    @Override
    public void close()
        throws IOException {
      finalVersionSpecificWriter.close();
    }
  };
}
 
Example #23
Source File: MetadataUtils.java    From parquet-tools with Apache License 2.0
public static void showDetails(PrettyPrintWriter out, MessageType type) {
  List<String> cpath = new ArrayList<String>();
  for (Type ftype : type.getFields()) {
    showDetails(out, ftype, 0, type, cpath);
  }
}
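For context, a minimal driver for the showDetails overloads above would obtain the MessageType from a file footer, as in Example #14. The path and the bare builder call below are assumptions for illustration, not project code.

Configuration conf = new Configuration();
Path inpath = new Path("/tmp/example.parquet");  // assumed path, for illustration only

ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath);
MessageType schema = metaData.getFileMetaData().getSchema();

PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter().build();  // builder defaults assumed
MetadataUtils.showDetails(out, schema);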
 
Example #24
Source File: SimpleRecordMaterializer.java    From parquet-tools with Apache License 2.0
public SimpleRecordMaterializer(MessageType schema) {
  this.root = new SimpleRecordConverter(schema);
}
 
Example #25
Source File: SimpleReadSupport.java    From parquet-tools with Apache License 2.0
@Override
public RecordMaterializer<SimpleRecord> prepareForRead(Configuration conf, Map<String,String> metaData, MessageType schema, ReadContext context) {
  return new SimpleRecordMaterializer(schema);
}
 
Example #26
Source File: TestReadWriteParquet.java    From parquet-examples with Apache License 2.0
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        LOG.error("Usage: " + getClass().getName() + " INPUTFILE OUTPUTFILE [compression]");
        return 1;
    }
    String inputFile = args[0];
    String outputFile = args[1];
    String compression = (args.length > 2) ? args[2] : "none";

    Path parquetFilePath = null;
    // Find a file in case a directory was passed
    RemoteIterator<LocatedFileStatus> it = FileSystem.get(getConf()).listFiles(new Path(inputFile), true);
    while (it.hasNext()) {
        FileStatus fs = it.next();
        if (fs.isFile()) {
            parquetFilePath = fs.getPath();
            break;
        }
    }
    if (parquetFilePath == null) {
        LOG.error("No file found for " + inputFile);
        return 1;
    }
    LOG.info("Getting schema from " + parquetFilePath);
    ParquetMetadata readFooter = ParquetFileReader.readFooter(getConf(), parquetFilePath);
    MessageType schema = readFooter.getFileMetaData().getSchema();
    LOG.info(schema);
    GroupWriteSupport.setSchema(schema, getConf());

    Job job = new Job(getConf());
    job.setJarByClass(getClass());
    job.setJobName(getClass().getName());
    job.setMapperClass(ReadRequestMap.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(ExampleInputFormat.class);
    job.setOutputFormatClass(ExampleOutputFormat.class);

    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    if (compression.equalsIgnoreCase("snappy")) {
        codec = CompressionCodecName.SNAPPY;
    } else if (compression.equalsIgnoreCase("gzip")) {
        codec = CompressionCodecName.GZIP;
    }
    LOG.info("Output compression: " + codec);
    ExampleOutputFormat.setCompression(job, codec);

    FileInputFormat.setInputPaths(job, new Path(inputFile));
    FileOutputFormat.setOutputPath(job, new Path(outputFile));

    job.waitForCompletion(true);

    return 0;
}
 
Example #27
Source File: PentahoParquetReadSupport.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public RecordMaterializer<RowMetaAndData> prepareForRead( Configuration configuration,
                                                          Map<String, String> keyValueMetaData,
                                                          MessageType fileSchema, ReadContext readContext ) {
  return new ParquetConverter.MyRecordMaterializer( converter );
}
 
Example #28
Source File: ParquetHdfsDataWriterTest.java    From incubator-gobblin with Apache License 2.0
@Override
public RecordMaterializer<Group> prepareForRead(Configuration conf, Map<String, String> metaData,
    MessageType schema, ReadContext context) {
  return new GroupRecordConverter(schema);
}
 
Example #29
Source File: JsonIntermediateToParquetGroupConverter.java    From incubator-gobblin with Apache License 2.0
@Override
public Iterable<Group> convertRecord(MessageType outputSchema, JsonObject inputRecord, WorkUnitState workUnit)
    throws DataConversionException {
  return new SingleRecordIterable<>((Group) recordConverter.convert(inputRecord));
}
 
Example #30
Source File: TupleWriteSupport.java    From hadoop-etl-udfs with MIT License
public static MessageType getSchema(Configuration configuration) {
    return MessageTypeParser.parseMessageType(configuration.get(PARQUET_SCHEMA_PROPERTY_NAME));
}
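Example #30's getSchema reads back a schema that the corresponding setSchema call (used in Example #2) presumably stored as a string under PARQUET_SCHEMA_PROPERTY_NAME. A plausible sketch of that counterpart, not the project's verified source, is:

public static void setSchema(MessageType schema, Configuration configuration) {
    // MessageType.toString() yields the textual schema that MessageTypeParser can parse back
    configuration.set(PARQUET_SCHEMA_PROPERTY_NAME, schema.toString());
}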