parquet.schema.Type Java Examples

The following examples show how to use parquet.schema.Type. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SimpleRecordConverter.java    From parquet-tools with Apache License 2.0 6 votes vote down vote up
/**
 * Picks the converter for a single schema field.
 *
 * <p>A primitive field annotated as {@code UTF8} gets a {@link StringConverter}; every other
 * primitive field gets a {@link SimplePrimitiveConverter}. A non-primitive field is handled by a
 * nested {@link SimpleRecordConverter} whose parent is this converter.
 *
 * @param field the schema field to create a converter for
 * @return the converter matching the kind of {@code field}
 */
private Converter createConverter(Type field) {
  if (!field.isPrimitive()) {
    return new SimpleRecordConverter(field.asGroupType(), field.getName(), this);
  }

  // UTF8 is the only annotation that needs a dedicated converter; a missing (null)
  // annotation and all other annotations use the generic primitive converter.
  if (field.getOriginalType() == OriginalType.UTF8) {
    return new StringConverter(field.getName());
  }
  return new SimplePrimitiveConverter(field.getName());
}
 
Example #2
Source File: ExaParquetWriterImpl.java    From hadoop-etl-udfs with MIT License 6 votes vote down vote up
/**
 * Converts a list of {@code ExaParquetTypeInfo} descriptors into parquet primitive types.
 *
 * <p>A descriptor with a non-zero {@code length} is built with the length-taking
 * {@link PrimitiveType} constructor; otherwise the constructor taking an (optional, possibly
 * {@code null}) {@link OriginalType} is used.
 *
 * @param exaParquetTypeInfos the descriptors to convert, in order
 * @return a new list with one parquet type per descriptor, in the same order
 */
private static List<Type> typeInfoToParquetTypes(final List<ExaParquetTypeInfo> exaParquetTypeInfos) {
    final List<Type> parquetTypes = new ArrayList<>();
    for (final ExaParquetTypeInfo info : exaParquetTypeInfos) {
        final Type.Repetition repetition = Type.Repetition.valueOf(info.typeRepitition);
        final PrimitiveType.PrimitiveTypeName primitiveName =
                PrimitiveType.PrimitiveTypeName.valueOf(info.primitiveTypeName);

        final PrimitiveType converted;
        if (info.length != 0) {
            converted = new PrimitiveType(repetition, primitiveName, info.length, info.name);
        } else {
            final OriginalType annotation =
                    info.originalType == null ? null : OriginalType.valueOf(info.originalType);
            converted = new PrimitiveType(repetition, primitiveName, info.name, annotation);
        }
        parquetTypes.add(converted);
    }
    return parquetTypes;
}
 
Example #3
Source File: TupleWriter.java    From hadoop-etl-udfs with MIT License 6 votes vote down vote up
/**
 * Writes every non-null field of {@code tuple} to the record consumer, recursing into
 * group-typed fields.
 *
 * @param tuple the values to write, indexed like the fields of {@code type}
 * @param type  the group schema describing the tuple's fields
 */
private void writeTuple(Tuple tuple, GroupType type) {
    for (int pos = 0; pos < type.getFieldCount(); pos++) {
        final Type field = type.getType(pos);
        final String name = field.getName();

        // Null values are not written at all: no startField/endField pair is emitted for them.
        if (!tuple.isNull(pos)) {
            recordConsumer.startField(name, pos);
            if (!field.isPrimitive()) {
                recordConsumer.startGroup();
                writeTuple(tuple.getTuple(pos), field.asGroupType());
                recordConsumer.endGroup();
            }
            else {
                tuple.writePrimitiveValue(recordConsumer, pos, (PrimitiveType) field);
            }
            recordConsumer.endField(name, pos);
        }
    }
}
 
Example #4
Source File: ParquetGroup.java    From incubator-gobblin with Apache License 2.0 6 votes vote down vote up
/**
 * Renders this group as text, one line per stored value, each prefixed with {@code indent}
 * and the field name. Nested groups are rendered recursively with two extra spaces of indent.
 *
 * @param indent the prefix placed before every field name at this nesting level
 * @return the textual representation of all values held by this group
 */
public String toString(String indent) {
  StringBuilder out = new StringBuilder();
  int fieldIndex = 0;
  for (Type field : this.schema.getFields()) {
    String fieldName = field.getName();
    // data[fieldIndex] holds the values stored for the field at the same position in the schema.
    for (Object value : this.data[fieldIndex]) {
      out.append(indent).append(fieldName);
      if (value instanceof Group) {
        String nested = ((ParquetGroup) value).toString(indent + "  ");
        out.append("\n").append(nested);
      } else if (value == null) {
        out.append(": NULL\n");
      } else {
        out.append(": ").append(value.toString()).append("\n");
      }
    }
    fieldIndex++;
  }
  return out.toString();
}
 
Example #5
Source File: PentahoParquetReadSupport.java    From pentaho-hadoop-shims with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the read context: reads the requested field list from the configuration, creates the
 * converter for it, and projects the file schema down to exactly those fields.
 *
 * @param context the init context giving access to the configuration and the file schema
 * @return a read context whose schema contains only the requested fields
 * @throws RuntimeException if the schema configuration key is missing or no fields are requested
 */
@Override
public ReadContext init( InitContext context ) {
  String serializedSchema = context.getConfiguration().get( ParquetConverter.PARQUET_SCHEMA_CONF_KEY );
  if ( serializedSchema == null ) {
    throw new RuntimeException( "Schema not defined in the PentahoParquetSchema key" );
  }

  ParquetInputFieldList schema = ParquetInputFieldList.unmarshall( serializedSchema );
  converter = new ParquetConverter( schema.getFields() );

  // Project the file's schema: keep only the fields that were requested, in request order.
  MessageType fileSchema = context.getFileSchema();
  List<Type> requestedFields = new ArrayList<>();
  for ( IParquetInputField requested : schema ) {
    List<Type> fileFields = fileSchema.getFields();
    int position = fileSchema.getFieldIndex( requested.getFormatFieldName() );
    requestedFields.add( fileFields.get( position ) );
  }
  if ( requestedFields.isEmpty() ) {
    throw new RuntimeException( "Fields should be declared" );
  }

  MessageType projectedSchema = new MessageType( fileSchema.getName(), requestedFields );
  return new ReadContext( projectedSchema, new HashMap<>() );
}
 
Example #6
Source File: JsonElementConversionFactory.java    From incubator-gobblin with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a group type named after the column that wraps a repeated {@code MAP_KEY} group made
 * of the key converter's schema followed by the element converter's schema.
 *
 * @return an optional or required group, depending on {@code optionalOrRequired(jsonSchema)};
 *         {@code null} for any other repetition
 */
@Override
protected Type buildSchema() {
  JsonElementConverter valueConverter = this.elementConverter;
  JsonElementConverter keyConverter = getKeyConverter();

  GroupType keyValueGroup = Types.repeatedGroup()
      .addFields(keyConverter.schema(), valueConverter.schema())
      .named(MAP_KEY)
      .asGroupType();

  String column = this.jsonSchema.getColumnName();
  switch (optionalOrRequired(this.jsonSchema)) {
    case REQUIRED:
      return Types.requiredGroup().addFields(keyValueGroup).named(column).asGroupType();
    case OPTIONAL:
      return Types.optionalGroup().addFields(keyValueGroup).named(column).asGroupType();
    default:
      return null;
  }
}
 
Example #7
Source File: JsonElementConversionFactory.java    From incubator-gobblin with Apache License 2.0 6 votes vote down vote up
/**
 * Builds the schema of a record from the element schemas listed in {@code jsonSchema}, and
 * registers one converter per column in {@code converters} along the way.
 *
 * @return a {@link MessageType} for a {@code ROOT} record, or a {@link GroupType} for a
 *         {@code CHILD} record
 * @throws RuntimeException if the record type is neither {@code ROOT} nor {@code CHILD}
 */
private Type buildSchema() {
  List<Type> fieldTypes = new ArrayList<>();
  for (JsonElement entry : this.jsonSchema.getDataTypeValues()) {
    JsonSchema fieldSchema = new JsonSchema((JsonObject) entry);
    String fieldName = fieldSchema.getColumnName();
    JsonElementConverter fieldConverter = JsonElementConversionFactory.getConverter(fieldSchema, false);
    Type fieldType = fieldConverter.schema();

    this.converters.put(fieldName, fieldConverter);
    fieldTypes.add(fieldType);
  }

  String recordName = this.jsonSchema.getColumnName();
  switch (recordType) {
    case CHILD:
      return new GroupType(optionalOrRequired(this.jsonSchema), recordName, fieldTypes);
    case ROOT:
      return new MessageType(recordName, fieldTypes);
    default:
      throw new RuntimeException("Unsupported Record type");
  }
}
 
Example #8
Source File: JsonElementConversionFactory.java    From incubator-gobblin with Apache License 2.0 6 votes vote down vote up
/**
 * Converts a JSON object into a {@link ParquetGroup} by running each member through the
 * converter registered under the member's name.
 *
 * <p>A member whose converted value is {@code null} is left out of the group when its column is
 * optional; otherwise the (possibly {@code null}) value is added.
 *
 * @param value the JSON element to convert; read via {@code getAsJsonObject()}
 * @return the populated {@link ParquetGroup}
 */
@Override
Object convertField(JsonElement value) {
  ParquetGroup record = new ParquetGroup((GroupType) schema());
  for (Map.Entry<String, JsonElement> member : value.getAsJsonObject().entrySet()) {
    String name = member.getKey();
    JsonElementConverter memberConverter = this.converters.get(name);
    Object converted = memberConverter.convert(member.getValue());
    Type.Repetition repetition = optionalOrRequired(memberConverter.jsonSchema);

    boolean omit = converted == null && repetition.equals(OPTIONAL);
    if (!omit) {
      record.add(name, converted);
    }
  }
  return record;
}
 
Example #9
Source File: ParaflowPageSourceProvider.java    From paraflow with Apache License 2.0 5 votes vote down vote up
/**
 * Looks up the parquet type for a column: first by exact name, then by the first field whose
 * name matches ignoring case.
 *
 * @param column      the column whose type is wanted
 * @param messageType the schema to search
 * @return the matching type, or {@code null} when the schema has no such field
 */
private Type getParquetType(ParaflowColumnHandle column, MessageType messageType)
{
    String columnName = column.getName();
    if (messageType.containsField(columnName)) {
        return messageType.getType(columnName);
    }
    // parquet is case-insensitive, all hdfs-columns get converted to lowercase
    return messageType.getFields().stream()
            .filter(candidate -> candidate.getName().equalsIgnoreCase(columnName))
            .findFirst()
            .orElse(null);
}
 
Example #10
Source File: SimpleRecordConverter.java    From parquet-tools with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a record converter with one child converter per field of {@code schema}.
 *
 * @param schema the group schema whose fields get a converter each, in schema order
 * @param name   the name stored for this record
 * @param parent the converter stored as this converter's parent
 */
public SimpleRecordConverter(GroupType schema, String name, SimpleRecordConverter parent) {
  this.converters = new Converter[schema.getFieldCount()];
  this.parent = parent;
  this.name = name;

  List<Type> fields = schema.getFields();
  for (int slot = 0; slot < fields.size(); slot++) {
    converters[slot] = createConverter(fields.get(slot));
  }
}
 
Example #11
Source File: MetadataUtils.java    From parquet-tools with Apache License 2.0 5 votes vote down vote up
/**
 * Dispatches to the group or primitive overload of {@code showDetails} based on the runtime
 * kind of {@code type}. A type that is neither a group nor a primitive prints nothing.
 *
 * @param out       the writer to print to
 * @param type      the schema type to describe
 * @param depth     the nesting depth, passed through unchanged
 * @param container the enclosing message schema, passed through unchanged
 * @param cpath     the current column path, passed through unchanged
 */
private static void showDetails(PrettyPrintWriter out, Type type, int depth, MessageType container, List<String> cpath) {
  if (type instanceof GroupType) {
    showDetails(out, type.asGroupType(), depth, container, cpath);
  } else if (type instanceof PrimitiveType) {
    showDetails(out, type.asPrimitiveType(), depth, container, cpath);
  }
}
 
Example #12
Source File: MetadataUtils.java    From parquet-tools with Apache License 2.0 5 votes vote down vote up
/**
 * Prints a one-line summary of a group (name prefixed by {@code depth} dots, repetition and
 * field count), then describes each of its fields one level deeper.
 *
 * @param out       the writer to print to
 * @param type      the group to describe
 * @param depth     the nesting depth of the group
 * @param container the enclosing message schema, passed through to the children
 * @param cpath     the column path of the enclosing groups; the group's name is pushed while its
 *                  children are processed and popped afterwards
 */
private static void showDetails(PrettyPrintWriter out, GroupType type, int depth, MessageType container, List<String> cpath) {
  out.format("%s: %s F:%d%n",
      Strings.repeat(".", depth) + type.getName(),
      type.getRepetition(),
      type.getFieldCount());

  cpath.add(type.getName());
  final int childDepth = depth + 1;
  for (Type child : type.getFields()) {
    showDetails(out, child, childDepth, container, cpath);
  }
  cpath.remove(cpath.size() - 1);
}
 
Example #13
Source File: ParquetConverter.java    From pentaho-hadoop-shims with Apache License 2.0 5 votes vote down vote up
/**
 * Converts the primitive top-level fields of a schema into input-field descriptors.
 * Non-primitive fields are skipped.
 *
 * @param schema the message schema to read fields from
 * @return a new list with one descriptor per primitive field, in schema order
 */
public static List<IParquetInputField> buildInputFields( MessageType schema ) {
  List<IParquetInputField> result = new ArrayList<>();

  for ( Type field : schema.getFields() ) {
    if ( !field.isPrimitive() ) {
      continue;
    }
    result.add( convertField( field ) );
  }

  return result;
}
 
Example #14
Source File: PentahoParquetWriteSupport.java    From pentaho-hadoop-shims with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the message schema named {@code "parquet-schema"} with one primitive type per
 * configured output field.
 *
 * @return the message schema covering all output fields
 * @throws IllegalArgumentException if there are no output fields
 */
private MessageType createParquetSchema() {
  List<Type> columns = new ArrayList<>();
  for ( IParquetOutputField field : outputFields ) {
    PrimitiveType column = convertToPrimitiveType( field );
    columns.add( column );
  }

  if ( columns.isEmpty() ) {
    throw new IllegalArgumentException( "Schema should contain at least one field" );
  }
  return new MessageType( "parquet-schema", columns );
}
 
Example #15
Source File: JsonElementConversionFactory.java    From incubator-gobblin with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@code BINARY} type annotated as {@code UTF8} and named after the column.
 *
 * @return a repeated type when {@code repeated} is set; otherwise an optional or required type
 *         according to {@code optionalOrRequired(jsonSchema)}
 * @throws RuntimeException if the repetition is neither optional nor required
 */
@Override
protected Type buildSchema() {
  String column = this.jsonSchema.getColumnName();
  if (!this.repeated) {
    switch (optionalOrRequired(this.jsonSchema)) {
      case REQUIRED:
        return Types.required(BINARY).as(UTF8).named(column);
      case OPTIONAL:
        return Types.optional(BINARY).as(UTF8).named(column);
      default:
        throw new RuntimeException("Unsupported Repetition type");
    }
  }
  return Types.repeated(BINARY).as(UTF8).named(column);
}
 
Example #16
Source File: ParquetGroup.java    From incubator-gobblin with Apache License 2.0 5 votes vote down vote up
/**
 * Stores a value for the field at {@code fieldIndex}.
 *
 * @param fieldIndex position of the field in the schema
 * @param value      the value to store
 * @throws IllegalStateException if the field is not repeated and already holds a value
 */
public void add(int fieldIndex, Primitive value) {
  Type fieldType = this.schema.getType(fieldIndex);
  List<Object> stored = this.data[fieldIndex];

  // Only repeated fields may accumulate values; any other field accepts just the first one.
  if (fieldType.isRepetition(REPEATED) || stored.isEmpty()) {
    stored.add(value);
    return;
  }
  throw new IllegalStateException(
      "field " + fieldIndex + " (" + fieldType.getName() + ") can not have more than one value: " + stored);
}
 
Example #17
Source File: ParquetMetadataStat.java    From rainbow with Apache License 2.0 5 votes vote down vote up
/**
 * Collects the names of all fields (columns).
 *
 * @return a new list holding the name of every entry in {@code fields}, in iteration order
 */
@Override
public List<String> getFieldNames ()
{
    List<String> fieldNames = new ArrayList<>();
    for (Type field : this.fields)
    {
        String fieldName = field.getName();
        fieldNames.add(fieldName);
    }
    return fieldNames;
}
 
Example #18
Source File: JsonElementConversionFactory.java    From incubator-gobblin with Apache License 2.0 4 votes vote down vote up
/**
 * Builds a group named after the column whose single field is the element converter's schema.
 *
 * @return the group type, optional or required according to {@code optionalOrRequired(jsonSchema)}
 */
@Override
protected Type buildSchema() {
  List<Type> children = new ArrayList<>();
  children.add(this.elementConverter.schema());

  Type.Repetition repetition = optionalOrRequired(jsonSchema);
  String column = this.jsonSchema.getColumnName();
  return new GroupType(repetition, column, children);
}
 
Example #19
Source File: JsonElementConversionFactory.java    From incubator-gobblin with Apache License 2.0 4 votes vote down vote up
/**
 * Uses the element converter's schema as this converter's schema, unchanged.
 *
 * @return the {@link Type} returned by {@code elementConverter.schema()}
 */
@Override
protected Type buildSchema() {
  final Type elementSchema = this.elementConverter.schema();
  return elementSchema;
}
 
Example #20
Source File: JsonElementConversionFactory.java    From incubator-gobblin with Apache License 2.0 4 votes vote down vote up
/**
 * Maps the nullability of a JSON schema to a parquet repetition.
 *
 * @param jsonBaseSchema the schema whose nullability is inspected
 * @return {@code OPTIONAL} when the schema is nullable, {@code REQUIRED} otherwise
 */
public static Type.Repetition optionalOrRequired(JsonSchema jsonBaseSchema) {
  if (jsonBaseSchema.isNullable()) {
    return OPTIONAL;
  }
  return REQUIRED;
}
 
Example #21
Source File: JsonElementConversionFactory.java    From incubator-gobblin with Apache License 2.0 4 votes vote down vote up
/**
 * Returns the {@link Type} held in this object's {@code schema} field.
 *
 * @return the stored schema
 */
@Override
public Type schema() {
  final Type stored = this.schema;
  return stored;
}
 
Example #22
Source File: JsonElementConversionFactory.java    From incubator-gobblin with Apache License 2.0 4 votes vote down vote up
/**
 * Exposes the schema kept in the {@code schema} field.
 *
 * @return the value of the {@code schema} field, as a {@link Type}
 */
@Override
public Type schema() {
  final Type current = this.schema;
  return current;
}
 
Example #23
Source File: PentahoParquetWriteSupport.java    From pentaho-hadoop-shims with Apache License 2.0 4 votes vote down vote up
/**
 * Maps an output field description to the parquet primitive type used to store it.
 *
 * <p>The repetition is {@code OPTIONAL} when the field allows nulls and {@code REQUIRED}
 * otherwise. The three decimal variants differ only in the physical type backing them and
 * carry the field's precision and scale.
 *
 * @param f the output field to convert
 * @return the parquet primitive type named after the field's format field name
 * @throws RuntimeException if the field's parquet type is not supported
 */
private PrimitiveType convertToPrimitiveType( IParquetOutputField f ) {
  final Type.Repetition repetition =
    f.getAllowNull() ? Type.Repetition.OPTIONAL : Type.Repetition.REQUIRED;
  final String name = f.getFormatFieldName();

  switch ( f.getParquetType() ) {
    case BINARY:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.BINARY, name );
    case BOOLEAN:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.BOOLEAN, name );
    case DOUBLE:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.DOUBLE, name );
    case FLOAT:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.FLOAT, name );
    case INT_32:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.INT32, name );
    case UTF8:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.BINARY, name, OriginalType.UTF8 );
    case INT_64:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.INT64, name, OriginalType.INT_64 );
    case INT_96:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.INT96, name );
    case DATE:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.INT32, name, OriginalType.DATE );
    case DECIMAL:
      return buildDecimalType( f, PrimitiveType.PrimitiveTypeName.BINARY, name );
    case DECIMAL_INT_32:
      return buildDecimalType( f, PrimitiveType.PrimitiveTypeName.INT32, name );
    case DECIMAL_INT_64:
      return buildDecimalType( f, PrimitiveType.PrimitiveTypeName.INT64, name );
    case TIMESTAMP_MILLIS:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.INT64, name,
        OriginalType.TIMESTAMP_MILLIS );
    default:
      throw new RuntimeException( "Unsupported output type: " + f.getParquetType() );
  }
}

/**
 * Builds a DECIMAL-annotated type on top of the given physical type, using the field's
 * precision and scale; optional when the field allows nulls, required otherwise.
 */
private PrimitiveType buildDecimalType( IParquetOutputField f, PrimitiveType.PrimitiveTypeName physicalType,
                                        String name ) {
  if ( f.getAllowNull() ) {
    return Types.optional( physicalType ).as( OriginalType.DECIMAL )
      .precision( f.getPrecision() ).scale( f.getScale() ).named( name );
  }
  return Types.required( physicalType ).as( OriginalType.DECIMAL )
    .precision( f.getPrecision() ).scale( f.getScale() ).named( name );
}
 
Example #24
Source File: JsonElementConversionFactory.java    From incubator-gobblin with Apache License 2.0 4 votes vote down vote up
/**
 * Gives access to the {@link Type} stored in the {@code schema} field.
 *
 * @return the stored schema
 */
@Override
public Type schema() {
  final Type held = this.schema;
  return held;
}
 
Example #25
Source File: JsonElementConversionFactory.java    From incubator-gobblin with Apache License 2.0 4 votes vote down vote up
/**
 * Builds a primitive type of {@code outputType} named after the column.
 *
 * @return a {@code REPEATED} type when {@code repeated} is set; otherwise an optional or
 *         required type according to {@code optionalOrRequired(jsonSchema)}
 */
protected Type buildSchema() {
  final Type.Repetition repetition;
  if (this.repeated) {
    repetition = REPEATED;
  } else {
    repetition = optionalOrRequired(this.jsonSchema);
  }
  return new PrimitiveType(repetition, this.outputType, this.jsonSchema.getColumnName());
}
 
Example #26
Source File: MetadataUtils.java    From parquet-tools with Apache License 2.0 4 votes vote down vote up
/**
 * Prints the details of every top-level field of a message schema, starting at depth 0 with an
 * empty column path and the schema itself as container.
 *
 * @param out  the writer to print to
 * @param type the message schema to describe
 */
public static void showDetails(PrettyPrintWriter out, MessageType type) {
  final List<String> columnPath = new ArrayList<>();
  final int topLevelDepth = 0;
  for (Type field : type.getFields()) {
    showDetails(out, field, topLevelDepth, type, columnPath);
  }
}
 
Example #27
Source File: MetadataUtils.java    From parquet-tools with Apache License 2.0 4 votes vote down vote up
/**
 * Prints the details of a single schema type at depth 0, with no container message schema.
 *
 * @param out  the writer to print to
 * @param type the schema type to describe; may be a group or a primitive
 */
public static void showDetails(PrettyPrintWriter out, Type type) {
  // The group overload pushes to and pops from the column path unconditionally, so passing
  // null here made every group type fail with a NullPointerException. Start from an empty path.
  showDetails(out, type, 0, null, new ArrayList<String>());
}
 
Example #28
Source File: ParaflowPageSourceProvider.java    From paraflow with Apache License 2.0 4 votes vote down vote up
/**
 * Opens a parquet file and builds a page source restricted to the requested columns and to the
 * row groups whose first data page starts inside the split.
 *
 * @param path    the file to read
 * @param start   the split's start offset
 * @param length  the split's length; replaced by the data source's size once it is opened
 * @param columns the requested columns; columns marked {@code NOTVALID} or missing from the file
 *                schema are left out of the requested schema
 * @return the page source, or {@link Optional#empty()} when an {@link IOException} occurs
 * @throws RuntimeException if no file system is available
 */
private Optional<ConnectorPageSource> createParaflowPageSource(
        Path path,
        long start,
        long length,
        List<ParaflowColumnHandle> columns)
{
    FileSystem fileSystem = fsFactory.getFileSystem()
            .orElseThrow(() -> new RuntimeException("Could not find filesystem for path " + path));

    try {
        ParquetDataSource dataSource = buildHdfsParquetDataSource(fileSystem, path, start, length);
        // default length is file size, which means whole file is a split
        length = dataSource.getSize();

        ParquetMetadata footer = ParquetMetadataReader.readFooter(fileSystem, path);
        MessageType fileSchema = footer.getFileMetaData().getSchema();

        // Requested schema: valid columns that actually exist in the file.
        List<Type> requestedFields = columns.stream()
                .filter(column -> column.getColType() != ParaflowColumnHandle.ColumnType.NOTVALID)
                .map(column -> getParquetType(column, fileSchema))
                .filter(Objects::nonNull)
                .collect(Collectors.toList());
        MessageType requestedSchema = new MessageType(fileSchema.getName(), requestedFields);

        // Keep only the row groups whose first data page lies within [start, start + length).
        long splitEnd = start + length;
        List<BlockMetaData> selectedBlocks = new ArrayList<>();
        for (BlockMetaData candidate : footer.getBlocks()) {
            long firstDataPage = candidate.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage < start || firstDataPage >= splitEnd) {
                continue;
            }
            selectedBlocks.add(candidate);
        }

        ParquetReader reader = new ParquetReader(
                fileSchema,
                requestedSchema,
                selectedBlocks,
                dataSource,
                typeManager);
        ParaflowPageSource pageSource = new ParaflowPageSource(
                reader,
                dataSource,
                fileSchema,
                requestedSchema,
                length,
                columns,
                typeManager);
        return Optional.of(pageSource);
    }
    catch (IOException e) {
        log.error(e);
        return Optional.empty();
    }
}
 
Example #29
Source File: JsonElementConversionFactory.java    From incubator-gobblin with Apache License 2.0 2 votes vote down vote up
/**
 * Returns the parquet schema as a {@link Type}.
 *
 * @return the {@link Type} supplied by the implementing class
 */
public abstract Type schema();
 
Example #30
Source File: JsonElementConversionFactory.java    From incubator-gobblin with Apache License 2.0 votes vote down vote up
/**
 * Builds a parquet {@link Type}; the implementation is supplied by concrete subclasses.
 * NOTE(review): presumably the result is what {@code schema()} exposes — confirm against the
 * enclosing class, which is not visible here.
 *
 * @return the {@link Type} built by the subclass
 */
abstract Type buildSchema();