parquet.schema.Type Java Examples
The following examples show how to use parquet.schema.Type.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SimpleRecordConverter.java From parquet-tools with Apache License 2.0 | 6 votes |
/**
 * Chooses the converter used for one field of the schema.
 *
 * <p>Non-primitive fields get a nested {@link SimpleRecordConverter} whose parent is this
 * converter. Primitive fields whose original type is UTF8 get a {@code StringConverter};
 * every other primitive field gets a {@code SimplePrimitiveConverter}.
 *
 * @param field the schema field to build a converter for
 * @return the converter for {@code field}
 */
private Converter createConverter(Type field) {
  if (!field.isPrimitive()) {
    return new SimpleRecordConverter(field.asGroupType(), field.getName(), this);
  }

  // UTF8 is the only original type that is treated specially; a missing (null)
  // original type and all other annotations use the plain primitive converter.
  if (field.getOriginalType() == OriginalType.UTF8) {
    return new StringConverter(field.getName());
  }
  return new SimplePrimitiveConverter(field.getName());
}
Example #2
Source File: ExaParquetWriterImpl.java From hadoop-etl-udfs with MIT License | 6 votes |
/**
 * Builds one Parquet {@link PrimitiveType} per type description.
 *
 * <p>A description with a non-zero {@code length} is built with the constructor that takes a
 * length; otherwise the constructor taking an (optional, possibly {@code null}) original type
 * is used.
 *
 * @param exaParquetTypeInfos the type descriptions to convert
 * @return a new list holding the converted types, in input order
 */
private static List<Type> typeInfoToParquetTypes(final List<ExaParquetTypeInfo> exaParquetTypeInfos) {
  final List<Type> parquetTypes = new ArrayList<>();
  for (final ExaParquetTypeInfo info : exaParquetTypeInfos) {
    // Both constructor variants need the same repetition and primitive type name.
    final Type.Repetition repetition = Type.Repetition.valueOf(info.typeRepitition);
    final PrimitiveType.PrimitiveTypeName primitiveName =
        PrimitiveType.PrimitiveTypeName.valueOf(info.primitiveTypeName);

    final PrimitiveType converted;
    if (info.length == 0) {
      final OriginalType annotation =
          info.originalType == null ? null : OriginalType.valueOf(info.originalType);
      converted = new PrimitiveType(repetition, primitiveName, info.name, annotation);
    } else {
      converted = new PrimitiveType(repetition, primitiveName, info.length, info.name);
    }
    parquetTypes.add(converted);
  }
  return parquetTypes;
}
Example #3
Source File: TupleWriter.java From hadoop-etl-udfs with MIT License | 6 votes |
private void writeTuple(Tuple tuple, GroupType type) { for (int index = 0; index < type.getFieldCount(); index++) { Type fieldType = type.getType(index); String fieldName = fieldType.getName(); // empty fields have to be omitted if (tuple.isNull(index)) continue; recordConsumer.startField(fieldName, index); if (fieldType.isPrimitive()) { tuple.writePrimitiveValue(recordConsumer, index, (PrimitiveType)fieldType); } else { recordConsumer.startGroup(); writeTuple(tuple.getTuple(index), fieldType.asGroupType()); recordConsumer.endGroup(); } recordConsumer.endField(fieldName, index); } }
Example #4
Source File: ParquetGroup.java From incubator-gobblin with Apache License 2.0 | 6 votes |
/**
 * Renders this group as text, one line per stored value, each prefixed with {@code indent}.
 *
 * <p>Null values are printed as {@code NULL}, nested groups are printed on the following
 * lines with the indent extended by one space, and anything else is printed via its
 * {@code toString()}.
 *
 * @param indent the prefix placed before every field name
 * @return the textual form of this group
 */
public String toString(String indent) {
  StringBuilder text = new StringBuilder();
  int slot = 0;
  for (Type column : this.schema.getFields()) {
    String label = column.getName();
    // data[slot] holds every value recorded for the field at the same position in the schema.
    for (Object entry : this.data[slot]) {
      text.append(indent).append(label);
      if (entry instanceof Group) {
        String nested = ((ParquetGroup) entry).toString(indent + " ");
        text.append("\n").append(nested);
      } else if (entry == null) {
        text.append(": NULL\n");
      } else {
        text.append(": ").append(entry.toString()).append("\n");
      }
    }
    slot++;
  }
  return text.toString();
}
Example #5
Source File: PentahoParquetReadSupport.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
@Override public ReadContext init( InitContext context ) { String schemaStr = context.getConfiguration().get( ParquetConverter.PARQUET_SCHEMA_CONF_KEY ); if ( schemaStr == null ) { throw new RuntimeException( "Schema not defined in the PentahoParquetSchema key" ); } ParquetInputFieldList schema = ParquetInputFieldList.unmarshall( schemaStr ); converter = new ParquetConverter( schema.getFields() ); // get all fields from file's schema MessageType fileSchema = context.getFileSchema(); List<Type> newFields = new ArrayList<>(); // use only required fields for ( IParquetInputField f : schema ) { Type origField = fileSchema.getFields().get( fileSchema.getFieldIndex( f.getFormatFieldName() ) ); newFields.add( origField ); } if ( newFields.isEmpty() ) { throw new RuntimeException( "Fields should be declared" ); } MessageType newSchema = new MessageType( fileSchema.getName(), newFields ); return new ReadContext( newSchema, new HashMap<>() ); }
Example #6
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0 | 6 votes |
/**
 * Builds the schema for this column: an optional or required group that wraps a repeated
 * group (named {@code MAP_KEY}) made of the key converter's and element converter's schemas.
 *
 * @return the group type for this column, or {@code null} if the repetition is neither
 *         OPTIONAL nor REQUIRED
 */
@Override
protected Type buildSchema() {
  JsonElementConverter valueConverter = this.elementConverter;
  JsonElementConverter mapKeyConverter = getKeyConverter();

  GroupType keyValueGroup = Types.repeatedGroup()
      .addFields(mapKeyConverter.schema(), valueConverter.schema())
      .named(MAP_KEY)
      .asGroupType();
  String name = this.jsonSchema.getColumnName();

  Type.Repetition repetition = optionalOrRequired(this.jsonSchema);
  switch (repetition) {
    case OPTIONAL:
      return Types.optionalGroup().addFields(keyValueGroup).named(name).asGroupType();
    case REQUIRED:
      return Types.requiredGroup().addFields(keyValueGroup).named(name).asGroupType();
    default:
      return null;
  }
}
Example #7
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0 | 6 votes |
/**
 * Builds the record schema from the data type values of the JSON schema, registering one
 * converter per column in {@code converters} along the way.
 *
 * @return a {@link MessageType} for a ROOT record, or a {@link GroupType} for a CHILD record
 * @throws RuntimeException if the record type is neither ROOT nor CHILD
 */
private Type buildSchema() {
  JsonArray fieldDefinitions = this.jsonSchema.getDataTypeValues();
  List<Type> fieldTypes = new ArrayList<>();

  for (JsonElement definition : fieldDefinitions) {
    JsonSchema fieldSchema = new JsonSchema((JsonObject) definition);
    String fieldName = fieldSchema.getColumnName();
    JsonElementConverter fieldConverter = JsonElementConversionFactory.getConverter(fieldSchema, false);
    Type fieldType = fieldConverter.schema();

    this.converters.put(fieldName, fieldConverter);
    fieldTypes.add(fieldType);
  }

  String recordName = this.jsonSchema.getColumnName();
  switch (recordType) {
    case ROOT:
      return new MessageType(recordName, fieldTypes);
    case CHILD:
      return new GroupType(optionalOrRequired(this.jsonSchema), recordName, fieldTypes);
    default:
      throw new RuntimeException("Unsupported Record type");
  }
}
Example #8
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0 | 6 votes |
/**
 * Converts a JSON object into a {@link ParquetGroup}, converting each member with the
 * converter registered under the member's name.
 *
 * <p>A member whose converted value is {@code null} is left out when its column is OPTIONAL;
 * every other member is added to the group.
 *
 * @param value the JSON element to convert; read as a JSON object
 * @return the populated group
 */
@Override
Object convertField(JsonElement value) {
  ParquetGroup record = new ParquetGroup((GroupType) schema());

  for (Map.Entry<String, JsonElement> member : value.getAsJsonObject().entrySet()) {
    String fieldName = member.getKey();
    JsonElementConverter fieldConverter = this.converters.get(fieldName);
    Object converted = fieldConverter.convert(member.getValue());
    Type.Repetition repetition = optionalOrRequired(fieldConverter.jsonSchema);

    boolean skippable = converted == null && repetition.equals(OPTIONAL);
    if (!skippable) {
      record.add(fieldName, converted);
    }
  }
  return record;
}
Example #9
Source File: ParaflowPageSourceProvider.java From paraflow with Apache License 2.0 | 5 votes |
private Type getParquetType(ParaflowColumnHandle column, MessageType messageType) { if (messageType.containsField(column.getName())) { return messageType.getType(column.getName()); } // parquet is case-insensitive, all hdfs-columns get converted to lowercase for (Type type : messageType.getFields()) { if (type.getName().equalsIgnoreCase(column.getName())) { return type; } } return null; }
Example #10
Source File: SimpleRecordConverter.java From parquet-tools with Apache License 2.0 | 5 votes |
/**
 * Creates a record converter with one child converter per field of {@code schema}.
 *
 * @param schema the group schema whose fields are converted
 * @param name   the name stored for this converter
 * @param parent the converter stored as this converter's parent
 */
public SimpleRecordConverter(GroupType schema, String name, SimpleRecordConverter parent) {
  this.converters = new Converter[schema.getFieldCount()];
  this.parent = parent;
  this.name = name;

  // Child converters are stored at the same position as their field in the schema.
  for (int pos = 0; pos < this.converters.length; pos++) {
    this.converters[pos] = createConverter(schema.getType(pos));
  }
}
Example #11
Source File: MetadataUtils.java From parquet-tools with Apache License 2.0 | 5 votes |
/**
 * Dispatches to the group or primitive overload according to the runtime class of
 * {@code type}. A type that is neither a group nor a primitive prints nothing.
 *
 * @param out       the writer to print to
 * @param type      the type to describe
 * @param depth     forwarded unchanged to the selected overload
 * @param container forwarded unchanged to the selected overload
 * @param cpath     forwarded unchanged to the selected overload
 */
private static void showDetails(PrettyPrintWriter out, Type type, int depth, MessageType container, List<String> cpath) {
  if (type instanceof GroupType) {
    showDetails(out, type.asGroupType(), depth, container, cpath);
  } else if (type instanceof PrimitiveType) {
    showDetails(out, type.asPrimitiveType(), depth, container, cpath);
  }
}
Example #12
Source File: MetadataUtils.java From parquet-tools with Apache License 2.0 | 5 votes |
/**
 * Prints one summary line for a group (dotted name, repetition and field count) and then
 * the details of each of its fields one level deeper.
 *
 * @param out       the writer to print to
 * @param type      the group to describe
 * @param depth     the number of leading dots printed before the group's name
 * @param container passed through to the fields' details
 * @param cpath     the path of enclosing group names; this group's name is appended while
 *                  its fields are printed and removed afterwards
 */
private static void showDetails(PrettyPrintWriter out, GroupType type, int depth, MessageType container, List<String> cpath) {
  String groupName = type.getName();
  String dottedName = Strings.repeat(".", depth) + groupName;
  out.format("%s: %s F:%d%n", dottedName, type.getRepetition(), type.getFieldCount());

  // Push this group onto the path for the duration of the child traversal.
  cpath.add(groupName);
  for (Type child : type.getFields()) {
    showDetails(out, child, depth + 1, container, cpath);
  }
  cpath.remove(cpath.size() - 1);
}
Example #13
Source File: ParquetConverter.java From pentaho-hadoop-shims with Apache License 2.0 | 5 votes |
/**
 * Builds an input field for every primitive top-level field of the schema.
 * Non-primitive fields are skipped.
 *
 * @param schema the schema to read fields from
 * @return a new list with one input field per primitive schema field, in schema order
 */
public static List<IParquetInputField> buildInputFields( MessageType schema ) {
  List<IParquetInputField> result = new ArrayList<>();
  for ( Type column : schema.getFields() ) {
    if ( !column.isPrimitive() ) {
      continue;
    }
    result.add( convertField( column ) );
  }
  return result;
}
Example #14
Source File: PentahoParquetWriteSupport.java From pentaho-hadoop-shims with Apache License 2.0 | 5 votes |
/**
 * Builds the Parquet message schema, named {@code parquet-schema}, from the configured
 * output fields.
 *
 * @return the message type holding one primitive type per output field
 * @throws IllegalArgumentException if there are no output fields
 */
private MessageType createParquetSchema() {
  List<Type> columns = new ArrayList<>();
  for ( IParquetOutputField field : outputFields ) {
    PrimitiveType column = convertToPrimitiveType( field );
    columns.add( column );
  }

  if ( columns.isEmpty() ) {
    throw new IllegalArgumentException( "Schema should contain at least one field" );
  }
  return new MessageType( "parquet-schema", columns );
}
Example #15
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Builds a BINARY type annotated as UTF8 and named after the column. The type is repeated
 * when this converter is flagged as repeated; otherwise it is optional or required
 * according to the JSON schema.
 *
 * @return the primitive type for this column
 * @throws RuntimeException if the repetition is neither OPTIONAL nor REQUIRED
 */
@Override
protected Type buildSchema() {
  String name = this.jsonSchema.getColumnName();

  if (!this.repeated) {
    switch (optionalOrRequired(this.jsonSchema)) {
      case OPTIONAL:
        return Types.optional(BINARY).as(UTF8).named(name);
      case REQUIRED:
        return Types.required(BINARY).as(UTF8).named(name);
      default:
        throw new RuntimeException("Unsupported Repetition type");
    }
  }
  return Types.repeated(BINARY).as(UTF8).named(name);
}
Example #16
Source File: ParquetGroup.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Appends a value to the field at {@code fieldIndex}.
 *
 * @param fieldIndex the position of the field in the schema
 * @param value      the value to store
 * @throws IllegalStateException if the field is not REPEATED and already holds a value
 */
public void add(int fieldIndex, Primitive value) {
  Type fieldType = this.schema.getType(fieldIndex);
  List<Object> stored = this.data[fieldIndex];

  boolean singleValued = !fieldType.isRepetition(REPEATED);
  if (singleValued && !stored.isEmpty()) {
    throw new IllegalStateException(
        "field " + fieldIndex + " (" + fieldType.getName() + ") can not have more than one value: " + stored);
  }
  stored.add(value);
}
Example #17
Source File: ParquetMetadataStat.java From rainbow with Apache License 2.0 | 5 votes |
/**
 * Collects the names of the fields (columns).
 *
 * @return a new list holding the name of each entry of {@code fields}, in iteration order
 */
@Override
public List<String> getFieldNames() {
  List<String> fieldNames = new ArrayList<String>();
  for (Type field : this.fields) {
    String fieldName = field.getName();
    fieldNames.add(fieldName);
  }
  return fieldNames;
}
Example #18
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0 | 4 votes |
/**
 * Builds a group, named after the column, whose only field is the element converter's
 * schema. The group is optional or required according to the JSON schema.
 *
 * @return the group type for this column
 */
@Override
protected Type buildSchema() {
  List<Type> elementFields = new ArrayList<>();
  elementFields.add(this.elementConverter.schema());

  Type.Repetition repetition = optionalOrRequired(jsonSchema);
  String name = this.jsonSchema.getColumnName();
  return new GroupType(repetition, name, elementFields);
}
Example #19
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0 | 4 votes |
/**
 * Uses the element converter's schema as this converter's schema.
 *
 * @return the schema reported by the element converter
 */
@Override
protected Type buildSchema() {
  Type elementSchema = this.elementConverter.schema();
  return elementSchema;
}
Example #20
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0 | 4 votes |
/**
 * Maps the nullability of a JSON schema to a Parquet repetition.
 *
 * @param jsonBaseSchema the schema to inspect
 * @return {@code OPTIONAL} when the schema is nullable, {@code REQUIRED} otherwise
 */
public static Type.Repetition optionalOrRequired(JsonSchema jsonBaseSchema) {
  if (jsonBaseSchema.isNullable()) {
    return OPTIONAL;
  }
  return REQUIRED;
}
Example #21
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0 | 4 votes |
/**
 * Returns the schema held in the {@code schema} field.
 *
 * @return the stored {@link Type}
 */
@Override
public Type schema() {
  return schema;
}
Example #22
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0 | 4 votes |
/**
 * Exposes the {@link Type} kept in the {@code schema} field.
 *
 * @return the value of the {@code schema} field
 */
@Override
public Type schema() {
  return schema;
}
Example #23
Source File: PentahoParquetWriteSupport.java From pentaho-hadoop-shims with Apache License 2.0 | 4 votes |
/**
 * Maps an output field to the Parquet primitive type used to store it.
 *
 * <p>The type is optional when the field allows nulls and required otherwise, and is named
 * after the field's format field name. Decimal variants also carry the field's precision
 * and scale.
 *
 * @param f the output field to convert
 * @return the primitive type for {@code f}
 * @throws RuntimeException if the field's Parquet type has no mapping here
 */
private PrimitiveType convertToPrimitiveType( IParquetOutputField f ) {
  Type.Repetition repetition = f.getAllowNull() ? Type.Repetition.OPTIONAL : Type.Repetition.REQUIRED;
  String name = f.getFormatFieldName();

  switch ( f.getParquetType() ) {
    // Plain physical types, no annotation.
    case BINARY:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.BINARY, name );
    case BOOLEAN:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.BOOLEAN, name );
    case DOUBLE:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.DOUBLE, name );
    case FLOAT:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.FLOAT, name );
    case INT_32:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.INT32, name );
    case INT_96:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.INT96, name );

    // Physical types carrying an original-type annotation.
    case UTF8:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.BINARY, name, OriginalType.UTF8 );
    case INT_64:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.INT64, name, OriginalType.INT_64 );
    case DATE:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.INT32, name, OriginalType.DATE );
    case TIMESTAMP_MILLIS:
      return new PrimitiveType( repetition, PrimitiveType.PrimitiveTypeName.INT64, name,
        OriginalType.TIMESTAMP_MILLIS );

    // Decimals differ only in the physical type that backs them.
    case DECIMAL:
      return buildDecimalType( f, PrimitiveType.PrimitiveTypeName.BINARY, name );
    case DECIMAL_INT_32:
      return buildDecimalType( f, PrimitiveType.PrimitiveTypeName.INT32, name );
    case DECIMAL_INT_64:
      return buildDecimalType( f, PrimitiveType.PrimitiveTypeName.INT64, name );

    default:
      throw new RuntimeException( "Unsupported output type: " + f.getParquetType() );
  }
}

/**
 * Builds a DECIMAL-annotated primitive type with the field's precision and scale, stored as
 * {@code storage}; optional when the field allows nulls, required otherwise.
 */
private PrimitiveType buildDecimalType( IParquetOutputField f, PrimitiveType.PrimitiveTypeName storage,
                                        String name ) {
  if ( f.getAllowNull() ) {
    return Types.optional( storage ).as( OriginalType.DECIMAL )
      .precision( f.getPrecision() ).scale( f.getScale() ).named( name );
  }
  return Types.required( storage ).as( OriginalType.DECIMAL )
    .precision( f.getPrecision() ).scale( f.getScale() ).named( name );
}
Example #24
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0 | 4 votes |
/**
 * Gives access to the {@link Type} stored in the {@code schema} field.
 *
 * @return the {@code schema} field
 */
@Override
public Type schema() {
  return schema;
}
Example #25
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0 | 4 votes |
/**
 * Builds a primitive type of this converter's output type, named after the column. The type
 * is REPEATED when this converter is flagged as repeated; otherwise it is optional or
 * required according to the JSON schema.
 *
 * @return the primitive type for this column
 */
protected Type buildSchema() {
  Type.Repetition repetition;
  if (this.repeated) {
    repetition = REPEATED;
  } else {
    repetition = optionalOrRequired(this.jsonSchema);
  }
  return new PrimitiveType(repetition, this.outputType, this.jsonSchema.getColumnName());
}
Example #26
Source File: MetadataUtils.java From parquet-tools with Apache License 2.0 | 4 votes |
/**
 * Prints the details of every top-level field of a message schema, starting at depth 0 with
 * the schema itself as the container and an initially empty column path.
 *
 * @param out  the writer to print to
 * @param type the message schema to describe
 */
public static void showDetails(PrettyPrintWriter out, MessageType type) {
  List<String> columnPath = new ArrayList<String>();
  int fieldCount = type.getFieldCount();
  for (int pos = 0; pos < fieldCount; pos++) {
    showDetails(out, type.getType(pos), 0, type, columnPath);
  }
}
Example #27
Source File: MetadataUtils.java From parquet-tools with Apache License 2.0 | 4 votes |
/**
 * Prints the details of a single type at depth 0, without a containing message schema.
 *
 * @param out  the writer to print to
 * @param type the type to describe; may be a group or a primitive
 */
public static void showDetails(PrettyPrintWriter out, Type type) {
  // The GroupType overload appends to and removes from the path list unconditionally, so
  // passing null here threw a NullPointerException for any group type. Start from an empty
  // path instead; the container stays null because there is no enclosing schema.
  showDetails(out, type, 0, null, new ArrayList<String>());
}
Example #28
Source File: ParaflowPageSourceProvider.java From paraflow with Apache License 2.0 | 4 votes |
private Optional<ConnectorPageSource> createParaflowPageSource( Path path, long start, long length, List<ParaflowColumnHandle> columns) { Optional<FileSystem> fileSystemOptional = fsFactory.getFileSystem(); FileSystem fileSystem; ParquetDataSource dataSource; if (fileSystemOptional.isPresent()) { fileSystem = fileSystemOptional.get(); } else { throw new RuntimeException("Could not find filesystem for path " + path); } try { dataSource = buildHdfsParquetDataSource(fileSystem, path, start, length); // default length is file size, which means whole file is a split length = dataSource.getSize(); ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path); FileMetaData fileMetaData = parquetMetadata.getFileMetaData(); MessageType fileSchema = fileMetaData.getSchema(); List<Type> fields = columns.stream() .filter(column -> column.getColType() != ParaflowColumnHandle.ColumnType.NOTVALID) .map(column -> getParquetType(column, fileSchema)) .filter(Objects::nonNull) .collect(Collectors.toList()); MessageType requestedSchema = new MessageType(fileSchema.getName(), fields); List<BlockMetaData> blocks = new ArrayList<>(); for (BlockMetaData block : parquetMetadata.getBlocks()) { long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset(); if (firstDataPage >= start && firstDataPage < start + length) { blocks.add(block); } } ParquetReader parquetReader = new ParquetReader( fileSchema, requestedSchema, blocks, dataSource, typeManager); return Optional.of(new ParaflowPageSource( parquetReader, dataSource, fileSchema, requestedSchema, length, columns, typeManager)); } catch (IOException e) { log.error(e); return Optional.empty(); } }
Example #29
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0 | 2 votes |
/**
 * Returns the parquet schema as a {@link Type}.
 *
 * @return the parquet schema
 */
public abstract Type schema();
Example #30
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0 | votes |
/**
 * Builds a parquet schema {@link Type}; implemented by subclasses.
 *
 * @return the schema that was built
 */
abstract Type buildSchema();