org.apache.parquet.schema.Type Java Examples

The following examples show how to use org.apache.parquet.schema.Type. Each example is taken from an open-source project; the source file and project are noted above it.
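
Before the examples, here is a minimal, self-contained sketch of the Type API itself: it builds a schema with the org.apache.parquet.schema.Types builder and walks each field. The class name TypeTour and the field names are illustrative only, not taken from any of the projects below.

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

public class TypeTour {
  public static void main(String[] args) {
    // Build: message Document { required int64 DocId;
    //                           optional group Links { repeated binary Backward; repeated binary Forward; } }
    MessageType schema = Types.buildMessage()
        .required(PrimitiveTypeName.INT64).named("DocId")
        .optionalGroup()
            .repeated(PrimitiveTypeName.BINARY).named("Backward")
            .repeated(PrimitiveTypeName.BINARY).named("Forward")
        .named("Links")
        .named("Document");

    // Inspect: every field is a Type; group fields can be narrowed with asGroupType()
    for (Type field : schema.getFields()) {
      System.out.println(field.getName()
          + " repetition=" + field.getRepetition()
          + " primitive=" + field.isPrimitive());
    }
  }
}
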
Example #1
Source File: AvroRecordConverter.java    From parquet-mr with Apache License 2.0
public AvroCollectionConverter(ParentValueContainer parent, GroupType type,
                               Schema avroSchema, GenericData model,
                               Class<?> containerClass) {
  this.parent = parent;
  this.avroSchema = avroSchema;
  this.containerClass = containerClass;
  Schema elementSchema = AvroSchemaConverter.getNonNull(avroSchema.getElementType());
  Type repeatedType = type.getType(0);
  // always determine whether the repeated type is the element type by
  // matching it against the element schema.
  if (isElementType(repeatedType, elementSchema)) {
    // the element type is the repeated type (and required)
    converter = newConverter(elementSchema, repeatedType, model, new ParentValueContainer() {
      @Override
      @SuppressWarnings("unchecked")
      public void add(Object value) {
        container.add(value);
      }
    });
  } else {
    // the element is wrapped in a synthetic group and may be optional
    converter = new ElementConverter(repeatedType.asGroupType(), elementSchema, model);
  }
}
 
Example #2
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0
@Test
public void testPruneMultiColumns() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove columns
  String[] cargs = {inputFile, outputFile, "Name", "Gender"};
  executeCommandLine(cargs);

  // Verify the schema is unchanged for the columns that were not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(2, fields.size());
  assertEquals("DocId", fields.get(0).getName());
  assertEquals("Links", fields.get(1).getName());
  List<Type> subFields = fields.get(1).asGroupType().getFields();
  assertEquals(2, subFields.size());
  assertEquals("Backward", subFields.get(0).getName());
  assertEquals("Forward", subFields.get(1).getName());

  // Verify the data is unchanged for the columns that were not pruned
  List<String> prunePaths = Arrays.asList("Name", "Gender");
  validateColumns(inputFile, prunePaths);
}
 
Example #3
Source File: ThriftSchemaConverter.java    From parquet-mr with Apache License 2.0
/**
 * Returns whether the given type is the element type of a list or is a
 * synthetic group with one field that is the element type. This is
 * determined by checking whether the type can be a synthetic group and by
 * checking whether a potential synthetic group matches the expected
 * ThriftField.
 * <p>
 * This method never guesses because the expected ThriftField is known.
 *
 * @param repeatedType a type that may be the element type
 * @param thriftElement the expected ThriftField for list elements
 * @return {@code true} if the repeatedType is the element type
 */
static boolean isListElementType(Type repeatedType,
                                 ThriftField thriftElement) {
  if (repeatedType.isPrimitive() ||
      (repeatedType.asGroupType().getFieldCount() != 1) ||
      (repeatedType.asGroupType().getType(0).isRepetition(REPEATED))) {
    // The repeated type must be the element type because it is not a valid
    // synthetic wrapper: a wrapper is a group with exactly one optional or required field
    return true;
  } else if (thriftElement != null && thriftElement.getType() instanceof StructType) {
    Set<String> fieldNames = new HashSet<String>();
    for (ThriftField field : ((StructType) thriftElement.getType()).getChildren()) {
      fieldNames.add(field.getName());
    }
    // If the repeated type is a subset of the structure of the ThriftField,
    // then it must be the element type.
    return fieldNames.contains(repeatedType.asGroupType().getFieldName(0));
  }
  return false;
}
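
For context, the two list layouts this check distinguishes can be written out with MessageTypeParser. A sketch with invented field names:

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

// Standard three-level list: the repeated group "list" is a synthetic
// wrapper around the element field.
MessageType threeLevel = MessageTypeParser.parseMessageType(
    "message doc {\n" +
    "  required group tags (LIST) {\n" +
    "    repeated group list {\n" +
    "      required binary element (UTF8);\n" +
    "    }\n" +
    "  }\n" +
    "}");

// Legacy two-level list: the repeated field itself is the element type;
// this is the layout for which isListElementType() returns true immediately,
// since the repeated type is primitive.
MessageType twoLevel = MessageTypeParser.parseMessageType(
    "message doc {\n" +
    "  required group tags (LIST) {\n" +
    "    repeated binary element (UTF8);\n" +
    "  }\n" +
    "}");
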
 
Example #4
Source File: ThriftRecordConverter.java    From parquet-mr with Apache License 2.0
public ElementConverter(String listName, List<TProtocol> listEvents,
                        GroupType repeatedType, ThriftField thriftElement) {
  this.listEvents = listEvents;
  this.elementEvents = new ArrayList<TProtocol>();
  Type elementType = repeatedType.getType(0);
  if (elementType.isRepetition(Type.Repetition.OPTIONAL)) {
    if (ignoreNullElements) {
      LOG.warn("List " + listName +
          " has optional elements: null elements are ignored.");
    } else {
      throw new ParquetDecodingException("Cannot read list " + listName +
          " with optional elements: set " + IGNORE_NULL_LIST_ELEMENTS +
          " to ignore nulls.");
    }
  }
  elementConverter = newConverter(elementEvents, elementType, thriftElement);
}
 
Example #5
Source File: SimpleGroupConverter.java    From parquet-mr with Apache License 2.0
SimpleGroupConverter(SimpleGroupConverter parent, int index, GroupType schema) {
  this.parent = parent;
  this.index = index;

  converters = new Converter[schema.getFieldCount()];

  for (int i = 0; i < converters.length; i++) {
    final Type type = schema.getType(i);
    if (type.isPrimitive()) {
      converters[i] = new SimplePrimitiveConverter(this, i);
    } else {
      converters[i] = new SimpleGroupConverter(this, i, type.asGroupType());
    }
  }
}
 
Example #6
Source File: GroupWriter.java    From parquet-mr with Apache License 2.0
private void writeGroup(Group group, GroupType type) {
  int fieldCount = type.getFieldCount();
  for (int field = 0; field < fieldCount; ++field) {
    int valueCount = group.getFieldRepetitionCount(field);
    if (valueCount > 0) {
      Type fieldType = type.getType(field);
      String fieldName = fieldType.getName();
      recordConsumer.startField(fieldName, field);
      for (int index = 0; index < valueCount; ++index) {
        if (fieldType.isPrimitive()) {
          group.writeValue(field, index, recordConsumer);
        } else {
          recordConsumer.startGroup();
          writeGroup(group.getGroup(field, index), fieldType.asGroupType());
          recordConsumer.endGroup();
        }
      }
      recordConsumer.endField(fieldName, field);
    }
  }
}
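
writeGroup() is driven entirely by the schema: getFieldRepetitionCount() bounds the per-field loop and isPrimitive() picks the branch. A sketch of building a Group whose counts feed that loop, using parquet-mr's example object model (the schema here is invented):

import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

MessageType schema = MessageTypeParser.parseMessageType(
    "message user { required binary name (UTF8); repeated int32 scores; }");

Group group = new SimpleGroupFactory(schema).newGroup()
    .append("name", "alice")
    .append("scores", 10)
    .append("scores", 20);

// The same call that bounds the inner loop in writeGroup() above
int scoresIndex = schema.getFieldIndex("scores");
System.out.println(group.getFieldRepetitionCount(scoresIndex)); // prints 2
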
 
Example #7
Source File: ProtoWriteSupport.java    From parquet-mr with Apache License 2.0
private FieldWriter createWriter(FieldDescriptor fieldDescriptor, Type type) {
  switch (fieldDescriptor.getJavaType()) {
    case STRING: return new StringWriter();
    case MESSAGE: return createMessageWriter(fieldDescriptor, type);
    case INT: return new IntWriter();
    case LONG: return new LongWriter();
    case FLOAT: return new FloatWriter();
    case DOUBLE: return new DoubleWriter();
    case ENUM: return new EnumWriter();
    case BOOLEAN: return new BooleanWriter();
    case BYTE_STRING: return new BinaryWriter();
  }

  return unknownType(fieldDescriptor); // should not be reached; unknownType always throws
}
 
Example #8
Source File: ParquetSchemaUtil.java    From iceberg with Apache License 2.0
/**
 * Prunes columns from a Parquet file schema that was written without field ids.
 * <p>
 * Files that were written without field ids are read assuming that schema
 * evolution preserved column order and that no columns were deleted.
 * <p>
 * The order of columns in the resulting Parquet schema matches the Parquet file.
 *
 * @param fileSchema schema from a Parquet file that does not have field ids.
 * @param expectedSchema expected schema
 * @return a parquet schema pruned using the expected schema
 */
public static MessageType pruneColumnsFallback(MessageType fileSchema, Schema expectedSchema) {
  Set<Integer> selectedIds = Sets.newHashSet();

  for (Types.NestedField field : expectedSchema.columns()) {
    selectedIds.add(field.fieldId());
  }

  MessageTypeBuilder builder = org.apache.parquet.schema.Types.buildMessage();

  int ordinal = 1;
  for (Type type : fileSchema.getFields()) {
    if (selectedIds.contains(ordinal)) {
      builder.addField(type.withId(ordinal));
    }
    ordinal += 1;
  }

  return builder.named(fileSchema.getName());
}
 
Example #9
Source File: DataWritableGroupConverter.java    From parquet-mr with Apache License 2.0
public DataWritableGroupConverter(final GroupType selectedGroupType,
    final HiveGroupConverter parent, final int index, final GroupType containingGroupType) {
  this.parent = parent;
  this.index = index;
  final int totalFieldCount = containingGroupType.getFieldCount();
  final int selectedFieldCount = selectedGroupType.getFieldCount();

  currentArr = new Object[totalFieldCount];
  converters = new Converter[selectedFieldCount];

  List<Type> selectedFields = selectedGroupType.getFields();
  for (int i = 0; i < selectedFieldCount; i++) {
    Type subtype = selectedFields.get(i);
    if (containingGroupType.getFields().contains(subtype)) {
      converters[i] = getConverterFromDescription(subtype,
          containingGroupType.getFieldIndex(subtype.getName()), this);
    } else {
      throw new IllegalStateException("Group type [" + containingGroupType +
          "] does not contain requested field: " + subtype);
    }
  }
}
 
Example #10
Source File: HiveClientTest.java    From garmadon with Apache License 2.0
@Test
public void createTableWithoutIssue() throws SQLException {
    PrimitiveType appId = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.BINARY, "app_id");

    MessageType schema = new MessageType("fs", appId);

    String table = "fs";
    String location = "file:" + hdfsTemp + "/garmadon_database/fs";
    HiveClient hiveClient = new HiveClient(driverName, "jdbc:hive2://localhost:" + port, "garmadon",
        hdfsTemp + "/garmadon_database");
    hiveClient.createTableIfNotExist(table, schema, location);

    HashMap<String, String> result = getResultHashTableDesc(hiveClient, table);
    assertEquals(location, result.get("Location"));
    assertEquals("EXTERNAL_TABLE", result.get("Table Type").trim());
    assertEquals("string", result.get("day"));
    assertEquals("string", result.get("app_id"));
}
 
Example #11
Source File: ParquetGroup.java    From incubator-gobblin with Apache License 2.0
public String toString(String indent) {
  StringBuilder result = new StringBuilder();
  int i = 0;
  for (Type field : this.schema.getFields()) {
    String name = field.getName();
    List<Object> values = this.data[i];
    for (Object value : values) {
      result.append(indent).append(name);
      if (value == null) {
        result.append(": NULL\n");
      } else if (value instanceof Group) {
        result.append("\n").append(((ParquetGroup) value).toString(indent + "  "));
      } else {
        result.append(": ").append(value.toString()).append("\n");
      }
    }
    i++;
  }
  return result.toString();
}
 
Example #12
Source File: LogicalListL2Converter.java    From dremio-oss with Apache License 2.0
@Override
protected void addChildConverter(String fieldName, OutputMutator mutator, List<Field> arrowSchema, Iterator<SchemaPath> colIterator, Type type, Function<String, String> childNameResolver) {
  final String nameForChild = "inner";
  // Column name to ID mapping creates child entry as 'columnName'.list.element
  // So, we will append 'list.element' so that name to ID matching works correctly
  final String fullChildName = fieldName.concat(".").concat("list.element");
  if (type.isPrimitive()) {
    converters.add(getConverterForType(fullChildName, type.asPrimitiveType()));
  } else {
    final GroupType groupType = type.asGroupType();
    Collection<SchemaPath> c = Lists.newArrayList(colIterator);
    if (arrowSchema != null) {
      converters.add(groupConverterFromArrowSchema(fullChildName, "$data$", groupType, c));
    } else {
      converters.add(defaultGroupConverter(fullChildName, mutator, groupType, c, null));
    }
  }
}
 
Example #13
Source File: AvroWriteSupport.java    From parquet-mr with Apache License 2.0
private void writeRecordFields(GroupType schema, Schema avroSchema,
                               Object record) {
  List<Type> fields = schema.getFields();
  List<Schema.Field> avroFields = avroSchema.getFields();
  int index = 0; // parquet ignores Avro nulls, so index may differ
  for (int avroIndex = 0; avroIndex < avroFields.size(); avroIndex++) {
    Schema.Field avroField = avroFields.get(avroIndex);
    if (avroField.schema().getType().equals(Schema.Type.NULL)) {
      continue;
    }
    Type fieldType = fields.get(index);
    Object value = model.getField(record, avroField.name(), avroIndex);
    if (value != null) {
      recordConsumer.startField(fieldType.getName(), index);
      writeValue(fieldType, avroField.schema(), value);
      recordConsumer.endField(fieldType.getName(), index);
    } else if (fieldType.isRepetition(Type.Repetition.REQUIRED)) {
      throw new RuntimeException("Null-value for required field: " + avroField.name());
    }
    index++;
  }
}
 
Example #14
Source File: AvroSchemaConverterLogicalTypesPre19.java    From datacollector with Apache License 2.0
private Schema convertFields(String name, List<Type> parquetFields) {
  List<Schema.Field> fields = new ArrayList<Schema.Field>();
  for (Type parquetType : parquetFields) {
    Schema fieldSchema = convertField(parquetType);
    if (parquetType.isRepetition(REPEATED)) {
      throw new UnsupportedOperationException("REPEATED not supported outside LIST or MAP. Type: " + parquetType);
    } else if (parquetType.isRepetition(Type.Repetition.OPTIONAL)) {
      fields.add(new Schema.Field(
          parquetType.getName(), optional(fieldSchema), null, NullNode.getInstance()));
    } else { // REQUIRED
      fields.add(new Schema.Field(parquetType.getName(), fieldSchema, null, null));
    }
  }
  Schema schema = Schema.createRecord(name, null, null, false);
  schema.setFields(fields);
  return schema;
}
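
For comparison, the stock parquet-avro converter performs the inverse mapping (Avro to Parquet) with the same repetition rules: an Avro union with null becomes an OPTIONAL Parquet field, mirroring the OPTIONAL branch in convertFields() above. A minimal sketch with an invented record schema:

import org.apache.avro.Schema;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.schema.MessageType;

Schema avro = new Schema.Parser().parse(
    "{\"type\": \"record\", \"name\": \"User\", \"fields\": ["
    + "{\"name\": \"name\", \"type\": \"string\"},"
    + "{\"name\": \"age\", \"type\": [\"null\", \"int\"], \"default\": null}"
    + "]}");

// The resulting schema has a required binary "name" and an optional int32 "age"
MessageType parquet = new AvroSchemaConverter().convert(avro);
System.out.println(parquet);
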
 
Example #15
Source File: SimpleGroup.java    From parquet-mr with Apache License 2.0
private StringBuilder appendToString(StringBuilder builder, String indent) {
  int i = 0;
  for (Type field : schema.getFields()) {
    String name = field.getName();
    List<Object> values = data[i];
    ++i;
    if (values != null && !values.isEmpty()) {
      for (Object value : values) {
        builder.append(indent).append(name);
        if (value == null) {
          builder.append(": NULL\n");
        } else if (value instanceof Group) {
          builder.append('\n');
          ((SimpleGroup) value).appendToString(builder, indent + "  ");
        } else {
          builder.append(": ").append(value.toString()).append('\n');
        }
      }
    }
  }
  return builder;
}
 
Example #16
Source File: ParquetValueReaders.java    From iceberg with Apache License 2.0
public static <T> ParquetValueReader<T> option(Type type, int definitionLevel,
                                               ParquetValueReader<T> reader) {
  if (type.isRepetition(Type.Repetition.OPTIONAL)) {
    return new OptionReader<>(definitionLevel, reader);
  }
  return reader;
}
 
Example #17
Source File: AvroSchemaConverterLogicalTypesPre19.java    From datacollector with Apache License 2.0
public MessageType convert(Schema avroSchema) {
  LOG.info("Using customized AvroSchemaConverter utility to convert: " + avroSchema.toString());
  if (!avroSchema.getType().equals(Schema.Type.RECORD)) {
    throw new IllegalArgumentException("Avro schema must be a record.");
  }

  return new MessageType(avroSchema.getFullName(), convertFields(avroSchema.getFields()));
}
 
Example #18
Source File: TajoRecordConverter.java    From tajo with Apache License 2.0
/**
 * Creates a new TajoRecordConverter.
 *
 * @param parquetSchema The Parquet schema of the projection.
 * @param tajoReadSchema The Tajo schema of the table.
 * @param projectionMap An array mapping the projection column to the column
 *                      index in the table.
 */
public TajoRecordConverter(GroupType parquetSchema, Schema tajoReadSchema,
                           int[] projectionMap) {
  this.parquetSchema = parquetSchema;
  this.tajoReadSchema = tajoReadSchema;
  this.projectionMap = projectionMap;
  this.tupleSize = tajoReadSchema.size();

  // The projectionMap.length does not match parquetSchema.getFieldCount()
  // when the projection contains NULL_TYPE columns. We will skip over the
  // NULL_TYPE columns when we construct the converters and populate the
  // NULL_TYPE columns with NullDatums in start().
  int index = 0;
  this.converters = new Converter[parquetSchema.getFieldCount()];
  for (int i = 0; i < projectionMap.length; ++i) {
    final int projectionIndex = projectionMap[i];
    Column column = tajoReadSchema.getColumn(projectionIndex);
    if (column.getDataType().getType() == TajoDataTypes.Type.NULL_TYPE) {
      continue;
    }
    Type type = parquetSchema.getType(index);
    final int writeIndex = i;
    converters[index] = newConverter(column, type, new ParentValueContainer() {
      @Override
      void add(Object value) {
        TajoRecordConverter.this.set(writeIndex, value);
      }
    });
    ++index;
  }
}
 
Example #19
Source File: ParquetFileAccessor.java    From pxf with Apache License 2.0
/**
 * Returns the parquet record filter for the given filter string
 *
 * @param filterString      the filter string
 * @param originalFieldsMap a map of field names to types
 * @param schema            the parquet schema
 * @return the parquet record filter for the given filter string
 */
private FilterCompat.Filter getRecordFilter(String filterString, Map<String, Type> originalFieldsMap, MessageType schema) {
    if (StringUtils.isBlank(filterString)) {
        return FilterCompat.NOOP;
    }

    ParquetRecordFilterBuilder filterBuilder = new ParquetRecordFilterBuilder(
            context.getTupleDescription(), originalFieldsMap);
    TreeVisitor pruner = new ParquetOperatorPrunerAndTransformer(
            context.getTupleDescription(), originalFieldsMap, SUPPORTED_OPERATORS);

    try {
        // Parse the filter string into an expression tree Node
        Node root = new FilterParser().parse(filterString);
        // Prune the parsed tree with valid supported operators and then
        // traverse the pruned tree with the ParquetRecordFilterBuilder to
        // produce a record filter for parquet
        TRAVERSER.traverse(root, pruner, filterBuilder);
        return filterBuilder.getRecordFilter();
    } catch (Exception e) {
        LOG.error(String.format("%s-%d: %s--%s Unable to generate Parquet Record Filter for filter",
                context.getTransactionId(),
                context.getSegmentId(),
                context.getDataSource(),
                context.getFilterString()), e);
        return FilterCompat.NOOP;
    }
}
 
Example #20
Source File: ParquetTypeVisitor.java    From iceberg with Apache License 2.0
private static <T> T visitField(Type field, ParquetTypeVisitor<T> visitor) {
  visitor.fieldNames.push(field.getName());
  try {
    return visit(field, visitor);
  } finally {
    visitor.fieldNames.pop();
  }
}
 
Example #21
Source File: JsonRecordFormatter.java    From parquet-mr with Apache License 2.0
@Override
protected Object formatResults(List<SimpleRecord> values) {
  if (super.typeInfo.getRepetition() == Type.Repetition.REPEATED) {
    List<Object> results = new ArrayList<Object>();
    for (SimpleRecord object : values) {
      results.add(add(object));
    }

    return results;
  } else {
    return add(values.get(SINGLE_VALUE));
  }
}
 
Example #22
Source File: DataWritableWriter.java    From parquet-mr with Apache License 2.0
private void writeArray(final ArrayWritable array, final GroupType type) {
  if (array == null) {
    return;
  }
  final Writable[] subValues = array.get();
  final int fieldCount = type.getFieldCount();
  for (int field = 0; field < fieldCount; ++field) {
    final Type subType = type.getType(field);
    recordConsumer.startField(subType.getName(), field);
    for (int i = 0; i < subValues.length; ++i) {
      final Writable subValue = subValues[i];
      if (subValue != null) {
        if (subType.isPrimitive()) {
          if (subValue instanceof ArrayWritable) {
            writePrimitive(((ArrayWritable) subValue).get()[field]);// 0 ?
          } else {
            writePrimitive(subValue);
          }
        } else {
          if (!(subValue instanceof ArrayWritable)) {
            throw new RuntimeException("This should be a ArrayWritable: " + subValue);
          } else {
            recordConsumer.startGroup();
            writeData((ArrayWritable) subValue, subType.asGroupType());
            recordConsumer.endGroup();
          }
        }
      }
    }
    recordConsumer.endField(subType.getName(), field);
  }
}
 
Example #23
Source File: PigSchemaConverter.java    From parquet-mr with Apache License 2.0
private Type[] convertTypes(Schema pigSchema) {
  List<FieldSchema> fields = pigSchema.getFields();
  Type[] types = new Type[fields.size()];
  for (int i = 0; i < types.length; i++) {
    types[i] = convert(fields.get(i), i);
  }
  return types;
}
 
Example #24
Source File: HiveClientTest.java    From garmadon with Apache License 2.0
@Test(expected = Exception.class)
public void shouldThrowExceptionForUnknownParquetType() throws Exception {
    HiveClient hiveClient = new HiveClient(driverName, "jdbc:hive2://localhost:" + port, "garmadon",
        hdfsTemp + "/garmadon_database");

    PrimitiveType unsupported = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.INT96, "unsupported");
    hiveClient.inferHiveType(unsupported);
}
 
Example #25
Source File: TestParquetVectorizedReads.java    From iceberg with Apache License 2.0
@Test
@Override
public void testNestedStruct() {
  AssertHelpers.assertThrows(
      "Vectorized reads are not supported yet for struct fields",
      UnsupportedOperationException.class,
      "Vectorized reads are not supported yet for struct fields",
      () -> VectorizedSparkParquetReaders.buildReader(
          TypeUtil.assignIncreasingFreshIds(new Schema(required(
              1,
              "struct",
              SUPPORTED_PRIMITIVES))),
          new MessageType("struct", new GroupType(Type.Repetition.OPTIONAL, "struct").withId(1)),
          false));
}
 
Example #26
Source File: RowConverter.java    From flink with Apache License 2.0
ArrayConverter(Type elementType, Class elementClass, TypeInformation elementTypeInfo,
               ParentDataHolder parentDataHolder, int pos) {
	this.elementClass = elementClass;
	this.parentDataHolder = parentDataHolder;
	this.pos = pos;

	if (elementClass.equals(Row.class)) {
		this.elementConverter = createConverter(elementType, 0, elementTypeInfo, this);
	} else {
		this.elementConverter = new RowConverter.RowPrimitiveConverter(elementType, this, 0);
	}
}
 
Example #27
Source File: ThriftRecordConverter.java    From parquet-mr with Apache License 2.0
private StructConverter(List<TProtocol> events, GroupType parquetSchema, ThriftField field) {
  this.events = events;
  this.name = field.getName();
  this.tStruct = new TStruct(name);
  this.thriftType = (StructType)field.getType();
  this.schemaSize = parquetSchema.getFieldCount();
  this.converters = new Converter[this.schemaSize];
  List<ThriftField> thriftChildren = thriftType.getChildren();
  for (int i = 0; i < schemaSize; i++) {
    Type schemaType = parquetSchema.getType(i);
    String fieldName = schemaType.getName();
    ThriftField matchingThrift = null;
    for (ThriftField childField: thriftChildren) {
      String thriftChildName = childField.getName();
      if (thriftChildName != null && thriftChildName.equalsIgnoreCase(fieldName)) {
        matchingThrift = childField;
        break;
      }
    }
    if (matchingThrift == null) {
      // this means the file did not contain that field:
      // it will never be populated in this instance,
      // but other files might populate it
      continue;
    }
    if (schemaType.isPrimitive()) {
      converters[i] = new PrimitiveFieldHandler(newConverter(events, schemaType, matchingThrift).asPrimitiveConverter(), matchingThrift, events);
    } else {
      converters[i] = new GroupFieldhandler(newConverter(events, schemaType, matchingThrift).asGroupConverter(), matchingThrift, events);
    }
  }
}
 
Example #28
Source File: ParquetFileAccessor.java    From pxf with Apache License 2.0
/**
 * Opens the resource for read.
 *
 * @throws IOException if opening the resource failed
 */
@Override
public boolean openForRead() throws IOException {
    file = new Path(context.getDataSource());
    FileSplit fileSplit = HdfsUtilities.parseFileSplit(context);

    // Read the original schema from the parquet file
    MessageType originalSchema = getSchema(file, fileSplit);
    // Get a map of the column name to Types for the given schema
    Map<String, Type> originalFieldsMap = getOriginalFieldsMap(originalSchema);
    // Get the read schema. This is either the full set or a subset (in
    // case of column projection) of the greenplum schema.
    MessageType readSchema = buildReadSchema(originalFieldsMap, originalSchema);
    // Get the record filter in case of predicate push-down
    FilterCompat.Filter recordFilter = getRecordFilter(context.getFilterString(), originalFieldsMap, readSchema);

    // add column projection
    configuration.set(PARQUET_READ_SCHEMA, readSchema.toString());

    fileReader = ParquetReader.builder(new GroupReadSupport(), file)
            .withConf(configuration)
            // Create reader for a given split, read a range in file
            .withFileRange(fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength())
            .withFilter(recordFilter)
            .build();
    context.setMetadata(readSchema);
    return true;
}
 
Example #29
Source File: GenericParquetReaders.java    From iceberg with Apache License 2.0
@Override
public ParquetValueReader<?> list(Types.ListType expectedList, GroupType array,
                                  ParquetValueReader<?> elementReader) {
  GroupType repeated = array.getFields().get(0).asGroupType();
  String[] repeatedPath = currentPath();

  int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1;
  int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1;

  Type elementType = repeated.getType(0);
  int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1;

  return new ListReader<>(repeatedD, repeatedR, option(elementType, elementD, elementReader));
}
 
Example #30
Source File: ParquetSchemaConverter.java    From flink with Apache License 2.0
private static TypeInformation<?> convertParquetPrimitiveListToFlinkArray(Type type) {
	// For backward compatibility, lists whose element group is missing are also allowed
	TypeInformation<?> flinkType = convertParquetTypeToTypeInfo(type);
	if (flinkType.isBasicType()) {
		return BasicArrayTypeInfo.getInfoFor(Array.newInstance(flinkType.getTypeClass(), 0).getClass());
	} else {
		// flinkType here can be either SqlTimeTypeInfo or BasicTypeInfo.BIG_DEC_TYPE_INFO,
		// So it should be converted to ObjectArrayTypeInfo
		return ObjectArrayTypeInfo.getInfoFor(flinkType);
	}
}