org.apache.parquet.io.api.Converter Java Examples

The following examples show how to use org.apache.parquet.io.api.Converter. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SimpleGroupConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a group converter and allocates one child converter per field of {@code schema}.
 *
 * @param parent the enclosing group converter, stored as-is
 * @param index  the index stored for this converter
 * @param schema the group type whose fields determine the child converters
 */
SimpleGroupConverter(SimpleGroupConverter parent, int index, GroupType schema) {
  this.parent = parent;
  this.index = index;

  final int fieldCount = schema.getFieldCount();
  converters = new Converter[fieldCount];

  for (int pos = 0; pos < fieldCount; pos++) {
    final Type fieldType = schema.getType(pos);
    // primitive fields get a primitive converter; every other field gets a nested group converter
    converters[pos] = fieldType.isPrimitive()
        ? new SimplePrimitiveConverter(this, pos)
        : new SimpleGroupConverter(this, pos, fieldType.asGroupType());
  }
}
 
Example #2
Source File: HiveGroupConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Selects a converter implementation for {@code type}.
 *
 * @param type   the type to convert; may be {@code null}
 * @param index  the index forwarded to the created converter
 * @param parent the parent converter forwarded to the created converter
 * @return {@code null} when {@code type} is {@code null}; otherwise a converter obtained from
 *         {@code ETypeConverter} for primitives, an {@code ArrayWritableGroupConverter} for
 *         groups with REPEATED repetition, or a {@code DataWritableGroupConverter} for other groups
 */
protected static Converter getConverterFromDescription(final Type type, final int index,
    final HiveGroupConverter parent) {
  if (type == null) {
    return null;
  }

  if (type.isPrimitive()) {
    return ETypeConverter.getNewConverter(
        type.asPrimitiveType().getPrimitiveTypeName().javaType, index, parent);
  }

  // group types: the repetition decides between the array and the plain group converter
  return type.asGroupType().getRepetition() == Repetition.REPEATED
      ? new ArrayWritableGroupConverter(type.asGroupType(), parent, index)
      : new DataWritableGroupConverter(type.asGroupType(), parent, index);
}
 
Example #3
Source File: DataWritableGroupConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a converter for the fields of {@code selectedGroupType}, each of which must also
 * be present in {@code containingGroupType}.
 *
 * @param selectedGroupType   the group whose fields get converters
 * @param parent              the parent converter, stored as-is
 * @param index               the index stored for this converter
 * @param containingGroupType the group that sizes the value array and supplies field indexes
 * @throws IllegalStateException if a selected field is not among the containing group's fields
 */
public DataWritableGroupConverter(final GroupType selectedGroupType,
    final HiveGroupConverter parent, final int index, final GroupType containingGroupType) {
  this.parent = parent;
  this.index = index;

  final int containingCount = containingGroupType.getFieldCount();
  final int requestedCount = selectedGroupType.getFieldCount();
  currentArr = new Object[containingCount];
  converters = new Converter[requestedCount];

  final List<Type> requested = selectedGroupType.getFields();
  for (int pos = 0; pos < requestedCount; pos++) {
    final Type requestedField = requested.get(pos);
    if (!containingGroupType.getFields().contains(requestedField)) {
      throw new IllegalStateException("Group type [" + containingGroupType +
          "] does not contain requested field: " + requestedField);
    }
    // the converter is created with the field's index inside the containing group
    final int containingIndex = containingGroupType.getFieldIndex(requestedField.getName());
    converters[pos] = getConverterFromDescription(requestedField, containingIndex, this);
  }
}
 
Example #4
Source File: ThriftRecordConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the converter matching the thrift type id of {@code field}.
 *
 * @param events the event list handed to the created converter
 * @param type   the parquet type; converted with {@code asGroupType()} for LIST, SET, MAP and STRUCT
 * @param field  the thrift field whose type id selects the converter
 * @return a new converter for the field
 */
private Converter newConverter(List<TProtocol> events, Type type, ThriftField field) {
  switch (field.getType().getType()) {
  // LIST, SET, MAP and STRUCT converters receive the parquet type as a group type
  case LIST:
    return new ListConverter(events, type.asGroupType(), field);
  case SET:
    return new SetConverter(events, type.asGroupType(), field);
  case MAP:
    return new MapConverter(events, type.asGroupType(), field);
  case STRUCT:
    return new StructConverter(events, type.asGroupType(), field);
  // STRING and ENUM have dedicated converters that do not take the parquet type
  case STRING:
    return new FieldStringConverter(events, field);
  case ENUM:
    return new FieldEnumConverter(events, field);
  // every other type id is handled by the generic primitive converter
  default:
    return new FieldPrimitiveConverter(events, field);
  }
}
 
Example #5
Source File: ProtoMessageConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the converter matching the protobuf Java type of {@code fieldDescriptor}.
 *
 * @param pvc             the value container handed to the created converter
 * @param parentBuilder   the builder used to create a sub-builder for MESSAGE fields
 * @param fieldDescriptor the protobuf field whose Java type selects the converter
 * @param parquetType     the parquet type; converted with {@code asGroupType()} for MESSAGE fields
 * @return a new converter for the field
 * @throws UnsupportedOperationException if the Java type matches none of the handled cases
 */
private Converter newScalarConverter(ParentValueContainer pvc, Message.Builder parentBuilder, Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) {
  final JavaType javaType = fieldDescriptor.getJavaType();

  switch (javaType) {
    case STRING:
      return new ProtoStringConverter(pvc);
    case FLOAT:
      return new ProtoFloatConverter(pvc);
    case DOUBLE:
      return new ProtoDoubleConverter(pvc);
    case BOOLEAN:
      return new ProtoBooleanConverter(pvc);
    case BYTE_STRING:
      return new ProtoBinaryConverter(pvc);
    case ENUM:
      return new ProtoEnumConverter(pvc, fieldDescriptor);
    case INT:
      return new ProtoIntConverter(pvc);
    case LONG:
      return new ProtoLongConverter(pvc);
    case MESSAGE:
      // nested messages are converted into a sub-builder created for this field
      return new ProtoMessageConverter(
          pvc, parentBuilder.newBuilderForField(fieldDescriptor), parquetType.asGroupType());
    default:
      throw new UnsupportedOperationException(String.format(
          "Cannot convert type: %s (Parquet type: %s) ", javaType, parquetType));
  }
}
 
Example #6
Source File: ParquetGroupConverter.java    From dremio-oss with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a group converter based on the arrow field found under {@code groupTypeName}.
 *
 * @param fieldName     the field name passed on to the created converter
 * @param groupTypeName the name used to look up the arrow field in {@code arrowSchema}
 * @param groupType     the parquet group type to convert
 * @param c             the schema paths passed on to the created converter
 * @return a {@code UnionGroupConverter} for arrow unions, otherwise the result of
 *         {@code defaultGroupConverter}
 */
Converter groupConverterFromArrowSchema(String fieldName, String groupTypeName, GroupType groupType, Collection<SchemaPath> c) {
  final String nameForChild = getNameForChild(fieldName);
  final Field arrowField = Schema.findField(arrowSchema, groupTypeName);
  final ArrowTypeID typeId = arrowField.getType().getTypeID();
  final List<Field> arrowChildren = arrowField.getChildren();

  // unions: the children are added directly to the parent
  if (typeId == ArrowTypeID.Union) {
    return new UnionGroupConverter(
        columnResolver, fieldName, mutator, getWriterProvider(), groupType, c, options,
        arrowChildren, nameForChild, schemaHelper);
  }

  // lists: verify the parquet schema agrees with the arrow schema, then let
  // defaultGroupConverter() handle the logical list
  if (typeId == ArrowTypeID.List) {
    Preconditions.checkState(
        groupType.getOriginalType() == OriginalType.LIST,
        "parquet schema doesn't match the arrow schema for LIST " + nameForChild);
  }

  return defaultGroupConverter(fieldName, mutator, groupType, c, arrowChildren);
}
 
Example #7
Source File: FilteringGroupConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Wraps the delegate's converter for {@code fieldIndex} in a filtering proxy.
 *
 * @param fieldIndex the index of the requested field
 * @return a {@code FilteringPrimitiveConverter} or {@code FilteringGroupConverter} around the
 *         delegate's converter
 * @throws NullPointerException if the delegate returns {@code null} for {@code fieldIndex}
 */
@Override
public Converter getConverter(int fieldIndex) {
  // fetch the underlying converter; a missing one is an error
  final Converter wrapped = Objects.requireNonNull(delegate.getConverter(fieldIndex), "delegate converter cannot be null");

  // path of the proxy = path of this converter followed by the requested field index
  final List<Integer> childPath = new ArrayList<>(indexFieldPath.size() + 1);
  childPath.addAll(indexFieldPath);
  childPath.add(fieldIndex);

  if (!wrapped.isPrimitive()) {
    return new FilteringGroupConverter(wrapped.asGroupConverter(), childPath, valueInspectorsByColumn, columnIOsByIndexFieldPath);
  }

  final ColumnPath columnPath = ColumnPath.get(getColumnIO(childPath).getColumnDescriptor().getPath());
  final ValueInspector[] inspectors = getValueInspectors(columnPath);
  return new FilteringPrimitiveConverter(wrapped.asPrimitiveConverter(), inspectors);
}
 
Example #8
Source File: AvroIndexedRecordConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a union converter with one member converter per non-NULL avro member type.
 *
 * @param parent        the value container, stored as-is
 * @param parquetSchema the parquet type of the union; read as a group type
 * @param avroSchema    the avro union schema whose member types are iterated
 * @param model         the data model forwarded to {@code newConverter}
 */
public AvroUnionConverter(ParentValueContainer parent, Type parquetSchema,
                          Schema avroSchema, GenericData model) {
  this.parent = parent;
  final GroupType unionGroup = parquetSchema.asGroupType();
  this.memberConverters = new Converter[unionGroup.getFieldCount()];

  int parquetIndex = 0;
  for (Schema memberSchema : avroSchema.getTypes()) {
    if (memberSchema.getType().equals(Schema.Type.NULL)) {
      // for NULL members the parquet index is not advanced
      continue;
    }
    final Type memberType = unionGroup.getType(parquetIndex);
    memberConverters[parquetIndex] = newConverter(memberSchema, memberType, model, new ParentValueContainer() {
      @Override
      public void add(Object value) {
        // a second value for the same union is rejected
        Preconditions.checkArgument(memberValue==null, "Union is resolving to more than one type");
        memberValue = value;
      }
    });
    parquetIndex++;
  }
}
 
Example #9
Source File: AvroRecordConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a union converter with one member converter per non-NULL avro member type.
 *
 * @param parent        the value container passed to the superclass constructor
 * @param parquetSchema the parquet type of the union; read as a group type
 * @param avroSchema    the avro union schema whose member types are iterated
 * @param model         the data model forwarded to {@code newConverter}
 */
public AvroUnionConverter(ParentValueContainer parent, Type parquetSchema,
                          Schema avroSchema, GenericData model) {
  super(parent);
  final GroupType unionGroup = parquetSchema.asGroupType();
  this.memberConverters = new Converter[unionGroup.getFieldCount()];

  int parquetIndex = 0;
  for (Schema memberSchema : avroSchema.getTypes()) {
    if (memberSchema.getType().equals(Schema.Type.NULL)) {
      // for NULL members the parquet index is not advanced
      continue;
    }
    final Type memberType = unionGroup.getType(parquetIndex);
    memberConverters[parquetIndex] = newConverter(memberSchema, memberType, model, new ParentValueContainer() {
      @Override
      public void add(Object value) {
        // a second value for the same union is rejected
        Preconditions.checkArgument(
            AvroUnionConverter.this.memberValue == null,
            "Union is resolving to more than one type");
        memberValue = value;
      }
    });
    parquetIndex++;
  }
}
 
Example #10
Source File: ThriftRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the single child converter.
 *
 * @param fieldIndex must be 0
 * @return the child converter
 * @throws IllegalArgumentException if {@code fieldIndex} is not 0
 */
@Override
public Converter getConverter(int fieldIndex) {
  if (fieldIndex == 0) {
    return child;
  }
  throw new IllegalArgumentException("lists have only one field. can't reach " + fieldIndex);
}
 
Example #11
Source File: ArrayWritableGroupConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a converter for a group with one or two fields; a two-field group is flagged as a map.
 *
 * @param groupType the group type whose fields get converters
 * @param parent    the parent converter, stored as-is
 * @param index     the index stored for this converter
 * @throws IllegalStateException if the group has neither 1 nor 2 fields
 */
public ArrayWritableGroupConverter(final GroupType groupType, final HiveGroupConverter parent,
    final int index) {
  this.parent = parent;
  this.index = index;

  final int count = groupType.getFieldCount();
  if (count != 1 && count != 2) {
    throw new IllegalStateException("Field count must be either 1 or 2: " + count);
  }

  isMap = (count == 2);
  converters = new Converter[count];
  for (int pos = 0; pos < converters.length; pos++) {
    converters[pos] = getConverterFromDescription(groupType.getType(pos), pos, this);
  }
}
 
Example #12
Source File: ThriftRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the key converter for index 0 and the value converter for index 1.
 *
 * @param fieldIndex 0 for the key, 1 for the value
 * @return the matching converter
 * @throws IllegalArgumentException for any other index
 */
@Override
public Converter getConverter(int fieldIndex) {
  if (fieldIndex == 0) {
    return keyConverter;
  }
  if (fieldIndex == 1) {
    return valueConverter;
  }
  throw new IllegalArgumentException("only key (0) and value (1) are supported. got " + fieldIndex);
}
 
Example #13
Source File: ThriftRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the single child converter.
 *
 * @param fieldIndex must be 0
 * @return the child converter
 * @throws IllegalArgumentException if {@code fieldIndex} is not 0
 */
@Override
public Converter getConverter(int fieldIndex) {
  final boolean isOnlyField = (fieldIndex == 0);
  if (isOnlyField) {
    return child;
  }
  throw new IllegalArgumentException("lists have only one field. can't reach " + fieldIndex);
}
 
Example #14
Source File: RowConverter.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the key converter for index 0 and the value converter for every other index.
 *
 * @param fieldIndex the requested field index
 * @return the key converter when {@code fieldIndex} is 0, otherwise the value converter
 */
@Override
public Converter getConverter(int fieldIndex) {
	return fieldIndex == 0 ? keyConverter : valueConverter;
}
 
Example #15
Source File: ThriftRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a struct converter, pairing each parquet field with the thrift child of the same
 * name (compared case-insensitively).
 *
 * @param events        the event list, stored and handed to the field handlers
 * @param parquetSchema the parquet group whose fields are matched against thrift children
 * @param field         the thrift field; its type is cast to {@code StructType}
 */
private StructConverter(List<TProtocol> events, GroupType parquetSchema, ThriftField field) {
  this.events = events;
  this.name = field.getName();
  this.tStruct = new TStruct(name);
  this.thriftType = (StructType)field.getType();
  this.schemaSize = parquetSchema.getFieldCount();
  this.converters = new Converter[this.schemaSize];

  final List<ThriftField> candidates = thriftType.getChildren();
  for (int pos = 0; pos < schemaSize; pos++) {
    final Type parquetField = parquetSchema.getType(pos);
    final String wantedName = parquetField.getName();

    // first thrift child whose name equals the parquet field name, ignoring case
    ThriftField match = null;
    for (ThriftField candidate : candidates) {
      final String candidateName = candidate.getName();
      if (candidateName == null || !candidateName.equalsIgnoreCase(wantedName)) {
        continue;
      }
      match = candidate;
      break;
    }

    if (match != null) {
      if (!parquetField.isPrimitive()) {
        converters[pos] = new GroupFieldhandler(newConverter(events, parquetField, match).asGroupConverter(), match, events);
      } else {
        converters[pos] = new PrimitiveFieldHandler(newConverter(events, parquetField, match).asPrimitiveConverter(), match, events);
      }
    }
    // no match: this means the file did not contain that field;
    // it will never be populated in this instance, other files might populate it
  }
}
 
Example #16
Source File: SimpleRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a record converter with one child converter per field of {@code schema}.
 *
 * @param schema the group type whose fields get converters
 * @param name   the name stored for this converter
 * @param parent the parent converter, stored as-is
 */
public SimpleRecordConverter(GroupType schema, String name, SimpleRecordConverter parent) {
  this.converters = new Converter[schema.getFieldCount()];
  this.parent = parent;
  this.name = name;

  int slot = 0;
  for (Type field : schema.getFields()) {
    // converters are stored in field iteration order
    converters[slot] = createConverter(field);
    slot++;
  }
}
 
Example #17
Source File: TupleConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a tuple converter with one converter per field of {@code parquetSchema}.
 *
 * @param parquetSchema the group type whose fields get converters
 */
public TupleConverter(GroupType parquetSchema) {
  this.converters = new Converter[parquetSchema.getFieldCount()];

  for (int pos = 0; pos < converters.length; pos++) {
    converters[pos] = newConverter(parquetSchema.getType(pos), pos);
  }
}
 
Example #18
Source File: TajoRecordConverter.java    From tajo with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a new TajoRecordConverter.
 *
 * <p>One converter is built per projected column that is not of NULL_TYPE; each converter
 * writes its value back at the column's position within {@code projectionMap}.
 *
 * @param parquetSchema The Parquet schema of the projection.
 * @param tajoReadSchema The Tajo schema of the table.
 * @param projectionMap An array mapping the projection column to the column
 *                      index in the table.
 */
public TajoRecordConverter(GroupType parquetSchema, Schema tajoReadSchema,
                           int[] projectionMap) {
  this.parquetSchema = parquetSchema;
  this.tajoReadSchema = tajoReadSchema;
  this.projectionMap = projectionMap;
  this.tupleSize = tajoReadSchema.size();

  // projectionMap.length differs from parquetSchema.getFieldCount() when the
  // projection contains NULL_TYPE columns. Those columns get no converter here;
  // they are populated with NullDatums in start().
  this.converters = new Converter[parquetSchema.getFieldCount()];
  int parquetIndex = 0;
  for (int projected = 0; projected < projectionMap.length; ++projected) {
    final Column column = tajoReadSchema.getColumn(projectionMap[projected]);
    if (column.getDataType().getType() != TajoDataTypes.Type.NULL_TYPE) {
      final Type type = parquetSchema.getType(parquetIndex);
      final int writeIndex = projected;
      converters[parquetIndex] = newConverter(column, type, new ParentValueContainer() {
        @Override
        void add(Object value) {
          TajoRecordConverter.this.set(writeIndex, value);
        }
      });
      ++parquetIndex;
    }
  }
}
 
Example #19
Source File: TupleConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the converter for a primitive field.
 *
 * @param type the parquet type of the field; must be primitive
 * @param i    the index handed to the created converter
 * @return a new {@code TuplePrimitiveConverter}
 * @throws IllegalArgumentException if {@code type} is not primitive
 */
private Converter newConverter(Type type, int i) {
  if (type.isPrimitive()) {
    return new TuplePrimitiveConverter(this, i);
  }
  throw new IllegalArgumentException("cascading can only build tuples from primitive types");
}
 
Example #20
Source File: RowConverter.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a row converter; child converters are only built when {@code typeInfo} is a
 * {@code CompositeType} with arity of at least 1.
 *
 * @param schema   the group type whose fields get converters
 * @param typeInfo the type information, stored and used to look up per-field types
 * @param parent   the parent data holder, stored as-is
 * @param pos      the position stored for this converter within the parent row
 */
public RowConverter(GroupType schema, TypeInformation<?> typeInfo, ParentDataHolder parent, int pos) {
	this.typeInfo = typeInfo;
	this.parentDataHolder = parent;
	this.posInParentRow = pos;
	this.converters = new Converter[schema.getFieldCount()];

	if (typeInfo.getArity() < 1 || !(typeInfo instanceof CompositeType)) {
		// the converter array stays unpopulated in this case
		return;
	}

	final CompositeType<?> composite = (CompositeType<?>) typeInfo;
	int fieldPos = 0;
	for (Type field : schema.getFields()) {
		converters[fieldPos] = createConverter(field, fieldPos, composite.getTypeAt(fieldPos), this);
		fieldPos++;
	}
}
 
Example #21
Source File: ProtoMessageConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a message converter with one converter per parquet field, each resolved against
 * the protobuf descriptor of {@code builder} by field name.
 *
 * @param pvc           the parent value container; must not be {@code null}
 * @param builder       the message builder, stored and used to obtain the descriptor
 * @param parquetSchema the parquet group whose fields get converters
 * @throws IllegalStateException if {@code pvc} is {@code null}
 * @throws IncompatibleSchemaModificationException if a parquet field has no protobuf field
 *         of the same name
 */
ProtoMessageConverter(ParentValueContainer pvc, Message.Builder builder, GroupType parquetSchema) {
  converters = new Converter[parquetSchema.getFieldCount()];
  this.parent = pvc;

  if (pvc == null) {
    throw new IllegalStateException("Missing parent value container");
  }

  myBuilder = builder;
  final Descriptors.Descriptor protoDescriptor = builder.getDescriptorForType();

  int slot = 0;
  for (Type parquetField : parquetSchema.getFields()) {
    final Descriptors.FieldDescriptor protoField = protoDescriptor.findFieldByName(parquetField.getName());

    if (protoField == null) {
      final String description = "Scheme mismatch \n\"" + parquetField + "\""
          + "\n proto descriptor:\n" + protoDescriptor.toProto();
      throw new IncompatibleSchemaModificationException("Cant find \"" + parquetField.getName() + "\" " + description);
    }

    // converters are stored in parquet field order
    converters[slot++] = newMessageConverter(myBuilder, protoField, parquetField);
  }
}
 
Example #22
Source File: ProtoMessageConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the converter for one protobuf field.
 *
 * <p>Converted values reach {@code parentBuilder} through {@code addRepeatedField} for
 * repeated fields and {@code setField} otherwise. A list or map logical type annotation
 * selects a {@code ListConverter} or {@code MapConverter}; in every other case the result
 * comes from {@code newScalarConverter}.
 *
 * @param parentBuilder   the builder that receives converted values
 * @param fieldDescriptor the protobuf field being converted
 * @param parquetType     the parquet type of the field
 * @return a new converter for the field
 */
private Converter newMessageConverter(final Message.Builder parentBuilder, final Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) {
  final ParentValueContainer sink;
  if (!fieldDescriptor.isRepeated()) {
    sink = new ParentValueContainer() {
      @Override
      public void add(Object value) {
        parentBuilder.setField(fieldDescriptor, value);
      }
    };
  } else {
    sink = new ParentValueContainer() {
      @Override
      public void add(Object value) {
        parentBuilder.addRepeatedField(fieldDescriptor, value);
      }
    };
  }

  final LogicalTypeAnnotation annotation = parquetType.getLogicalTypeAnnotation();
  if (annotation != null) {
    return annotation.accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<Converter>() {
      @Override
      public Optional<Converter> visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) {
        return of(new ListConverter(parentBuilder, fieldDescriptor, parquetType));
      }

      @Override
      public Optional<Converter> visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) {
        return of(new MapConverter(parentBuilder, fieldDescriptor, parquetType));
      }
    }).orElseGet(() -> newScalarConverter(sink, parentBuilder, fieldDescriptor, parquetType));
  }

  // no logical type annotation at all
  return newScalarConverter(sink, parentBuilder, fieldDescriptor, parquetType);
}
 
Example #23
Source File: ProtoMessageConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the wrapped converter.
 *
 * <p>NOTE(review): only indexes greater than 0 are rejected, so negative indexes also
 * return the converter; confirm whether that is intended.
 *
 * @param fieldIndex the requested field index
 * @return the wrapped converter
 * @throws ParquetDecodingException if {@code fieldIndex} is greater than 0
 */
@Override
public Converter getConverter(int fieldIndex) {
  if (fieldIndex <= 0) {
    return converter;
  }
  throw new ParquetDecodingException("Unexpected multiple fields in the MAP wrapper");
}
 
Example #24
Source File: TupleConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a tuple converter from a parquet schema and a pig schema.
 *
 * <p>Pig fields are visited by position. A field gets a converter when the parquet schema
 * contains its alias or {@code columnIndexAccess} is set, and {@code getType} returns a
 * non-null type for it. Converters fill the array from slot 0 upward, while each converter
 * writes its value at the pig field's own position.
 *
 * @param parquetSchema          the parquet group type, stored as-is
 * @param pigSchema              the pig schema whose fields are visited
 * @param elephantBirdCompatible stored and forwarded to {@code newConverter}
 * @param columnIndexAccess      forwarded to {@code getType} and {@code newConverter}
 * @throws ParquetDecodingException if a {@code FrontendException} is raised during setup
 */
public TupleConverter(GroupType parquetSchema, Schema pigSchema, boolean elephantBirdCompatible, boolean columnIndexAccess) {
  this.parquetSchema = parquetSchema;
  this.elephantBirdCompatible = elephantBirdCompatible;
  try {
    this.schemaSize = max(parquetSchema.getFieldCount(), pigSchema.getFields().size());
    this.converters = new Converter[this.schemaSize];

    int nextSlot = 0;
    for (int pos = 0; pos < schemaSize; pos++) {
      final FieldSchema pigField = pigSchema.getField(pos);
      if (!parquetSchema.containsField(pigField.alias) && !columnIndexAccess) {
        continue;
      }

      final Type type = getType(columnIndexAccess, pigField.alias, pos);
      if (type == null) {
        continue;
      }

      // the value is written at the pig field position, not at the converter slot
      final int index = pos;
      converters[nextSlot++] = newConverter(pigField, type, new ParentValueContainer() {
        @Override
        void add(Object value) {
          TupleConverter.this.set(index, value);
        }
      }, elephantBirdCompatible, columnIndexAccess);
    }
  } catch (FrontendException e) {
    throw new ParquetDecodingException("can not initialize pig converter from:\n" + parquetSchema + "\n" + pigSchema, e);
  }
}
 
Example #25
Source File: TupleConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the single child converter.
 *
 * @param fieldIndex must be 0
 * @return the child converter
 * @throws IllegalArgumentException if {@code fieldIndex} is not 0
 */
@Override
public Converter getConverter(int fieldIndex) {
  if (fieldIndex == 0) {
    return child;
  }
  throw new IllegalArgumentException("bags have only one field. can't reach " + fieldIndex);
}
 
Example #26
Source File: MapConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the key/value converter, the only field of this converter.
 *
 * @param fieldIndex must be 0
 * @return the key/value converter
 * @throws IllegalArgumentException if {@code fieldIndex} is not 0
 */
@Override
public Converter getConverter(int fieldIndex) {
  if (fieldIndex == 0) {
    return keyValue;
  }
  throw new IllegalArgumentException("maps have only one field. can't reach " + fieldIndex);
}
 
Example #27
Source File: MapConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the key converter for index 0 and the value converter for index 1.
 *
 * @param fieldIndex 0 for the key, 1 for the value
 * @return the matching converter
 * @throws IllegalArgumentException for any other index
 */
@Override
public Converter getConverter(int fieldIndex) {
  switch (fieldIndex) {
    case 0:
      return keyConverter;
    case 1:
      return valueConverter;
    default:
      throw new IllegalArgumentException("only the key (0) and value (1) fields expected: " + fieldIndex);
  }
}
 
Example #28
Source File: AvroIndexedRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the key converter for index 0 and the value converter for index 1.
 *
 * @param fieldIndex 0 for the key, 1 for the value
 * @return the matching converter
 * @throws IllegalArgumentException for any other index
 */
@Override
public Converter getConverter(int fieldIndex) {
  switch (fieldIndex) {
    case 0:
      return keyConverter;
    case 1:
      return valueConverter;
    default:
      throw new IllegalArgumentException("only the key (0) and value (1) fields expected: " + fieldIndex);
  }
}
 
Example #29
Source File: AvroRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the string converter matching the stringable class resolved for {@code schema}.
 *
 * @param schema the avro schema passed to {@code getStringableClass}
 * @param model  the data model passed to {@code getStringableClass}
 * @param parent the value container handed to the created converter
 * @return a {@code FieldStringConverter} for {@code String}, a {@code FieldUTF8Converter}
 *         for {@code CharSequence}, otherwise a {@code FieldStringableConverter} for the
 *         resolved class
 */
private static Converter newStringConverter(Schema schema, GenericData model,
                                            ParentValueContainer parent) {
  final Class<?> target = getStringableClass(schema, model);

  if (target == CharSequence.class) {
    return new AvroConverters.FieldUTF8Converter(parent);
  }
  if (target == String.class) {
    return new FieldStringConverter(parent);
  }
  return new FieldStringableConverter(parent, target);
}
 
Example #30
Source File: AvroRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the key converter for index 0 and the value converter for index 1.
 *
 * @param fieldIndex 0 for the key, 1 for the value
 * @return the matching converter
 * @throws IllegalArgumentException for any other index
 */
@Override
public Converter getConverter(int fieldIndex) {
  switch (fieldIndex) {
    case 0:
      return keyConverter;
    case 1:
      return valueConverter;
    default:
      throw new IllegalArgumentException("only the key (0) and value (1) fields expected: " + fieldIndex);
  }
}