Java Code Examples for org.apache.avro.Schema.Field#name()

The following examples show how to use org.apache.avro.Schema.Field#name(). The examples are drawn from open-source projects; you can go to the original project or source file by following the link above each example.
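Before diving into the project examples, here is a minimal, self-contained sketch of the call itself. The record name and field names below are hypothetical, chosen only to illustrate that Field#name() returns the name a field was declared with:

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

public class FieldNameDemo {

    public static void main(String[] args) {
        // Build a small record schema (hypothetical names) using SchemaBuilder.
        Schema schema = SchemaBuilder.record("User").fields()
                .requiredString("firstName")
                .requiredInt("age")
                .endRecord();

        // Field#name() returns the declared name of each field.
        for (Schema.Field f : schema.getFields()) {
            System.out.println(f.name()); // prints "firstName", then "age"
        }
    }
}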
Example 1
Source File: MarketoColumnMappingsTable.java    From components with Apache License 2.0
public List<String> getMarketoColumns(Schema schema) {
    List<String> result = new ArrayList<>();
    Map<String, String> mappings = getInputedNameMappingsForMarketo();
    String marketoCol = null;
    String schemaCol = null;
    for (Field f : schema.getFields()) {
        marketoCol = mappings.get(f.name());
        if (StringUtils.isEmpty(marketoCol)) {
            schemaCol = f.getProp(SchemaConstants.TALEND_COLUMN_DB_COLUMN_NAME);
            if (!StringUtils.isEmpty(schemaCol)) {
                marketoCol = schemaCol;
            } else {
                marketoCol = f.name();
            }
        }
        result.add(marketoCol);
    }
    return result;
}
 
Example 2
Source File: AvroTypeUtil.java    From nifi with Apache License 2.0
/**
 * Converts an Avro Schema to a RecordSchema
 *
 * @param avroSchema the Avro Schema to convert
 * @param schemaText the textual representation of the schema
 * @param schemaId the identifier of the schema
 * @return the Corresponding Record Schema
 */
public static RecordSchema createSchema(final Schema avroSchema, final String schemaText, final SchemaIdentifier schemaId) {
    if (avroSchema == null) {
        throw new IllegalArgumentException("Avro Schema cannot be null");
    }

    final String schemaFullName = avroSchema.getNamespace() + "." + avroSchema.getName();
    final SimpleRecordSchema recordSchema = schemaText == null ? new SimpleRecordSchema(schemaId) : new SimpleRecordSchema(schemaText, AVRO_SCHEMA_FORMAT, schemaId);
    recordSchema.setSchemaName(avroSchema.getName());
    recordSchema.setSchemaNamespace(avroSchema.getNamespace());
    final DataType recordSchemaType = RecordFieldType.RECORD.getRecordDataType(recordSchema);
    final Map<String, DataType> knownRecords = new HashMap<>();
    knownRecords.put(schemaFullName, recordSchemaType);

    final List<RecordField> recordFields = new ArrayList<>(avroSchema.getFields().size());
    for (final Field field : avroSchema.getFields()) {
        final String fieldName = field.name();
        final Schema fieldSchema = field.schema();
        final DataType dataType = AvroTypeUtil.determineDataType(fieldSchema, knownRecords);
        final boolean nullable = isNullable(fieldSchema);
        addFieldToList(recordFields, field, fieldName, fieldSchema, dataType, nullable);
    }

    recordSchema.setFields(recordFields);
    return recordSchema;
}
 
Example 3
Source File: SnowflakeWriter.java    From components with Apache License 2.0
protected Object getFieldValue(Object inputValue, Field field) {
    Schema s = AvroUtils.unwrapIfNullable(field.schema());
    if (inputValue != null && inputValue instanceof String && ((String) inputValue).isEmpty()) {
        return emptyStringValue;
    } else if (null == inputValue || inputValue instanceof String) {
        return inputValue;
    } else if (AvroUtils.isSameType(s, AvroUtils._date())) {
        // TODO improve performance: there is no need to fetch the runtime field object from the map on every call.
        // If the customer sets the schema manually instead of using the retrieve-schema function, the database
        // column may be any Snowflake date type: DATE, TIME, TIMESTAMP WITH TIME ZONE, TIMESTAMP WITH LOCAL
        // TIME ZONE, or TIMESTAMP WITHOUT TIME ZONE. See the test SnowflakeDateTypeTestIT for the details of
        // Snowflake's JDBC date type support, which is controlled entirely by the client. We therefore have to
        // process the date value and format it according to the actual database column type.
        boolean isUpperCase = false;
        if (sprops != null) {
            // Keep the same logic as getStringSchemaInfo, since getStringSchemaInfo is used to init the
            // loader with the right db column name (are you sure?)
            isUpperCase = sprops.convertColumnsAndTableToUppercase.getValue();
        }
        String dbColumnName = field.getProp(SchemaConstants.TALEND_COLUMN_DB_COLUMN_NAME);
        if (dbColumnName == null) {
            dbColumnName = field.name();
        }
        dbColumnName = isUpperCase ? dbColumnName.toUpperCase() : dbColumnName;
        Field runtimeField = dbColumnName2RuntimeField.get(dbColumnName);

        if (runtimeField != null) {
            s = AvroUtils.unwrapIfNullable(runtimeField.schema());
        } else {
            // TODO: this is the legacy behavior; we keep it for when the type cannot be fetched via the
            // schema's db column name. Consider adjusting it.
            Date date = (Date) inputValue;
            return date.getTime();
        }
    }

    return formatIfAnySnowflakeDateType(inputValue, s);
}
 
Example 4
Source File: TMarketoInputProperties.java    From components with Apache License 2.0
private Field getMigratedField(Field origin, Schema expectedSchema, String expectedDIType) {
    Field expectedField = new Schema.Field(origin.name(), expectedSchema, origin.doc(), origin.defaultVal(), origin.order());
    for (Map.Entry<String, Object> entry : origin.getObjectProps().entrySet()) {
        if ("di.column.talendType".equals(entry.getKey())) {
            expectedField.addProp("di.column.talendType", expectedDIType);
        } else {
            expectedField.addProp(entry.getKey(), entry.getValue());
        }
    }
    return expectedField;
}
 
Example 5
Source File: MarketoUtils.java    From components with Apache License 2.0
public static Field generateNewField(Field origin) {
    Schema.Field field = new Schema.Field(origin.name(), origin.schema(), origin.doc(), origin.defaultVal(), origin.order());
    // Copy all custom properties with addProp; getObjectProps() may return an unmodifiable map,
    // so mutating it directly via putAll would fail.
    for (Map.Entry<String, Object> entry : origin.getObjectProps().entrySet()) {
        field.addProp(entry.getKey(), entry.getValue());
    }
    return field;
}
 
Example 6
Source File: MarketoSourceOrSink.java    From components with Apache License 2.0
public static List<Field> getSchemaFieldsList(Schema schema) {
    List<Field> result = new ArrayList<>();
    for (Field f : schema.getFields()) {
        Field nf = new Field(f.name(), f.schema(), f.doc(), f.defaultVal());
        // Copy all custom properties with addProp; getObjectProps() may return an unmodifiable map,
        // so mutating it directly via putAll would fail.
        for (Map.Entry<String, Object> entry : f.getObjectProps().entrySet()) {
            nf.addProp(entry.getKey(), entry.getValue());
        }
        result.add(nf);
    }
    return result;
}
 
Example 7
Source File: BulkResultIndexedRecordConverter.java    From components with Apache License 2.0
@SuppressWarnings("unchecked")
@Override
public Object get(int i) {
    // Lazy initialization of the cached converter objects.
    if (names == null) {
        names = new String[getSchema().getFields().size()];
        fieldConverter = new AvroConverter[names.length];
        for (int j = 0; j < names.length; j++) {
            Field f = getSchema().getFields().get(j);
            names[j] = f.name();
            fieldConverter[j] = SalesforceAvroRegistryString.get().getConverterFromString(f);
        }
    }
    return fieldConverter[i].convertToAvro(value.getValue(names[i]));
}
 
Example 8
Source File: SObjectAdapterFactory.java    From components with Apache License 2.0
private void init() {
    if (names == null) {
        List<Schema.Field> fields = getSchema().getFields();
        names = new String[fields.size()];
        fieldConverter = new AvroConverter[names.length];
        name2converter = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
        for (int j = 0; j < names.length; j++) {
            Field f = getSchema().getFields().get(j);
            names[j] = f.name();
            fieldConverter[j] = SalesforceAvroRegistry.get().getConverterFromString(f);
            name2converter.put(f.name(), fieldConverter[j]);
            name2converter.put(rootType + schema.getProp(SalesforceSchemaConstants.COLUMNNAME_DELIMTER) + f.name(), fieldConverter[j]);
        }
    }
}
 
Example 9
Source File: AvroRecordConverter.java    From localization_nifi with Apache License 2.0
/**
 * @return Any fields in the output schema that are not mapped or are mapped
 *         by a non-existent input field.
 */
public Collection<String> getUnmappedFields() {
    List<String> result = Lists.newArrayList();
    for (Field f : outputSchema.getFields()) {
        String fieldName = f.name();
        if (fieldMapping.containsKey(fieldName)) {
            fieldName = fieldMapping.get(fieldName);
        }

        Schema currentSchema = inputSchema;
        while (fieldName.contains(".")) {
            // Recurse down the schema to find the right field.
            int dotIndex = fieldName.indexOf('.');
            String entityName = fieldName.substring(0, dotIndex);
            // Get the schema. In case we had an optional record, choose
            // just the record.
            currentSchema = getNonNullSchema(currentSchema);
            if (currentSchema.getField(entityName) == null) {
                // Tried to step into a schema that doesn't exist. Break out
                // of the loop
                break;
            }
            currentSchema = currentSchema.getField(entityName).schema();
            fieldName = fieldName.substring(dotIndex + 1);
        }
        if (currentSchema == null
                || getNonNullSchema(currentSchema).getField(fieldName) == null) {
            result.add(f.name());
        }
    }
    return result;
}
 
Example 10
Source File: SnowflakeWriter.java    From components with Apache License 2.0
private static StringSchemaInfo getStringSchemaInfo(TSnowflakeOutputProperties outputProperties, Schema mainSchema,
        List<Field> columns) {
    boolean isUpperCase = false;
    boolean upsert = false;
    if (outputProperties != null) {
        isUpperCase = outputProperties.convertColumnsAndTableToUppercase.getValue();
        upsert = UPSERT.equals(outputProperties.outputAction.getValue());
    }

    List<String> keyStr = new ArrayList<>();
    List<String> columnsStr = new ArrayList<>();

    int i = 0;
    for (Field overlapField : columns) {
        Field f = overlapField == null ? mainSchema.getFields().get(i) : overlapField;
        i++;
        if (Boolean.valueOf(f.getProp(SnowflakeAvroRegistry.TALEND_FIELD_AUTOINCREMENTED))) {
            continue;
        }
        String dbColumnName = f.getProp(SchemaConstants.TALEND_COLUMN_DB_COLUMN_NAME);
        if (dbColumnName == null) {
            dbColumnName = f.name();
        }

        String fName = isUpperCase ? dbColumnName.toUpperCase() : dbColumnName;
        columnsStr.add(fName);
        if (null != f.getProp(SchemaConstants.TALEND_COLUMN_IS_KEY)) {
            keyStr.add(fName);
        }
    }

    if (upsert) {
        keyStr.clear();
        String upsertKeyColumn = outputProperties.upsertKeyColumn.getValue();
        keyStr.add(isUpperCase ? upsertKeyColumn.toUpperCase() : upsertKeyColumn);
    }

    return new StringSchemaInfo(keyStr, columnsStr);
}
 
Example 11
Source File: IntArraysTest.java    From incubator-pinot with Apache License 2.0
@BeforeClass
public static void before()
    throws Exception {
  final String filePath =
      TestUtils.getFileFromResourceUrl(DictionariesTest.class.getClassLoader().getResource(AVRO_DATA));
  if (INDEX_DIR.exists()) {
    FileUtils.deleteQuietly(INDEX_DIR);
  }

  final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);

  final SegmentGeneratorConfig config = SegmentTestUtils
      .getSegmentGenSpecWithSchemAndProjectedColumns(new File(filePath), INDEX_DIR, "weeksSinceEpochSunday",
          TimeUnit.DAYS, "test");
  // The segment generation code in SegmentColumnarIndexCreator will throw
  // exception if start and end time in time column are not in acceptable
  // range. For this test, we first need to fix the input avro data
  // to have the time column values in allowed range. Until then, the check
  // is explicitly disabled
  config.setSkipTimeValueCheck(true);
  driver.init(config);
  driver.build();

  final DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(new File(filePath));
  final org.apache.avro.Schema avroSchema = avroReader.getSchema();
  final String[] columns = new String[avroSchema.getFields().size()];
  int i = 0;
  for (final Field f : avroSchema.getFields()) {
    columns[i] = f.name();
    i++;
  }
}
 
Example 12
Source File: AvroUtils.java    From Cubert with Apache License 2.0
public static Schema getSchema(SeekableInput input) throws IOException
{
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
    DataFileReader<GenericRecord> dataFileReader =
            new DataFileReader<GenericRecord>(input, datumReader);
    Schema schema = dataFileReader.getSchema();

    if (PadDefaultNullsToSchema)
    {
        // a list of "cloned" fields, with optional default value set to null
        ArrayList<Field> paddedFields = new ArrayList<Field>();

        for (Field field: schema.getFields())
        {
            // should this field be padded?
            boolean needsNullPadding = (field.schema() != null) // the field has nested schema
                && (field.schema().getType().equals(Type.UNION)) // the nested schema is UNION
                && (field.schema().getTypes().get(0).getType().equals(Type.NULL)); // the first element of union is NULL type

            JsonNode defValue = needsNullPadding ? NullNode.getInstance() : field.defaultValue();

            Field f = new Field(field.name(), field.schema(), field.doc(), defValue);
            paddedFields.add(f);
        }

        schema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), schema.isError());
        schema.setFields(paddedFields);
    }

    return schema;
}
 
Example 13
Source File: FlattenNestedKeyConverter.java    From incubator-gobblin with Apache License 2.0
@Override
public Iterable<GenericRecord> convertRecord(Schema outputSchema, GenericRecord inputRecord, WorkUnitState workUnit)
    throws DataConversionException {
  // No fields need flatten
  if (fieldNameMap.size() == 0) {
    return new SingleRecordIterable<>(inputRecord);
  }

  GenericRecord outputRecord = new GenericData.Record(outputSchema);
  for (Field field : outputSchema.getFields()) {
    String fieldName = field.name();
    if (fieldNameMap.containsKey(fieldName)) {
      // Skip new field for now
      continue;
    }

    outputRecord.put(fieldName, inputRecord.get(fieldName));
  }

  // Deal with new fields
  for (Map.Entry<String, String> entry : fieldNameMap.entrySet()) {
    Optional<Object> optional = AvroUtils.getFieldValue(inputRecord, entry.getValue());
    if (!optional.isPresent()) {
      throw new DataConversionException("Unable to get field value with location: " + entry.getValue());
    }
    outputRecord.put(entry.getKey(), optional.get());
  }

  return new SingleRecordIterable<>(outputRecord);
}
 
Example 14
Source File: CSVUtils.java    From localization_nifi with Apache License 2.0
/**
 * Returns the default value of the given field, throwing if the Avro schema
 * supplies none.
 */
private static JsonNode retrieveDefaultFieldValue(Field field) {
    JsonNode jsonNode = field.defaultValue();
    if (null == jsonNode) {
        throw new IllegalArgumentException("The field '" + field.name() + "' is NULL and there is no default value supplied in the Avro Schema");
    }
    return jsonNode;
}
 
Example 15
Source File: AvroRecordConverter.java    From nifi with Apache License 2.0
/**
 * @return Any fields in the output schema that are not mapped or are mapped
 *         by a non-existent input field.
 */
public Collection<String> getUnmappedFields() {
    List<String> result = Lists.newArrayList();
    for (Field f : outputSchema.getFields()) {
        String fieldName = f.name();
        if (fieldMapping.containsKey(fieldName)) {
            fieldName = fieldMapping.get(fieldName);
        }

        Schema currentSchema = inputSchema;
        while (fieldName.contains(".")) {
            // Recurse down the schema to find the right field.
            int dotIndex = fieldName.indexOf('.');
            String entityName = fieldName.substring(0, dotIndex);
            // Get the schema. In case we had an optional record, choose
            // just the record.
            currentSchema = getNonNullSchema(currentSchema);
            if (currentSchema.getField(entityName) == null) {
                // Tried to step into a schema that doesn't exist. Break out
                // of the loop
                break;
            }
            currentSchema = currentSchema.getField(entityName).schema();
            fieldName = fieldName.substring(dotIndex + 1);
        }
        if (currentSchema == null
                || getNonNullSchema(currentSchema).getField(fieldName) == null) {
            result.add(f.name());
        }
    }
    return result;
}
 
Example 16
Source File: AvroUtils.java    From incubator-pinot with Apache License 2.0
/**
 * Return whether the Avro field is a single-value field.
 */
public static boolean isSingleValueField(Field field) {
  try {
    org.apache.avro.Schema fieldSchema = extractSupportedSchema(field.schema());
    return fieldSchema.getType() != org.apache.avro.Schema.Type.ARRAY;
  } catch (Exception e) {
    throw new RuntimeException("Caught exception while extracting non-null schema from field: " + field.name(), e);
  }
}
 
Example 17
Source File: AvroTypeUtil.java    From nifi with Apache License 2.0
public static DataType determineDataType(final Schema avroSchema, Map<String, DataType> knownRecordTypes) {

        if (knownRecordTypes == null) {
            throw new IllegalArgumentException("'knownRecordTypes' cannot be null.");
        }

        final Type avroType = avroSchema.getType();

        final LogicalType logicalType = avroSchema.getLogicalType();
        if (logicalType != null) {
            final String logicalTypeName = logicalType.getName();
            switch (logicalTypeName) {
                case LOGICAL_TYPE_DATE:
                    return RecordFieldType.DATE.getDataType();
                case LOGICAL_TYPE_TIME_MILLIS:
                case LOGICAL_TYPE_TIME_MICROS:
                    return RecordFieldType.TIME.getDataType();
                case LOGICAL_TYPE_TIMESTAMP_MILLIS:
                case LOGICAL_TYPE_TIMESTAMP_MICROS:
                    return RecordFieldType.TIMESTAMP.getDataType();
                case LOGICAL_TYPE_DECIMAL:
                    final LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType;
                    return RecordFieldType.DECIMAL.getDecimalDataType(decimal.getPrecision(), decimal.getScale());
            }
        }

        switch (avroType) {
            case ARRAY:
                return RecordFieldType.ARRAY.getArrayDataType(determineDataType(avroSchema.getElementType(), knownRecordTypes));
            case BYTES:
            case FIXED:
                return RecordFieldType.ARRAY.getArrayDataType(RecordFieldType.BYTE.getDataType());
            case BOOLEAN:
                return RecordFieldType.BOOLEAN.getDataType();
            case DOUBLE:
                return RecordFieldType.DOUBLE.getDataType();
            case ENUM:
            case STRING:
                return RecordFieldType.STRING.getDataType();
            case FLOAT:
                return RecordFieldType.FLOAT.getDataType();
            case INT:
                return RecordFieldType.INT.getDataType();
            case LONG:
                return RecordFieldType.LONG.getDataType();
            case RECORD: {
                String schemaFullName = avroSchema.getNamespace() + "." + avroSchema.getName();

                if (knownRecordTypes.containsKey(schemaFullName)) {
                    return knownRecordTypes.get(schemaFullName);
                } else {
                    SimpleRecordSchema recordSchema = new SimpleRecordSchema(SchemaIdentifier.EMPTY);
                    DataType recordSchemaType = RecordFieldType.RECORD.getRecordDataType(recordSchema);
                    knownRecordTypes.put(schemaFullName, recordSchemaType);

                    final List<Field> avroFields = avroSchema.getFields();
                    final List<RecordField> recordFields = new ArrayList<>(avroFields.size());

                    for (final Field field : avroFields) {
                        final String fieldName = field.name();
                        final Schema fieldSchema = field.schema();
                        final DataType fieldType = determineDataType(fieldSchema, knownRecordTypes);
                        final boolean nullable = isNullable(fieldSchema);
                        addFieldToList(recordFields, field, fieldName, fieldSchema, fieldType, nullable);
                    }

                    recordSchema.setFields(recordFields);
                    return recordSchemaType;
                }
            }
            case NULL:
                return RecordFieldType.STRING.getDataType();
            case MAP:
                final Schema valueSchema = avroSchema.getValueType();
                final DataType valueType = determineDataType(valueSchema, knownRecordTypes);
                return RecordFieldType.MAP.getMapDataType(valueType);
            case UNION: {
                final List<Schema> nonNullSubSchemas = getNonNullSubSchemas(avroSchema);

                if (nonNullSubSchemas.size() == 1) {
                    return determineDataType(nonNullSubSchemas.get(0), knownRecordTypes);
                }

                final List<DataType> possibleChildTypes = new ArrayList<>(nonNullSubSchemas.size());
                for (final Schema subSchema : nonNullSubSchemas) {
                    final DataType childDataType = determineDataType(subSchema, knownRecordTypes);
                    possibleChildTypes.add(childDataType);
                }

                return RecordFieldType.CHOICE.getChoiceDataType(possibleChildTypes);
            }
        }

        return null;
    }
 
Example 18
Source File: MarketoOutputWriter.java    From components with Apache License 2.0
public IndexedRecord fillRecord(SyncStatus status, Schema schema, IndexedRecord record) {
    Boolean isDynamic = Boolean.FALSE;
    Schema currentSchema = schema;
    if (AvroUtils.isIncludeAllFields(schema)) {
        isDynamic = true;
        if (dynamicSchema == null) {
            dynamicSchema = MarketoSourceOrSink.mergeDynamicSchemas(record.getSchema(), schema);
        }
        currentSchema = dynamicSchema;
    }
    IndexedRecord outRecord = new Record(currentSchema);
    for (Field f : currentSchema.getFields()) {
        switch (f.name()) {
        case FIELD_LEAD_ID:
        case FIELD_ID_SOAP:
        case FIELD_ID_REST:
        case FIELD_CAMPAIGN_ID:
            // when the request failed, get it from input record
            if (status.getId() == null) {
                try {
                    outRecord.put(currentSchema.getField(f.name()).pos(), record.get(inputSchema.getField(f.name()).pos()));
                } catch (NullPointerException e) {
                    LOG.error("Could not find field `{}` in schema : {}.", f.name(), e.getMessage());
                }
            } else {
                outRecord.put(f.pos(), status.getId());
            }
            break;
        case FIELD_SUCCESS:
            outRecord.put(f.pos(), Boolean.parseBoolean(status.getStatus()));
            break;
        case FIELD_STATUS:
            outRecord.put(f.pos(), status.getStatus());
            break;
        case FIELD_ERROR_MSG:
        case FIELD_REASON:
            outRecord.put(f.pos(), status.getAvailableReason());
            break;
        case FIELD_MARKETO_GUID:
            outRecord.put(f.pos(), status.getMarketoGUID());
            break;
        case FIELD_SEQ:
            outRecord.put(f.pos(), status.getSeq());
            break;
        default:
            if (isDynamic) {
                outRecord.put(currentSchema.getField(f.name()).pos(), record.get(f.pos()));
            } else {
                outRecord.put(currentSchema.getField(f.name()).pos(), record.get(inputSchema.getField(f.name()).pos()));
            }
        }
    }
    return outRecord;
}
 
Example 19
Source File: PigSchema2Avro.java    From Cubert with Apache License 2.0
/**
 * Validates that a Pig tuple is compatible with an Avro record. If the Avro schema
 * is not complete (i.e. it has uncovered fields), those fields are converted using
 * the methods in set 1.
 *
 * Note that users can drop Pig tuple wrappers; e.g. an Avro schema
 * "int" is compatible with a Pig schema "T:(int)".
 */
protected static Schema validateAndConvertRecord(Schema avroSchema, ResourceFieldSchema[] pigFields) throws IOException {

    /* Get rid of Pig tuple wrappers. */
    if (!avroSchema.getType().equals(Schema.Type.RECORD)) {
        if (pigFields.length != 1)
            throw new IOException("Expect only one field in Pig tuple schema. Avro schema is " + avroSchema.getType());

        return validateAndConvert(avroSchema, pigFields[0]);
    }

    /* validate and convert a pig tuple with avro record */
    boolean isPartialSchema = AvroStorageUtils.isUDPartialRecordSchema(avroSchema);
    AvroStorageLog.details("isPartialSchema=" + isPartialSchema);

    String typeName = isPartialSchema ? getRecordName() : avroSchema.getName();
    Schema outSchema = Schema.createRecord(typeName, avroSchema.getDoc(), avroSchema.getNamespace(), false);

    List<Schema.Field> inFields = avroSchema.getFields();
    if (!isPartialSchema && inFields.size() != pigFields.length) {
        throw new IOException("Expect " + inFields.size() + " fields in pig schema." + " But there are " + pigFields.length);
    }

    List<Schema.Field> outFields = new ArrayList<Schema.Field>();

    for (int i = 0; i < pigFields.length; i++) {
        /* get user defined avro field schema */
        Field inputField = isPartialSchema ? AvroStorageUtils.getUDField(avroSchema, i) : inFields.get(i);

        /* get schema */
        Schema fieldSchema = null;
        if (inputField == null) { 
            /* convert pig schema (nullable) */
            fieldSchema = convert(pigFields[i], true);
        } else if (inputField.schema() == null) { 
            /* convert pig schema (not-null) */
            fieldSchema = convert(pigFields[i], false);
        } else { 
            /* validate pigFields[i] with given avro schema */
            fieldSchema = validateAndConvert(inputField.schema(),
                                            pigFields[i]);
        }

        /* get field name of output */
        String outname = (isPartialSchema) ? pigFields[i].getName() : inputField.name();
        if (outname == null)
            outname = FIELD_NAME + "_" + i; // field name cannot be null

        /* get doc of output */
        String doc = (isPartialSchema) ? pigFields[i].getDescription() : inputField.doc();

        JsonNode defaultvalue = (inputField != null) ? inputField.defaultValue() : null;

        outFields.add(new Field(outname, fieldSchema, doc, defaultvalue));

    }

    outSchema.setFields(outFields);
    return outSchema;

}
 
Example 20
Source File: AvroStorageUtils.java    From spork with Apache License 2.0
/**
 * When merging multiple avro record schemas, we build a map (schemaToMergedSchemaMap)
 * to associate each input record with a remapping of its fields relative to the merged
 * schema. Take the following two schemas for example:
 *
 * // path1
 * { "type": "record",
 *   "name": "x",
 *   "fields": [ { "name": "xField", "type": "string" } ]
 * }
 *
 * // path2
 * { "type": "record",
 *   "name": "y",
 *   "fields": [ { "name": "yField", "type": "string" } ]
 * }
 *
 * The merged schema will be something like this:
 *
 * // merged
 * { "type": "record",
 *   "name": "merged",
 *   "fields": [ { "name": "xField", "type": "string" },
 *               { "name": "yField", "type": "string" } ]
 * }
 *
 * The schemaToMergedSchemaMap will look like this:
 *
 * // schemaToMergedSchemaMap
 * { path1 : { 0 : 0 },
 *   path2 : { 0 : 1 }
 * }
 *
 * The meaning of the map is:
 * - The field at index '0' of 'path1' is moved to index '0' in merged schema.
 * - The field at index '0' of 'path2' is moved to index '1' in merged schema.
 *
 * With this map, we can now remap the field position of the original schema to
 * that of the merged schema. This is necessary because in the backend, we don't
 * use the merged avro schema but embedded avro schemas of input files to load
 * them. Therefore, we must relocate each field from old positions in the original
 * schema to new positions in the merged schema.
 *
 * @param mergedSchema new schema generated from multiple input schemas
 * @param mergedFiles input avro files that are merged
 * @return schemaToMergedSchemaMap that maps old position of each field in the
 * original schema to new position in the new schema
 * @throws IOException
 */
public static Map<Path, Map<Integer, Integer>> getSchemaToMergedSchemaMap(
        Schema mergedSchema, Map<Path, Schema> mergedFiles) throws IOException {

    if (!mergedSchema.getType().equals(Schema.Type.RECORD)) {
        throw new IOException("Remapping of non-record schemas is not supported");
    }

    Map<Path, Map<Integer, Integer>> result =
            new HashMap<Path, Map<Integer, Integer>>(mergedFiles.size());

    // map from field position in old schema to field position in new schema
    for (Map.Entry<Path, Schema> entry : mergedFiles.entrySet()) {
        Path path = entry.getKey();
        Schema schema = entry.getValue();
        if (!schema.getType().equals(Schema.Type.RECORD)) {
            throw new IOException("Remapping of non-record schemas is not supported");
        }
        List<Field> fields = schema.getFields();
        Map<Integer, Integer> oldPos2NewPos = result.get(path);
        if (oldPos2NewPos == null) {
            oldPos2NewPos = new HashMap<Integer, Integer>(fields.size());
            result.put(path, oldPos2NewPos);
        }
        for (Field field : fields) {
            String fieldName = field.name();
            int oldPos = schema.getField(fieldName).pos();
            int newPos = mergedSchema.getField(fieldName).pos();
            oldPos2NewPos.put(oldPos, newPos);
        }
    }
    return result;
}
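To make the remapping concrete, here is a hypothetical sketch (not part of the original project) of how a caller might use the returned map to relocate values from an input file's own field positions to their positions in the merged schema:

import java.util.Map;
import org.apache.hadoop.fs.Path;

// Hypothetical helper: move each value from its old position in the input
// file's schema to its new position in the merged schema.
public static Object[] remapToMergedPositions(Object[] oldValues, Path inputPath, int mergedFieldCount,
        Map<Path, Map<Integer, Integer>> schemaToMergedSchemaMap) {
    Object[] merged = new Object[mergedFieldCount];
    Map<Integer, Integer> oldPos2NewPos = schemaToMergedSchemaMap.get(inputPath);
    for (Map.Entry<Integer, Integer> entry : oldPos2NewPos.entrySet()) {
        merged[entry.getValue()] = oldValues[entry.getKey()]; // new position <- old position
    }
    return merged; // fields the input file does not provide remain null
}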