Java Code Examples for org.apache.avro.Schema#setFields()

The following examples show how to use org.apache.avro.Schema#setFields(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
/**
 * Builds an Avro record schema named {@code name} with one field per Parquet type.
 *
 * <p>Optional Parquet types are wrapped with {@code optional(...)} and given
 * {@code NULL_VALUE} as default; all remaining (required) types are added as-is
 * with no default.
 *
 * @param name name of the record schema to create
 * @param parquetFields Parquet types to turn into record fields, in order
 * @return the record schema holding the converted fields
 * @throws UnsupportedOperationException if a type has REPEATED repetition
 */
private Schema convertFields(String name, List<Type> parquetFields) {
  List<Schema.Field> avroFields = new ArrayList<Schema.Field>();
  for (Type column : parquetFields) {
    // The conversion runs before the repetition check, as in the original ordering.
    Schema converted = convertField(column);
    if (column.isRepetition(REPEATED)) {
      throw new UnsupportedOperationException("REPEATED not supported outside LIST or MAP. Type: " + column);
    }

    Schema.Field avroField;
    if (column.isRepetition(Type.Repetition.OPTIONAL)) {
      avroField = new Schema.Field(
          column.getName(), optional(converted), null, NULL_VALUE);
    } else {
      // REQUIRED
      avroField = new Schema.Field(
          column.getName(), converted, null, (Object) null);
    }
    avroFields.add(avroField);
  }

  Schema record = Schema.createRecord(name, null, null, false);
  record.setFields(avroFields);
  return record;
}
 
Example 2
/**
 * Derives a record schema that keeps only the fields of {@code record} for which
 * {@code getKeySchema(field)} yields a schema.
 *
 * <p>The resulting record reuses the name, doc and namespace of {@code record}.
 * Each retained field keeps its name, doc and default value but uses the schema
 * returned by {@code getKeySchema}.
 *
 * @param record a schema of type {@code RECORD}
 * @return the derived record schema, or {@code Optional.absent()} when no field
 *         is retained
 * @throws IllegalArgumentException if {@code record} is not of type {@code RECORD}
 */
public static Optional<Schema> getKeySchemaFromRecord(Schema record) {
  Preconditions.checkArgument(record.getType() == Schema.Type.RECORD);

  List<Field> fields = Lists.newArrayList();
  for (Field field : record.getFields()) {
    Optional<Schema> newFieldSchema = getKeySchema(field);
    if (newFieldSchema.isPresent()) {
      fields.add(new Field(field.name(), newFieldSchema.get(), field.doc(), field.defaultValue()));
    }
  }

  if (fields.isEmpty()) {
    return Optional.absent();
  }

  // The third argument of createRecord is the namespace; the record's own name was
  // previously passed here, which produced a full name of the form "<name>.<name>".
  Schema newSchema =
      Schema.createRecord(record.getName(), record.getDoc(), record.getNamespace(), false);
  newSchema.setFields(fields);
  return Optional.of(newSchema);
}
 
Example 3
Source Project: digdag   File: RedshiftIT.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Serializes the given records into an in-memory Avro data file.
 *
 * <p>A record schema named {@code "testdata"} is built from {@code fields}; every
 * map in {@code records} becomes one generic record whose entries are copied
 * key by key.
 *
 * @param fields fields of the record schema
 * @param records one map of field name to value per record to write
 * @return the bytes of the written Avro data file
 * @throws IOException if writing the Avro data fails
 */
private byte[] avroTestData(List<Schema.Field> fields, List<Map<String, Object>> records)
        throws IOException
{
    Schema schema = Schema.createRecord("testdata", null, null, false);
    schema.setFields(fields);

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    GenericDatumWriter<GenericData.Record> datum = new GenericDatumWriter<>(schema);
    // try-with-resources closes the writer even when create/append throws.
    try (DataFileWriter<GenericData.Record> writer = new DataFileWriter<>(datum)) {
        writer.create(schema, out);
        for (Map<String, Object> record : records) {
            GenericData.Record r = new GenericData.Record(schema);
            for (Map.Entry<String, Object> item : record.entrySet()) {
                r.put(item.getKey(), item.getValue());
            }
            writer.append(r);
        }
    }

    // The writer is closed at this point, so the stream holds the complete file.
    return out.toByteArray();
}
 
Example 4
/**
 * Round-trips a record whose single field is an array of optional ints and checks
 * the expected Parquet message text under {@code NEW_BEHAVIOR}.
 */
@Test
public void testOptionalArrayElement() throws Exception {
  Schema record = Schema.createRecord("record1", null, null, false);
  Schema arrayOfOptionalInts = Schema.createArray(optional(Schema.create(INT)));
  record.setFields(Arrays.asList(
      new Schema.Field("myintarray", arrayOfOptionalInts, null, null)));

  String expectedParquet =
      "message record1 {\n" +
      "  required group myintarray (LIST) {\n" +
      "    repeated group list {\n" +
      "      optional int32 element;\n" +
      "    }\n" +
      "  }\n" +
      "}\n";

  testRoundTripConversion(NEW_BEHAVIOR, record, expectedParquet);
}
 
Example 5
Source Project: dbeam   File: InputAvroSchemaTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates a record schema with one field per entry of {@code fieldNames}.
 *
 * <p>Each field takes its doc from the same index of {@code fieldDocs}, and its
 * schema is the record schema being built.
 *
 * @param recordName name of the record
 * @param recordDoc doc string of the record
 * @param recordNamespace namespace of the record
 * @param fieldNames names of the fields to create
 * @param fieldDocs doc strings, read at the same indices as {@code fieldNames}
 * @return the record schema with its fields set
 */
private Schema createRecordSchema(
    final String recordName,
    final String recordDoc,
    final String recordNamespace,
    final String[] fieldNames,
    final String[] fieldDocs) {
  final Schema record = Schema.createRecord(recordName, recordDoc, recordNamespace, false);

  final List<Schema.Field> recordFields = new ArrayList<>();
  int position = 0;
  for (final String fieldName : fieldNames) {
    final String fieldDoc = fieldDocs[position];
    recordFields.add(new Schema.Field(fieldName, record, fieldDoc));
    position++;
  }

  record.setFields(recordFields);
  return record;
}
 
Example 6
/**
 * Returns a copy of {@code original} with an extra leading field named
 * {@code "new"}, typed as a union of null and string with a null default.
 *
 * <p>The existing fields follow in their original order; name, doc and
 * namespace of the record are carried over.
 *
 * @param original the record schema to evolve
 * @return the evolved record schema
 */
@Override
public Schema getEvolvedSchema(Schema original) {
  Schema nullableString = Schema.createUnion(ImmutableList.of(
      Schema.create(Schema.Type.NULL),
      Schema.create(Schema.Type.STRING)));

  List<Schema.Field> evolvedFields = Lists.newArrayList();
  evolvedFields.add(
      new Schema.Field("new", nullableString, "New field", NullNode.getInstance()));

  // Field instances are re-created rather than reused from the original schema.
  for (Schema.Field existing : original.getFields()) {
    Schema.Field copy = new Schema.Field(
        existing.name(), existing.schema(), existing.doc(), existing.defaultValue());
    evolvedFields.add(copy);
  }

  Schema result = Schema.createRecord(
      original.getName(), original.getDoc(), original.getNamespace(), false);
  result.setFields(evolvedFields);
  return result;
}
 
Example 7
Source Project: funcj   File: AvroSchemaCodecFormat.java    License: MIT License 6 votes vote down vote up
/**
 * Produces a record schema whose fields come from encoding each entry of
 * {@code fields} against {@code value}.
 *
 * <p>The record is named {@code out + "." + type.getSimpleName()}, and the same
 * string is handed to every {@code encodeField} call.
 *
 * @param core the codec core (not used in this method)
 * @param value the value passed to each field encoder
 * @param out prefix of the record name
 * @return the resulting record {@code Schema}
 */
@Override
public Object encode(
        CodecCoreEx<WithSchema, Object, Config> core,
        T value,
        Object out
) {
    final String recordPath = out + "." + type.getSimpleName();

    final List<Schema.Field> recordFields =
            fields.entrySet().stream()
                    .map(entry -> {
                        final String fieldName = entry.getKey();
                        final Schema encoded =
                                (Schema)entry.getValue().encodeField(value, recordPath);
                        return new Schema.Field(fieldName, encoded);
                    })
                    .collect(toList());

    final Schema record = Schema.createRecord(recordPath, null, null, false);
    record.setFields(recordFields);
    return record;
}
 
Example 8
/**
 * Converts a record containing an array of optional inner records to Parquet and
 * compares the result with the expected message text.
 */
@Test
public void testArrayOfOptionalRecordsOldBehavior() throws Exception {
  Schema inner = Schema.createRecord("InnerRecord", null, null, false);
  Schema nullableString = optional(Schema.create(Schema.Type.STRING));
  Schema.Field s1 = new Schema.Field("s1", nullableString, null, JsonProperties.NULL_VALUE);
  Schema.Field s2 = new Schema.Field("s2", nullableString, null, JsonProperties.NULL_VALUE);
  inner.setFields(Lists.newArrayList(s1, s2));

  Schema outer = Schema.createRecord("HasArray", null, null, false);
  outer.setFields(Lists.newArrayList(
      new Schema.Field("myarray", Schema.createArray(optional(inner)),
          null, null)));
  System.err.println("Avro schema: " + outer.toString(true));

  String expectedParquet =
      "message HasArray {\n" +
      "  required group myarray (LIST) {\n" +
      "    repeated group array {\n" +
      "      optional binary s1 (UTF8);\n" +
      "      optional binary s2 (UTF8);\n" +
      "    }\n" +
      "  }\n" +
      "}\n";

  // Cannot use round-trip assertion because InnerRecord optional is removed
  testAvroToParquetConversion(outer, expectedParquet);
}
 
Example 9
Source Project: kite   File: TestTableConversion.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Checks that converting {@code STRUCT_OF_STRUCTS_TYPE} yields a record named
 * "test" with an optional string field and an optional nested "inner" record.
 */
@Test
public void testConvertStructs() {
  Schema inner = Schema.createRecord("inner", null, null, false);
  Schema.Field a = new Schema.Field("a",
      optional(Schema.create(Schema.Type.INT)), null, NULL_DEFAULT);
  Schema.Field b = new Schema.Field("b",
      optional(Schema.create(Schema.Type.BYTES)), null, NULL_DEFAULT);
  inner.setFields(Lists.newArrayList(a, b));

  Schema expected = Schema.createRecord("test", null, null, false);
  Schema.Field str = new Schema.Field("str",
      optional(Schema.create(Schema.Type.STRING)), null, NULL_DEFAULT);
  Schema.Field innerField = new Schema.Field("inner", optional(inner), null, NULL_DEFAULT);
  expected.setFields(Lists.newArrayList(str, innerField));

  Schema actual = HiveSchemaConverter.convert(
      startPath, "test", STRUCT_OF_STRUCTS_TYPE, NO_REQUIRED_FIELDS);

  Assert.assertEquals("Should convert struct of structs", expected, actual);
}
 
Example 10
Source Project: flink   File: AvroKeyValueSinkWriter.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds the generic record schema describing a key/value pair.
 *
 * @param keySchema schema of the 'key' field
 * @param valueSchema schema of the 'value' field
 * @return A schema for a generic record with two fields: 'key' and
 *         'value'.
 */
public static Schema getSchema(Schema keySchema, Schema valueSchema) {
	Schema pairSchema = Schema.createRecord(KEY_VALUE_PAIR_RECORD_NAME,
			"A key/value pair", KEY_VALUE_PAIR_RECORD_NAMESPACE, false);

	Schema.Field keyField = new Schema.Field(KEY_FIELD,
			keySchema, "The key", null);
	Schema.Field valueField = new Schema.Field(VALUE_FIELD,
			valueSchema, "The value", null);

	pairSchema.setFields(Arrays.asList(keyField, valueField));
	return pairSchema;
}
 
Example 11
Source Project: spork   File: PigSchema2Avro.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Converts an array of Pig field schemas into an Avro record schema.
 *
 * <p>The record name comes from {@code getRecordName()}. Every Pig field is
 * converted with {@code convert(field, nullable)}; a field without a name is
 * named {@code FIELD_NAME + "_" + index}.
 *
 * @param pigFields the Pig fields to convert, in order
 * @param nullable passed through to {@code convert} for every field
 * @return the Avro record schema with one field per Pig field
 * @throws IOException declared for the conversion helpers
 */
protected static Schema convertRecord(ResourceFieldSchema[] pigFields, boolean nullable) throws IOException {

    AvroStorageLog.funcCall("convertRecord");

    // An Avro record needs a type name
    Schema outSchema = Schema.createRecord(getRecordName(), null, null, false);

    List<Schema.Field> outFields = new ArrayList<Schema.Field>();
    int position = 0;
    for (ResourceFieldSchema pigField : pigFields) {

        /* schema of the output field */
        Schema fieldSchema = convert(pigField, nullable);

        /* name of the output field; a field name cannot be null */
        String pigName = pigField.getName();
        String outname = (pigName != null) ? pigName : FIELD_NAME + "_" + position;

        /* doc of the output field */
        String desc = pigField.getDescription();

        outFields.add(new Field(outname, fieldSchema, desc, null));
        position++;
    }

    outSchema.setFields(outFields);
    return outSchema;

}
 
Example 12
Source Project: xml-avro   File: SchemaBuilder.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a record schema for a model group and returns it wrapped in an array
 * schema.
 *
 * <p>The record is stored in {@code schemas} under {@code name} before its
 * fields are collected via {@code createGroupFields}.
 *
 * @param name name of the record schema
 * @param groupTerm model group whose fields are collected
 * @return an array schema whose element type is the group record
 */
private Schema createGroupSchema(String name, XSModelGroup groupTerm) {
  Schema groupRecord = Schema.createRecord(name, null, null, false);
  schemas.put(name, groupRecord);

  // NOTE(review): field order follows HashMap iteration order - confirm that is intended.
  Map<String, Schema.Field> collected = new HashMap<>();
  createGroupFields(groupTerm, collected, false);
  groupRecord.setFields(new ArrayList<>(collected.values()));

  Schema groupArray = Schema.createArray(groupRecord);
  return groupArray;
}
 
Example 13
Source Project: ml-ease   File: RegressionTest.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a job configuration whose output schema is the input schema's fields
 * followed by a float field named {@code "pred"}.
 *
 * <p>The input fields are re-created with a {@code null} default. The map output
 * schema is a pair of float and the output schema.
 *
 * @param mapperClass mapper class to register on the job
 * @param reducerClass reducer class to register on the job
 * @return the configured {@code JobConf}
 * @throws IllegalStateException if no input schema can be obtained
 */
private JobConf createJobConf(Class<? extends AvroMapper> mapperClass,
                              Class<? extends AvroReducer> reducerClass) throws IOException, URISyntaxException
{
  JobConf jobConf = createJobConf();

  Schema inputSchema = Util.removeUnion(AvroUtils.getAvroInputSchema(jobConf));
  if (inputSchema == null)
  {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  _logger.info("Input Schema=" + inputSchema.toString());

  List<Schema.Field> inputFields = inputSchema.getFields();
  Schema.Field predField =
      new Schema.Field("pred", Schema.create(Type.FLOAT), "", null);

  // Copy every input field without a default, then append the prediction field.
  List<Schema.Field> outputFields = new LinkedList<Schema.Field>();
  for (Schema.Field inputField : inputFields)
  {
    Schema.Field copy =
        new Schema.Field(inputField.name(), inputField.schema(), inputField.doc(), null);
    outputFields.add(copy);
  }
  outputFields.add(predField);

  Schema outputSchema = Schema.createRecord("AdmmTestOutput",
                                            "Test output for AdmmTest",
                                            "com.linkedin.lab.regression.avro",
                                            false);
  outputSchema.setFields(outputFields);

  AvroJob.setOutputSchema(jobConf, outputSchema);
  Schema mapOutputSchema = Pair.getPairSchema(Schema.create(Type.FLOAT), outputSchema);
  AvroJob.setMapOutputSchema(jobConf, mapOutputSchema);
  AvroJob.setMapperClass(jobConf, mapperClass);
  AvroJob.setReducerClass(jobConf, reducerClass);
  return jobConf;
}
 
Example 14
/**
 * Generates an Avro record schema with one field per column reported by a
 * {@code ClassWriter} for the current table.
 *
 * <p>Each field carries the original column name and SQL type as the
 * {@code "columnName"} and {@code "sqlType"} properties. The record name is
 * {@code schemaNameOverride} when given, otherwise the short class name for the
 * table, otherwise the table name (or {@code TableClassName.QUERY_RESULT} when
 * the table name is null).
 *
 * @param schemaNameOverride record name to use, or {@code null} to derive one
 * @return the generated record schema, tagged with a {@code "tableName"} property
 * @throws IOException declared by this method's signature
 */
public Schema generate(String schemaNameOverride) throws IOException {
  ClassWriter classWriter = new ClassWriter(options, connManager,
      tableName, null);
  Map<String, Integer> typesByColumn = classWriter.getColumnTypes();
  String[] columns = classWriter.getColumnNames(typesByColumn);

  List<Field> avroFields = new ArrayList<Field>();
  for (String column : columns) {
    String avroFieldName = AvroUtil.toAvroIdentifier(ClassWriter.toJavaIdentifier(column));
    int sqlType = typesByColumn.get(column);
    Schema columnSchema = toAvroSchema(sqlType, column);

    Field avroField = new Field(avroFieldName, columnSchema, null,  NullNode.getInstance());
    avroField.addProp("columnName", column);
    avroField.addProp("sqlType", Integer.toString(sqlType));
    avroFields.add(avroField);
  }

  TableClassName tableClassName = new TableClassName(options);
  String shortClassName = tableClassName.getShortClassForTable(tableName);

  String avroTableName;
  if (tableName == null) {
    avroTableName = TableClassName.QUERY_RESULT;
  } else {
    avroTableName = tableName;
  }

  // Precedence: explicit override, then short class name, then table name.
  String avroName;
  if (schemaNameOverride != null) {
    avroName = schemaNameOverride;
  } else if (shortClassName != null) {
    avroName = shortClassName;
  } else {
    avroName = avroTableName;
  }
  String avroNamespace = tableClassName.getPackageForTable();

  String doc = "Sqoop import of " + avroTableName;
  Schema result = Schema.createRecord(avroName, doc, avroNamespace, false);
  result.setFields(avroFields);
  result.addProp("tableName", avroTableName);
  return result;
}
 
Example 15
Source Project: hudi   File: HoodieAvroUtils.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates the {@code "HoodieRecordKey"} record schema, which holds a single
 * field named by {@code HoodieRecord.RECORD_KEY_METADATA_FIELD}.
 *
 * @return the record schema with that single field set
 */
private static Schema initRecordKeySchema() {
  Schema.Field keyField = new Schema.Field(
      HoodieRecord.RECORD_KEY_METADATA_FIELD, METADATA_FIELD_SCHEMA, "", NullNode.getInstance());

  Schema keySchema = Schema.createRecord("HoodieRecordKey", "", "", false);
  keySchema.setFields(Collections.singletonList(keyField));
  return keySchema;
}
 
Example 16
Source Project: Cubert   File: PigSchema2Avro.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Validates a Pig tuple against an Avro record schema and produces the
 * resulting Avro record schema.
 *
 * <p>If {@code avroSchema} is not a record, the Pig tuple must hold exactly one
 * field, which is validated against {@code avroSchema} directly (so an Avro
 * schema "int" is compatible with a Pig schema "T:(int)").
 *
 * <p>For a record, each Pig field is paired with the Avro field at the same
 * index. With a partial schema the Avro field is looked up through
 * {@code AvroStorageUtils.getUDField}; a Pig field without an Avro counterpart
 * is converted as nullable, and one whose Avro field has no schema is converted
 * as not-null.
 *
 * @param avroSchema the Avro schema to validate against
 * @param pigFields the fields of the Pig tuple
 * @return the validated/converted Avro schema
 * @throws IOException if the number of fields does not match what is expected
 */
protected static Schema validateAndConvertRecord(Schema avroSchema, ResourceFieldSchema[] pigFields) throws IOException {

    /* Unwrap the Pig tuple when the Avro schema is not a record. */
    if (!avroSchema.getType().equals(Schema.Type.RECORD)) {
        if (pigFields.length != 1) {
            throw new IOException("Expect only one field in Pig tuple schema. Avro schema is " + avroSchema.getType());
        }
        return validateAndConvert(avroSchema, pigFields[0]);
    }

    /* From here on: a Pig tuple against an Avro record. */
    boolean isPartialSchema = AvroStorageUtils.isUDPartialRecordSchema(avroSchema);
    AvroStorageLog.details("isPartialSchema=" + isPartialSchema);

    String typeName;
    if (isPartialSchema) {
        typeName = getRecordName();
    } else {
        typeName = avroSchema.getName();
    }
    Schema outSchema = Schema.createRecord(typeName, avroSchema.getDoc(), avroSchema.getNamespace(), false);

    List<Schema.Field> inFields = avroSchema.getFields();
    if (!isPartialSchema && inFields.size() != pigFields.length) {
        throw new IOException("Expect " + inFields.size() + " fields in pig schema." + " But there are " + pigFields.length);
    }

    List<Schema.Field> outFields = new ArrayList<Schema.Field>();

    for (int i = 0; i < pigFields.length; i++) {
        ResourceFieldSchema pigField = pigFields[i];

        /* the user defined avro field for this position, if any */
        Field inputField;
        if (isPartialSchema) {
            inputField = AvroStorageUtils.getUDField(avroSchema, i);
        } else {
            inputField = inFields.get(i);
        }

        /* decide how the field schema is produced */
        Schema fieldSchema;
        if (inputField == null) {
            /* no avro field: convert the pig schema as nullable */
            fieldSchema = convert(pigField, true);
        } else if (inputField.schema() == null) {
            /* avro field without schema: convert the pig schema as not-null */
            fieldSchema = convert(pigField, false);
        } else {
            /* check the pig field against the given avro schema */
            fieldSchema = validateAndConvert(inputField.schema(), pigField);
        }

        /* name and doc come from pig for partial schemas, from avro otherwise */
        String outname;
        String doc;
        if (isPartialSchema) {
            outname = pigField.getName();
            doc = pigField.getDescription();
        } else {
            outname = inputField.name();
            doc = inputField.doc();
        }
        if (outname == null) {
            outname = FIELD_NAME + "_" + i; // field name cannot be null
        }

        JsonNode defaultvalue = null;
        if (inputField != null) {
            defaultvalue = inputField.defaultValue();
        }

        outFields.add(new Field(outname, fieldSchema, doc, defaultvalue));
    }

    outSchema.setFields(outFields);
    return outSchema;

}
 
Example 17
/**
 * The MapReduce driver: parses the command line, configures the job and runs it.
 *
 * <p>The input is read through {@code AvroParquetInputFormat} with a record
 * filter and a projection limited to the "symbol" and "open" fields of
 * {@code Stock.SCHEMA$}.
 *
 * @param args the command-line arguments
 * @return the result of command-line parsing when it is non-zero; otherwise 0
 *         when the job completes successfully and 1 when it does not
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

  Cli cli = Cli.builder().setArgs(args).addOptions(CliCommonOpts.MrIoOpts.values()).build();
  int cliResult = cli.runCmd();
  if (cliResult != 0) {
    return cliResult;
  }

  Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
  Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

  Configuration conf = super.getConf();

  Job job = new Job(conf);
  job.setJarByClass(AvroProjectionParquetMapReduce.class);

  job.setInputFormatClass(AvroParquetInputFormat.class);
  AvroParquetInputFormat.setInputPaths(job, inputPath);

  // predicate pushdown
  AvroParquetInputFormat.setUnboundRecordFilter(job, GoogleStockFilter.class);

  // projection pushdown: only "symbol" and "open" are kept
  Schema stockSchema = Stock.SCHEMA$;
  Schema projection = Schema.createRecord(stockSchema.getName(),
      stockSchema.getDoc(), stockSchema.getNamespace(), false);
  List<Schema.Field> projectedFields = Lists.newArrayList();
  for (Schema.Field candidate : Stock.SCHEMA$.getFields()) {
    boolean wanted = "symbol".equals(candidate.name()) || "open".equals(candidate.name());
    if (!wanted) {
      continue;
    }
    projectedFields.add(new Schema.Field(candidate.name(), candidate.schema(), candidate.doc(),
        candidate.defaultValue(), candidate.order()));
  }
  projection.setFields(projectedFields);
  AvroParquetInputFormat.setRequestedProjection(job, projection);

  job.setMapperClass(Map.class);
  job.setReducerClass(Reduce.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(DoubleWritable.class);

  job.setOutputFormatClass(AvroParquetOutputFormat.class);
  FileOutputFormat.setOutputPath(job, outputPath);
  AvroParquetOutputFormat.setSchema(job, StockAvg.SCHEMA$);

  boolean succeeded = job.waitForCompletion(true);
  return succeeded ? 0 : 1;
}
 
Example 18
Source Project: parquet-mr   File: Schemas.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Merges two {@link Schema} instances or returns {@code null}.
 * <p>
 * Equal schemas are returned as-is. INT/LONG pairs merge to the LONG schema and
 * FLOAT/DOUBLE pairs merge to the DOUBLE schema; integers are not promoted to
 * floats. Any other pair of differing types cannot be merged.
 * <p>
 * Schemas of the same type are merged per type: unions are combined, records
 * are merged when their names are equal (unnamed records additionally need a
 * field similarity of at least {@code SIMILARITY_THRESH}), maps and arrays
 * merge their value/element types, and enums with equal names combine their
 * symbols.
 *
 * @param left a {@code Schema}
 * @param right a {@code Schema}
 * @return a merged {@code Schema} or {@code null} if merging is not possible
 * @throws UnsupportedOperationException if both schemas share a type that is
 *         not handled here
 * @see #mergeOrUnion
 */
private static Schema mergeOnly(Schema left, Schema right) {
  if (Objects.equal(left, right)) {
    return left;
  }

  Schema.Type leftType = left.getType();
  Schema.Type rightType = right.getType();

  // primitive type promotion: the wider schema wins; integers never become floats
  if (leftType == Schema.Type.INT && rightType == Schema.Type.LONG) {
    return right;
  }
  if (leftType == Schema.Type.LONG && rightType == Schema.Type.INT) {
    return left;
  }
  if (leftType == Schema.Type.FLOAT && rightType == Schema.Type.DOUBLE) {
    return right;
  }
  if (leftType == Schema.Type.DOUBLE && rightType == Schema.Type.FLOAT) {
    return left;
  }

  // any remaining type mismatch has to be handled with a union by the caller
  if (leftType != rightType) {
    return null;
  }

  switch (leftType) {
    case UNION:
      return union(left, right);

    case RECORD:
      boolean bothUnnamed = left.getName() == null && right.getName() == null;
      if (bothUnnamed && fieldSimilarity(left, right) < SIMILARITY_THRESH) {
        return null;
      }
      if (!Objects.equal(left.getName(), right.getName())) {
        return null;
      }

      Schema mergedRecord = Schema.createRecord(
          coalesce(left.getName(), right.getName()),
          coalesce(left.getDoc(), right.getDoc()),
          coalesce(left.getNamespace(), right.getNamespace()),
          false
      );
      mergedRecord.setFields(mergeFields(left, right));
      return mergedRecord;

    case MAP:
      Schema mergedValue = mergeOrUnion(left.getValueType(), right.getValueType());
      return Schema.createMap(mergedValue);

    case ARRAY:
      Schema mergedElement = mergeOrUnion(left.getElementType(), right.getElementType());
      return Schema.createArray(mergedElement);

    case ENUM:
      if (!Objects.equal(left.getName(), right.getName())) {
        return null;
      }
      // linked set: left symbols first, then right symbols not already present
      Set<String> allSymbols = Sets.newLinkedHashSet();
      allSymbols.addAll(left.getEnumSymbols());
      allSymbols.addAll(right.getEnumSymbols());
      return Schema.createEnum(
          left.getName(),
          coalesce(left.getDoc(), right.getDoc()),
          coalesce(left.getNamespace(), right.getNamespace()),
          ImmutableList.copyOf(allSymbols)
      );

    default:
      // all primitives are handled before the switch by the equality check.
      // schemas that reach this point are not primitives and also not any of
      // the above known types.
      throw new UnsupportedOperationException(
          "Unknown schema type: " + left.getType());
  }
}
 
Example 19
/**
 * Creates a record schema named {@code "dynamic"} with no fields and returns the
 * result of {@code AvroUtils.setIncludeAllFields(schema, true)} for it.
 *
 * @return the schema returned by {@code AvroUtils.setIncludeAllFields}
 */
public Schema getFullDynamicSchema() {
    Schema dynamic = Schema.createRecord("dynamic", null, null, false);
    dynamic.setFields(new ArrayList<Field>());
    return AvroUtils.setIncludeAllFields(dynamic, true);
}
 
Example 20
@Test
public void testReadWrite() throws Exception {

  conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, false);
  final Job job = new Job(conf, "read");
  job.setInputFormatClass(AvroParquetInputFormat.class);
  AvroParquetInputFormat.setInputPaths(job, parquetPath);
  // Test push-down predicates by using an electric car filter
  AvroParquetInputFormat.setUnboundRecordFilter(job, ElectricCarFilter.class);

  // Test schema projection by dropping the optional extras
  Schema projection = Schema.createRecord(CAR_SCHEMA.getName(),
      CAR_SCHEMA.getDoc(), CAR_SCHEMA.getNamespace(), false);
  List<Schema.Field> fields = Lists.newArrayList();
  for (Schema.Field field : ReflectData.get().getSchema(Car.class).getFields()) {
    if (!"optionalExtra".equals(field.name())) {
      fields.add(new Schema.Field(field.name(), field.schema(), field.doc(),
          field.defaultVal(), field.order()));
    }
  }
  projection.setFields(fields);
  AvroParquetInputFormat.setRequestedProjection(job, projection);

  job.setMapperClass(TestReflectInputOutputFormat.MyMapper2.class);
  job.setNumReduceTasks(0);

  job.setOutputFormatClass(AvroParquetOutputFormat.class);
  AvroParquetOutputFormat.setOutputPath(job, outputPath);
  AvroParquetOutputFormat.setSchema(job, CAR_SCHEMA);

  waitForJob(job);

  final Path mapperOutput = new Path(outputPath.toString(),
      "part-m-00000.parquet");
  try(final AvroParquetReader<Car> out = new AvroParquetReader<Car>(conf, mapperOutput)) {
    Car car;
    Car previousCar = null;
    int lineNumber = 0;
    while ((car = out.read()) != null) {
      if (previousCar != null) {
        // Testing reference equality here. The "model" field should be dictionary-encoded.
        assertTrue(car.model == previousCar.model);
      }
      // Make sure that predicate push down worked as expected
      if (car.engine.type == EngineType.PETROL) {
        fail("UnboundRecordFilter failed to remove cars with PETROL engines");
      }
      // Note we use lineNumber * 2 because of predicate push down
      Car expectedCar = nextRecord(lineNumber * 2);
      // We removed the optional extra field using projection so we shouldn't
      // see it here...
      expectedCar.optionalExtra = null;
      assertEquals("line " + lineNumber, expectedCar, car);
      ++lineNumber;
      previousCar = car;
    }
  }
}