Java Code Examples for org.apache.avro.file.DataFileReader#getSchema()

The following examples show how to use org.apache.avro.file.DataFileReader#getSchema(). They are drawn from open-source projects; the source file, originating project, and license are noted above each example.
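
Before the project examples, here is a minimal, self-contained sketch of the basic call: open an Avro container file with a GenericDatumReader and read the writer schema that getSchema() returns from the file header. The class name and file path are placeholders, not taken from any of the projects below.

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;

public class PrintAvroSchema {
  public static void main(String[] args) throws IOException {
    // Placeholder path; point this at any Avro container file.
    File avroFile = new File("example.avro");
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    // DataFileReader is Closeable, so try-with-resources closes it for us.
    try (DataFileReader<GenericRecord> fileReader =
             new DataFileReader<>(avroFile, datumReader)) {
      // getSchema() returns the writer schema stored in the file header.
      Schema schema = fileReader.getSchema();
      System.out.println(schema.toString(true)); // pretty-printed JSON
    }
  }
}
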
Example 1
Source File: AvroToDdlTool.java    From DataflowTemplates with Apache License 2.0
public static void main(String[] args) throws IOException {
  if (args.length == 0) {
    System.out.println("Please specify the avro files");
    System.exit(1);
  }

  List<Schema> schemaList = new ArrayList<>();
  for (String filePath : args) {
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    DataFileReader<GenericRecord> dataFileReader =
        new DataFileReader<>(new File(filePath), datumReader);
    Schema schema = dataFileReader.getSchema();
    System.out.println(schema.toString(true));
    schemaList.add(schema);
  }
  Ddl ddl = new AvroSchemaToDdlConverter().toDdl(schemaList);
  ddl.prettyPrint(System.out);
}
 
Example 2
Source File: TestAvroEventSerializer.java    From mt-flume with Apache License 2.0
public void validateAvroFile(File file) throws IOException {
  // read the events back using GenericRecord
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  DataFileReader<GenericRecord> fileReader =
      new DataFileReader<GenericRecord>(file, reader);
  GenericRecord record = new GenericData.Record(fileReader.getSchema());
  int numEvents = 0;
  while (fileReader.hasNext()) {
    fileReader.next(record);
    String bodyStr = record.get("message").toString();
    System.out.println(bodyStr);
    numEvents++;
  }
  fileReader.close();
  Assert.assertEquals("Should have found a total of 3 events", 3, numEvents);
}
 
Example 3
Source File: TestAvroImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testOverrideTypeMapping() throws IOException {
  String [] types = { "INT" };
  String [] vals = { "10" };
  createTableWithColTypes(types, vals);

  String [] extraArgs = { "--map-column-java", "DATA_COL0=String"};

  runImport(getOutputArgv(true, extraArgs));

  Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
  DataFileReader<GenericRecord> reader = read(outputFile);
  Schema schema = reader.getSchema();
  assertEquals(Schema.Type.RECORD, schema.getType());
  List<Field> fields = schema.getFields();
  assertEquals(types.length, fields.size());

  checkField(fields.get(0), "DATA_COL0", Schema.Type.STRING);

  GenericRecord record1 = reader.next();
  assertEquals("DATA_COL0", new Utf8("10"), record1.get("DATA_COL0"));
}
 
Example 4
Source File: TestAvroImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testFirstUnderscoreInColumnName() throws IOException {
  String [] names = { "_NAME" };
  String [] types = { "INT" };
  String [] vals = { "1987" };
  createTableWithColTypesAndNames(names, types, vals);

  runImport(getOutputArgv(true, null));

  Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
  DataFileReader<GenericRecord> reader = read(outputFile);
  Schema schema = reader.getSchema();
  assertEquals(Schema.Type.RECORD, schema.getType());
  List<Field> fields = schema.getFields();
  assertEquals(types.length, fields.size());

  checkField(fields.get(0), "__NAME", Type.INT);

  GenericRecord record1 = reader.next();
  assertEquals("__NAME", 1987, record1.get("__NAME"));
}
 
Example 5
Source File: TestAvroImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testNonstandardCharactersInColumnName() throws IOException {
  String [] names = { "avro\uC3A11" };
  String [] types = { "INT" };
  String [] vals = { "1987" };
  createTableWithColTypesAndNames(names, types, vals);

  runImport(getOutputArgv(true, null));

  Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
  DataFileReader<GenericRecord> reader = read(outputFile);
  Schema schema = reader.getSchema();
  assertEquals(Schema.Type.RECORD, schema.getType());
  List<Field> fields = schema.getFields();
  assertEquals(types.length, fields.size());

  checkField(fields.get(0), "AVRO1", Type.INT);

  GenericRecord record1 = reader.next();
  assertEquals("AVRO1", 1987, record1.get("AVRO1"));
}
 
Example 6
Source File: TestAvroImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testNonIdentCharactersInColumnName() throws IOException {
  String [] names = { "test_a-v+r/o" };
  String [] types = { "INT" };
  String [] vals = { "2015" };
  createTableWithColTypesAndNames(names, types, vals);

  runImport(getOutputArgv(true, null));

  Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
  DataFileReader<GenericRecord> reader = read(outputFile);
  Schema schema = reader.getSchema();
  assertEquals(Schema.Type.RECORD, schema.getType());
  List<Field> fields = schema.getFields();
  assertEquals(types.length, fields.size());

  checkField(fields.get(0), "TEST_A_V_R_O", Type.INT);

  GenericRecord record1 = reader.next();
  assertEquals("TEST_A_V_R_O", 2015, record1.get("TEST_A_V_R_O"));
}
 
Example 7
Source File: TestFlumeEventAvroEventSerializer.java    From mt-flume with Apache License 2.0
public void validateAvroFile(File file) throws IOException {
  // read the events back using GenericRecord
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  DataFileReader<GenericRecord> fileReader =
      new DataFileReader<GenericRecord>(file, reader);
  GenericRecord record = new GenericData.Record(fileReader.getSchema());
  int numEvents = 0;
  while (fileReader.hasNext()) {
    fileReader.next(record);
    ByteBuffer body = (ByteBuffer) record.get("body");
    CharsetDecoder decoder = Charsets.UTF_8.newDecoder();
    String bodyStr = decoder.decode(body).toString();
    System.out.println(bodyStr);
    numEvents++;
  }
  fileReader.close();
  Assert.assertEquals("Should have found a total of 3 events", 3, numEvents);
}
 
Example 8
Source File: BinaryAvroSchemaFileReader.java    From pxf with Apache License 2.0
@Override
public Schema readSchema(Configuration configuration, String schemaName, HcfsType hcfsType, AvroUtilities.FileSearcher fileSearcher) throws IOException {
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    DataFileReader<GenericRecord> fileReader = null;

    try {
        File file = fileSearcher.searchForFile(schemaName);
        if (file == null) {
            final Path path = new Path(hcfsType.getDataUri(configuration, schemaName));
            FsInput inStream = new FsInput(path, configuration);
            fileReader = new DataFileReader<>(inStream, datumReader);
        } else {
            fileReader = new DataFileReader<>(file, datumReader);
        }
        return fileReader.getSchema();
    } finally {
        if (fileReader != null) {
            fileReader.close();
        }
    }
}
 
Example 9
Source File: FileAwareInputStreamExtractorWithCheckSchema.java    From incubator-gobblin with Apache License 2.0
/**
 * Use {@link AvroSchemaCheckStrategy} to make sure the actual schema and the expected schema have matching field names and types.
 * @param fsFromFile
 * @return
 * @throws IOException
 */
protected boolean schemaChecking(FileSystem fsFromFile) throws IOException {
  if( !this.state.getPropAsBoolean(CopySource.SCHEMA_CHECK_ENABLED, CopySource.DEFAULT_SCHEMA_CHECK_ENABLED) ) {
    return true;
  }
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  DataFileReader<GenericRecord> dataFileReader =
      new DataFileReader(new FsInput(this.file.getFileStatus().getPath(), new Configuration()), datumReader);
  Schema schema = dataFileReader.getSchema();
  if(this.state.getProp(ConfigurationKeys.COPY_EXPECTED_SCHEMA) == null) {
    throw new IOException("Expected schema is not set properly");
  }
  Schema expectedSchema = new Schema.Parser().parse(this.state.getProp(ConfigurationKeys.COPY_EXPECTED_SCHEMA));
  AvroSchemaCheckStrategy strategy = AvroSchemaCheckStrategy.AvroSchemaCheckStrategyFactory.create(this.state);
  if(strategy == null) {
    throw new IOException("schema check strategy cannot be initialized");
  }
  return strategy.compare(expectedSchema,schema);
}
 
Example 10
Source File: AvroUtils.java    From Cubert with Apache License 2.0
public static Schema getSchema(SeekableInput input) throws IOException
{
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
    DataFileReader<GenericRecord> dataFileReader =
            new DataFileReader<GenericRecord>(input, datumReader);
    Schema schema = dataFileReader.getSchema();

    if (PadDefaultNullsToSchema)
    {
        // a list of "cloned" fields, with optional default value set to null
        ArrayList<Field> paddedFields = new ArrayList<Field>();

        for (Field field: schema.getFields())
        {
            // should this field be padded?
            boolean needsNullPadding = (field.schema() != null) // the field has nested schema
                && (field.schema().getType().equals(Type.UNION)) // the nested schema is UNION
                && (field.schema().getTypes().get(0).getType().equals(Type.NULL)); // the first element of union is NULL type

            JsonNode defValue = needsNullPadding ? NullNode.getInstance() : field.defaultValue();

            Field f = new Field(field.name(), field.schema(), field.doc(), defValue);
            paddedFields.add(f);
        }

        schema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), schema.isError());
        schema.setFields(paddedFields);
    }

    return schema;
}
 
Example 11
Source File: Purge.java    From Cubert with Apache License 2.0
private DataFileWriter<GenericRecord> createDataFileWriter(DataFileReader<GenericRecord> dataFileReader) throws IllegalArgumentException,
        IOException
{
    Schema schema = dataFileReader.getSchema();
    DatumWriter<GenericRecord> datumWriter =
            new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> writer =
            new DataFileWriter<GenericRecord>(datumWriter);

    // Get the codec of the reader
    String codecStr = dataFileReader.getMetaString(DataFileConstants.CODEC);
    int level = conf.getInt("avro.mapred.deflate.level", 1);
    String codecName = conf.get("avro.output.codec", codecStr);
    CodecFactory factory =
            codecName.equals("deflate") ? CodecFactory.deflateCodec(level)
                    : CodecFactory.fromString(codecName);

    // Set the codec of the writer
    writer.setCodec(factory);

    writer.setSyncInterval(conf.getInt("avro.mapred.sync.interval",
                                       Math.max(conf.getInt("io.file.buffer.size",
                                                            16000), 16000)));

    writer.create(schema,
                  new Path(tempFileName).getFileSystem(conf)
                                        .create(new Path(tempFileName)));
    return writer;
}
 
Example 12
Source File: AvroHdfsDataWriterTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testWrite() throws IOException {
  // Write all test records
  for (String record : TestConstants.JSON_RECORDS) {
    this.writer.write(convertRecord(record));
  }

  Assert.assertEquals(this.writer.recordsWritten(), 3);

  this.writer.close();
  this.writer.commit();

  File outputFile =
      new File(TestConstants.TEST_OUTPUT_DIR + Path.SEPARATOR + this.filePath, TestConstants.TEST_FILE_NAME);
  DataFileReader<GenericRecord> reader =
      new DataFileReader<>(outputFile, new GenericDatumReader<GenericRecord>());
  Schema fileSchema = reader.getSchema();
  Assert.assertEquals(fileSchema.getProp(TEST_PROPERTY_KEY), TEST_PROPERTY_VALUE);

  // Read the records back and assert they are identical to the ones written
  GenericRecord user1 = reader.next();
  // Strings are in UTF8, so we have to call toString() here and below
  Assert.assertEquals(user1.get("name").toString(), "Alyssa");
  Assert.assertEquals(user1.get("favorite_number"), 256);
  Assert.assertEquals(user1.get("favorite_color").toString(), "yellow");

  GenericRecord user2 = reader.next();
  Assert.assertEquals(user2.get("name").toString(), "Ben");
  Assert.assertEquals(user2.get("favorite_number"), 7);
  Assert.assertEquals(user2.get("favorite_color").toString(), "red");

  GenericRecord user3 = reader.next();
  Assert.assertEquals(user3.get("name").toString(), "Charlie");
  Assert.assertEquals(user3.get("favorite_number"), 68);
  Assert.assertEquals(user3.get("favorite_color").toString(), "blue");

  reader.close();

  FsWriterMetrics metrics = FsWriterMetrics.fromJson(properties.getProp(FsDataWriter.FS_WRITER_METRICS_KEY));
  Assert.assertEquals(metrics.fileInfos.size(),1);
  FsWriterMetrics.FileInfo fileInfo = metrics.fileInfos.iterator().next();

  Assert.assertEquals(fileInfo.fileName, TestConstants.TEST_FILE_NAME);
  Assert.assertEquals(fileInfo.numRecords, 3);
  Assert.assertNull(metrics.partitionInfo.partitionKey);
  Assert.assertEquals(metrics.partitionInfo.branchId, 0);
}
 
Example 13
Source File: TestSyslogAvroEventSerializer.java    From mt-flume with Apache License 2.0
@Test
public void test() throws FileNotFoundException, IOException {
  // Snappy currently broken on Mac in OpenJDK 7 per FLUME-2012
  Assume.assumeTrue(!"Mac OS X".equals(System.getProperty("os.name")) ||
    !System.getProperty("java.version").startsWith("1.7."));

  //Schema schema = new Schema.Parser().parse(schemaFile);

  // create the file, write some data
  OutputStream out = new FileOutputStream(testFile);
  String builderName = SyslogAvroEventSerializer.Builder.class.getName();

  Context ctx = new Context();
  ctx.put("syncInterval", "4096");
  ctx.put("compressionCodec", "snappy");

  EventSerializer serializer =
      EventSerializerFactory.getInstance(builderName, ctx, out);
  serializer.afterCreate(); // must call this when a file is newly created

  List<Event> events = generateSyslogEvents();
  for (Event e : events) {
    serializer.write(e);
  }
  serializer.flush();
  serializer.beforeClose();
  out.flush();
  out.close();

  // now try to read the file back

  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  DataFileReader<GenericRecord> fileReader =
      new DataFileReader<GenericRecord>(testFile, reader);

  GenericRecord record = new GenericData.Record(fileReader.getSchema());
  int numEvents = 0;
  while (fileReader.hasNext()) {
    fileReader.next(record);
    int facility = (Integer) record.get("facility");
    int severity = (Integer) record.get("severity");
    long timestamp = (Long) record.get("timestamp");
    String hostname = record.get("hostname").toString();
    String message = record.get("message").toString();

    Assert.assertEquals("Facility should be 1", 1, facility);
    System.out.println(timestamp + ": " + message);
    numEvents++;
  }

  fileReader.close();
  Assert.assertEquals("Should have found a total of 3 events", 3, numEvents);

  FileUtils.forceDelete(testFile);
}
 
Example 14
Source File: TestAvroImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
/**
 * Helper method that runs an import using Avro with optional command line
 * arguments and checks that the created file matches the expectations.
 * <p/>
 * This can be used to test various extra options that are implemented for
 * the Avro input.
 *
 * @param extraArgs extra command line arguments to pass to Sqoop in addition
 *                  to those that {@link #getOutputArgv(boolean, String[])}
 *                  returns
 */
private void avroImportTestHelper(String[] extraArgs, String codec)
  throws IOException {
  String[] types =
    {"BIT", "INTEGER", "BIGINT", "REAL", "DOUBLE", "VARCHAR(6)",
      "VARBINARY(2)", };
  String[] vals = {"true", "100", "200", "1.0", "2.0", "'s'", "'0102'", };
  createTableWithColTypes(types, vals);

  runImport(getOutputArgv(true, extraArgs));

  Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
  DataFileReader<GenericRecord> reader = read(outputFile);
  Schema schema = reader.getSchema();
  assertEquals(Schema.Type.RECORD, schema.getType());
  List<Field> fields = schema.getFields();
  assertEquals(types.length, fields.size());

  checkField(fields.get(0), "DATA_COL0", Schema.Type.BOOLEAN);
  checkField(fields.get(1), "DATA_COL1", Schema.Type.INT);
  checkField(fields.get(2), "DATA_COL2", Schema.Type.LONG);
  checkField(fields.get(3), "DATA_COL3", Schema.Type.FLOAT);
  checkField(fields.get(4), "DATA_COL4", Schema.Type.DOUBLE);
  checkField(fields.get(5), "DATA_COL5", Schema.Type.STRING);
  checkField(fields.get(6), "DATA_COL6", Schema.Type.BYTES);

  GenericRecord record1 = reader.next();
  assertEquals("DATA_COL0", true, record1.get("DATA_COL0"));
  assertEquals("DATA_COL1", 100, record1.get("DATA_COL1"));
  assertEquals("DATA_COL2", 200L, record1.get("DATA_COL2"));
  assertEquals("DATA_COL3", 1.0f, record1.get("DATA_COL3"));
  assertEquals("DATA_COL4", 2.0, record1.get("DATA_COL4"));
  assertEquals("DATA_COL5", new Utf8("s"), record1.get("DATA_COL5"));
  Object object = record1.get("DATA_COL6");
  assertTrue(object instanceof ByteBuffer);
  ByteBuffer b = ((ByteBuffer) object);
  assertEquals((byte) 1, b.get(0));
  assertEquals((byte) 2, b.get(1));

  if (codec != null) {
    assertEquals(codec, reader.getMetaString(DataFileConstants.CODEC));
  }

  checkSchemaFile(schema);
}
 
Example 15
Source File: TestJavaAvroEventSerializer.java    From flume-plugins with MIT License
@Test
public void test() throws FileNotFoundException, IOException {

    // create the file, write some data
    OutputStream out = new FileOutputStream(testFile);
    String builderName = JavaLogAvroEventSerializer.Builder.class.getName();

    Context ctx = new Context();
    ctx.put("syncInterval", "4096");

    EventSerializer serializer =
            EventSerializerFactory.getInstance(builderName, ctx, out);
    serializer.afterCreate(); // must call this when a file is newly created

    List<Event> events = generateJavaEvents();
    for (Event e : events) {
        serializer.write(e);
    }
    serializer.flush();
    serializer.beforeClose();
    out.flush();
    out.close();

    // now try to read the file back

    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileReader<GenericRecord> fileReader =
            new DataFileReader<GenericRecord>(testFile, reader);

    GenericRecord record = new GenericData.Record(fileReader.getSchema());
    int numEvents = 0;
    while (fileReader.hasNext()) {
        fileReader.next(record);
        long timestamp = (Long) record.get("timestamp");
        String datetime = record.get("datetime").toString();
        String classname = record.get("classname").toString();
        String message = record.get("message").toString();

        System.out.println(classname + ": " + message + " (at " + datetime + ")");
        numEvents++;
    }

    fileReader.close();
    Assert.assertEquals("Should have found a total of 4 events", 4, numEvents);

    FileUtils.forceDelete(testFile);
}
 
Example 16
Source File: TestSyslogAvroEventSerializer.java    From flume-plugins with MIT License
@Test
public void test() throws FileNotFoundException, IOException {

    // create the file, write some data
    OutputStream out = new FileOutputStream(testFile);
    String builderName = SyslogAvroEventSerializer.Builder.class.getName();

    Context ctx = new Context();
    ctx.put("syncInterval", "4096");
    ctx.put("path", "src/test/resources/customerToHostsFile.txt");

    EventSerializer serializer =
            EventSerializerFactory.getInstance(builderName, ctx, out);
    serializer.afterCreate(); // must call this when a file is newly created

    List<Event> events = generateSyslogEvents();
    for (Event e : events) {
        serializer.write(e);
    }
    serializer.flush();
    serializer.beforeClose();
    out.flush();
    out.close();

    // now try to read the file back

    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileReader<GenericRecord> fileReader =
            new DataFileReader<GenericRecord>(testFile, reader);

    GenericRecord record = new GenericData.Record(fileReader.getSchema());
    int numEvents = 0;
    while (fileReader.hasNext()) {
        fileReader.next(record);
        long timestamp = (Long) record.get("timestamp");
        String datetime = record.get("datetime").toString();
        String hostname = record.get("hostname").toString();
        Map<String, String> headers = (Map<String, String>) record.get("headers");
        String message = record.get("message").toString();

        System.out.println(hostname + " (" + headers + ")" + ": " + message);
        numEvents++;
    }

    fileReader.close();
    Assert.assertEquals("Should have found a total of 6 events", 6, numEvents);

    FileUtils.forceDelete(testFile);
}
 
Example 17
Source File: TestApacheAvroEventSerializer.java    From flume-plugins with MIT License
@Test
public void test() throws FileNotFoundException, IOException {

    // create the file, write some data
    OutputStream out = new FileOutputStream(testFile);
    String builderName = ApacheLogAvroEventSerializer.Builder.class.getName();

    Context ctx = new Context();
    ctx.put("syncInterval", "4096");

    EventSerializer serializer =
            EventSerializerFactory.getInstance(builderName, ctx, out);
    serializer.afterCreate(); // must call this when a file is newly created

    List<Event> events = generateApacheEvents();
    for (Event e : events) {
        serializer.write(e);
    }
    serializer.flush();
    serializer.beforeClose();
    out.flush();
    out.close();

    // now try to read the file back

    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileReader<GenericRecord> fileReader =
            new DataFileReader<GenericRecord>(testFile, reader);

    GenericRecord record = new GenericData.Record(fileReader.getSchema());
    int numEvents = 0;
    while (fileReader.hasNext()) {
        fileReader.next(record);
        String ip = record.get("ip").toString();
        String uri = record.get("uri").toString();
        Integer statuscode = (Integer) record.get("statuscode");
        String original = record.get("original").toString();
        String connectionstatus = record.get("connectionstatus").toString();

        Assert.assertEquals("Ip should be 80.79.194.3", "80.79.194.3", ip);
        System.out.println("IP " + ip + " requested: " + uri + " with status code " + statuscode + " and connectionstatus: " + connectionstatus);
        System.out.println("Original logline: " + original);
        numEvents++;
    }

    fileReader.close();
    Assert.assertEquals("Should have found a total of 3 events", 2, numEvents);

    FileUtils.forceDelete(testFile);
}