Java Code Examples for org.apache.avro.file.DataFileStream#getSchema()

The following examples show how to use org.apache.avro.file.DataFileStream#getSchema(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: AvroStorageUtils.java    From Cubert with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the schema embedded in an Avro data file. This method is called by
 * {@link #getAvroSchema}. The file to read is selected by
 * {@code AvroStorageUtils.getLast(path, fs)}: the default implementation uses the avro
 * file itself, or the last file in a first-level directory (one that does not contain
 * sub-directories).
 *
 * @param path  path of a file or first level directory
 * @param fs  file system
 * @return avro schema, or {@code null} when no file could be selected
 * @throws IOException if the file cannot be opened or its Avro header cannot be read
 */
public static Schema getSchema(Path path, FileSystem fs) throws IOException {
    /* get path of the last file */
    Path lastFile = AvroStorageUtils.getLast(path, fs);
    if (lastFile == null) {
        return null;
    }

    /* read in file and obtain schema */
    GenericDatumReader<Object> avroReader = new GenericDatumReader<Object>();
    InputStream hdfsInputStream = fs.open(lastFile);
    DataFileStream<Object> avroDataStream = null;
    try {
        // The constructor reads the file header and may throw, e.g. for a non-Avro file.
        avroDataStream = new DataFileStream<Object>(hdfsInputStream, avroReader);
        return avroDataStream.getSchema();
    } finally {
        if (avroDataStream != null) {
            // Closing the Avro stream also closes the stream it wraps.
            avroDataStream.close();
        } else {
            // Construction failed: nothing owns the raw stream yet, so release it here.
            hdfsInputStream.close();
        }
    }
}
 
Example 2
Source File: AvroStorageUtils.java    From spork with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the schema embedded in an Avro data file. This method is called by
 * {@link #getAvroSchema}. The file to read is selected by
 * {@code AvroStorageUtils.getLast(path, fs)}: the default implementation uses the avro
 * file itself, or the last file in a first-level directory (one that does not contain
 * sub-directories).
 *
 * @param path  path of a file or first level directory
 * @param fs  file system
 * @return avro schema, or {@code null} when no file could be selected
 * @throws IOException if the file cannot be opened or its Avro header cannot be read
 */
public static Schema getSchema(Path path, FileSystem fs) throws IOException {
    /* get path of the last file */
    Path lastFile = AvroStorageUtils.getLast(path, fs);
    if (lastFile == null) {
        return null;
    }

    /* read in file and obtain schema */
    GenericDatumReader<Object> avroReader = new GenericDatumReader<Object>();
    InputStream hdfsInputStream = fs.open(lastFile);
    DataFileStream<Object> avroDataStream = null;
    try {
        // The constructor reads the file header and may throw, e.g. for a non-Avro file.
        avroDataStream = new DataFileStream<Object>(hdfsInputStream, avroReader);
        return avroDataStream.getSchema();
    } finally {
        if (avroDataStream != null) {
            // Closing the Avro stream also closes the stream it wraps.
            avroDataStream.close();
        } else {
            // Construction failed: nothing owns the raw stream yet, so release it here.
            hdfsInputStream.close();
        }
    }
}
 
Example 3
Source File: PentahoAvroInputFormat.java    From pentaho-hadoop-shims with Apache License 2.0 6 votes vote down vote up
/**
 * Resolves the Avro schema to use, trying the configured sources in this order:
 * <ol>
 *   <li>the incoming string field named by {@code schemaFieldName}, when
 *       {@code useFieldAsSchema} is set;</li>
 *   <li>the schema file named by {@code schemaFileName}, when it is non-empty;</li>
 *   <li>the schema embedded in the Avro data file or incoming stream, when a file name
 *       is set or an input stream was supplied.</li>
 * </ol>
 *
 * @return the parsed Avro schema
 * @throws Exception if none of the sources is configured, or reading/parsing one fails
 */
@VisibleForTesting
public Schema readAvroSchema() throws Exception {
  if ( useFieldAsSchema ) {
    return new Schema.Parser().parse( ( (String) incomingFields[ determineStringFieldIndex( schemaFieldName ) ] ) );
  }

  if ( schemaFileName != null && schemaFileName.length() > 0 ) {
    // Close the schema file stream ourselves rather than rely on the parser to do it.
    java.io.InputStream schemaStream = KettleVFS.getInputStream( schemaFileName, variableSpace );
    try {
      return new Schema.Parser().parse( schemaStream );
    } finally {
      schemaStream.close();
    }
  }

  if ( ( fileName != null && fileName.length() > 0 ) || ( useFieldAsInputStream && inputStream != null ) ) {
    DataFileStream<GenericRecord> dataFileStream = createDataFileStream();
    try {
      return dataFileStream.getSchema();
    } finally {
      dataFileStream.close();
    }
  }

  throw new Exception( "The file you provided does not contain a schema."
    + "  Please choose a schema file, or another file that contains a schema." );
}
 
Example 4
Source File: TestPutHiveStreaming.java    From nifi with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that the Avro content of {@code resultFlowFile} carries the expected records.
 * Checks the record-count attribute, that the embedded schema equals the one parsed from
 * {@code src/test/resources/user.avsc}, and that each record, in order, has the expected
 * {@code name} and {@code favorite_number} while {@code favorite_color} and {@code scale}
 * are null. Finally asserts that no further records remain.
 *
 * @param expectedRecords the expected field values, one map per record, in output order
 * @param resultFlowFile the flow file whose content is an Avro data file
 * @throws IOException if the Avro content or the schema file cannot be read
 */
private void assertOutputAvroRecords(List<Map<String, Object>> expectedRecords, MockFlowFile resultFlowFile) throws IOException {
    final String expectedCount = String.valueOf(expectedRecords.size());
    assertEquals(expectedCount, resultFlowFile.getAttribute(PutHiveStreaming.HIVE_STREAMING_RECORD_COUNT_ATTR));

    final ByteArrayInputStream avroBytes = new ByteArrayInputStream(resultFlowFile.toByteArray());
    final DataFileStream<GenericRecord> avroStream =
            new DataFileStream<>(avroBytes, new GenericDatumReader<GenericRecord>());

    // The schema embedded in the output must match the schema file on disk.
    final Schema embeddedSchema = avroStream.getSchema();
    final Schema schemaOnDisk = new Schema.Parser().parse(new File("src/test/resources/user.avsc"));
    assertTrue(embeddedSchema.equals(schemaOnDisk));

    // The same record instance is handed back to the reader for reuse on each iteration.
    GenericRecord current = null;
    for (final Map<String, Object> expected : expectedRecords) {
        assertTrue(avroStream.hasNext());
        current = avroStream.next(current);

        final String actualName = current.get("name").toString();
        final Integer actualNumber = (Integer) current.get("favorite_number");
        assertNotNull(actualName);
        assertNotNull(actualNumber);
        assertNull(current.get("favorite_color"));
        assertNull(current.get("scale"));

        assertEquals(expected.get("name"), actualName);
        assertEquals(expected.get("favorite_number"), actualNumber);
    }

    assertFalse(avroStream.hasNext());
}
 
Example 5
Source File: TestPutHive3Streaming.java    From nifi with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies the Avro data file held by {@code resultFlowFile} against
 * {@code expectedRecords}: the record-count attribute, the embedded schema (compared to
 * {@code src/test/resources/user.avsc}), and, record by record in order, the
 * {@code name} and {@code favorite_number} values. {@code favorite_color} and
 * {@code scale} must be null on every record, and no extra records may follow.
 *
 * @param expectedRecords the expected field values, one map per record, in output order
 * @param resultFlowFile the flow file whose content is an Avro data file
 * @throws IOException if the Avro content or the schema file cannot be read
 */
private void assertOutputAvroRecords(List<Map<String, Object>> expectedRecords, MockFlowFile resultFlowFile) throws IOException {
    final String recordCountAttr = resultFlowFile.getAttribute(PutHive3Streaming.HIVE_STREAMING_RECORD_COUNT_ATTR);
    assertEquals(String.valueOf(expectedRecords.size()), recordCountAttr);

    final byte[] content = resultFlowFile.toByteArray();
    final DataFileStream<GenericRecord> avroStream =
            new DataFileStream<>(new ByteArrayInputStream(content), new GenericDatumReader<>());

    // Verify that the schema is preserved
    final Schema embeddedSchema = avroStream.getSchema();
    final File schemaFile = new File("src/test/resources/user.avsc");
    assertEquals(embeddedSchema, new Schema.Parser().parse(schemaFile));

    GenericRecord current = null;
    for (final Map<String, Object> expected : expectedRecords) {
        assertTrue(avroStream.hasNext());
        current = avroStream.next(current);

        final String actualName = current.get("name").toString();
        final Integer actualNumber = (Integer) current.get("favorite_number");
        assertNotNull(actualName);
        assertNotNull(actualNumber);
        assertNull(current.get("favorite_color"));
        assertNull(current.get("scale"));

        assertEquals(expected.get("name"), actualName);
        assertEquals(expected.get("favorite_number"), actualNumber);
    }

    assertFalse(avroStream.hasNext());
}
 
Example 6
Source File: Schemas.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the schema embedded in an Avro data file supplied as a stream.
 *
 * <p>If the Avro stream was created it is closed before returning, on both the success
 * and the failure path.
 *
 * @param in a stream positioned at the start of an Avro data file
 * @return the schema stored in the file header
 * @throws IOException if the header cannot be read, or closing fails after a successful read
 */
public static Schema fromAvro(InputStream in) throws IOException {
  DataFileStream<GenericRecord> avroStream = null;
  // Stays true until the schema has been read; passed to Closeables.close so that a
  // close-time IOException does not mask an exception already propagating.
  boolean failed = true;

  try {
    avroStream = new DataFileStream<GenericRecord>(in, new GenericDatumReader<GenericRecord>());
    final Schema embedded = avroStream.getSchema();
    failed = false;
    return embedded;
  } finally {
    Closeables.close(avroStream, failed);
  }
}
 
Example 7
Source File: TestWriteAvroResultWithSchema.java    From nifi with Apache License 2.0 5 votes vote down vote up
/**
 * Reads {@code recordCount} records from an Avro data file stream, using the schema
 * embedded in the stream (marked to use {@code StringType.String}). The {@code schema}
 * argument is not consulted, and the stream is not closed here.
 *
 * @param in the stream containing an Avro data file
 * @param schema unused by this implementation
 * @param recordCount the number of records to read
 * @return the records read, in stream order
 * @throws IOException if the Avro header cannot be read
 */
@Override
protected List<GenericRecord> readRecords(final InputStream in, final Schema schema, final int recordCount) throws IOException {
    final DataFileStream<GenericRecord> avroStream = new DataFileStream<>(in, new GenericDatumReader<>());
    GenericData.setStringType(avroStream.getSchema(), StringType.String);

    final List<GenericRecord> result = new ArrayList<>();
    while (result.size() < recordCount) {
        result.add(avroStream.next());
    }
    return result;
}
 
Example 8
Source File: TestWriteAvroResultWithSchema.java    From nifi with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the first record from an Avro data file stream, using the schema embedded in
 * the stream (marked to use {@code StringType.String}). The {@code schema} argument is
 * not consulted, and the stream is not closed here.
 *
 * @param in the stream containing an Avro data file
 * @param schema unused by this implementation
 * @return the first record in the stream
 * @throws IOException if the Avro header cannot be read
 */
@Override
protected GenericRecord readRecord(final InputStream in, final Schema schema) throws IOException {
    final DataFileStream<GenericRecord> avroStream = new DataFileStream<>(in, new GenericDatumReader<>());
    GenericData.setStringType(avroStream.getSchema(), StringType.String);
    return avroStream.next();
}
 
Example 9
Source File: EmbeddedAvroSchemaAccessStrategy.java    From nifi with Apache License 2.0 5 votes vote down vote up
/**
 * Derives the record schema from the Avro schema embedded in the content stream.
 * Neither {@code variables} nor {@code readSchema} is consulted, and the content stream
 * is not closed here.
 *
 * @param variables unused by this implementation
 * @param contentStream a stream containing an Avro data file
 * @param readSchema unused by this implementation
 * @return the record schema created from the embedded Avro schema
 * @throws SchemaNotFoundException declared by the overridden method; not thrown directly here
 * @throws IOException if the Avro header cannot be read
 */
@Override
public RecordSchema getSchema(Map<String, String> variables, final InputStream contentStream, final RecordSchema readSchema) throws SchemaNotFoundException, IOException {
    final GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
    final DataFileStream<GenericRecord> avroStream = new DataFileStream<>(contentStream, datumReader);
    return AvroTypeUtil.createSchema(avroStream.getSchema());
}
 
Example 10
Source File: BlocksTest.java    From incubator-pinot with Apache License 2.0 5 votes vote down vote up
/**
 * One-time setup: removes any existing index directory, builds a segment from the
 * {@code AVRO_DATA} resource into {@code INDEX_DIR} with {@code daysSinceEpoch} as the
 * time column, then reads the column names from the Avro file's schema.
 *
 * @throws Exception if the resource cannot be located, the segment build fails, or the
 *     Avro file cannot be read
 */
@BeforeClass
  public static void before()
      throws Exception {
    final String filePath = TestUtils.getFileFromResourceUrl(BlocksTest.class.getClassLoader().getResource(AVRO_DATA));
    if (INDEX_DIR.exists()) {
      FileUtils.deleteQuietly(INDEX_DIR);
    }

    final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);

    final SegmentGeneratorConfig config = SegmentTestUtils
        .getSegmentGenSpecWithSchemAndProjectedColumns(new File(filePath), INDEX_DIR, "daysSinceEpoch", TimeUnit.DAYS,
            "test");
    config.setTimeColumnName("daysSinceEpoch");
    driver.init(config);
    driver.build();

    final DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(new File(filePath));
    try {
      final org.apache.avro.Schema avroSchema = avroReader.getSchema();
      // NOTE(review): columns is populated but never read within this method.
      final String[] columns = new String[avroSchema.getFields().size()];
      int i = 0;
      for (final Field f : avroSchema.getFields()) {
        columns[i] = f.name();
        i++;
      }
    } finally {
      // Release the reader opened on the data file; previously it was never closed.
      avroReader.close();
    }
  }
 
Example 11
Source File: TestHDFSEventSink.java    From mt-flume with Apache License 2.0 5 votes vote down vote up
/**
 * Reads every Avro file under {@code dir} whose name starts with {@code prefix}, decodes
 * each record's {@code body} field as UTF-8, and removes it from {@code bodies}. Asserts
 * that {@code bodies} is empty afterwards, i.e. every expected body was found.
 *
 * @param fs the file system used to open the output files
 * @param conf not used by this method
 * @param dir the directory whose files are scanned
 * @param prefix only files whose name starts with this prefix are read
 * @param bodies the expected event bodies; matched entries are removed from this list
 * @throws IOException if a file cannot be opened, read, or decoded
 */
private void verifyOutputAvroFiles(FileSystem fs, Configuration conf, String dir, String prefix, List<String> bodies) throws IOException {
  int found = 0;
  int expected = bodies.size();
  for(String outputFile : getAllFiles(dir)) {
    String name = (new File(outputFile)).getName();
    if(!name.startsWith(prefix)) {
      continue;
    }
    FSDataInputStream input = fs.open(new Path(outputFile));
    try {
      DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
      DataFileStream<GenericRecord> avroStream =
          new DataFileStream<GenericRecord>(input, reader);
      try {
        GenericRecord record = new GenericData.Record(avroStream.getSchema());
        while (avroStream.hasNext()) {
          avroStream.next(record);
          ByteBuffer body = (ByteBuffer) record.get("body");
          CharsetDecoder decoder = Charsets.UTF_8.newDecoder();
          String bodyStr = decoder.decode(body).toString();
          LOG.debug("Removing event: {}", bodyStr);
          bodies.remove(bodyStr);
          found++;
        }
      } finally {
        avroStream.close();
      }
    } finally {
      // Runs even when the Avro header or a record cannot be read, so the file handle
      // is released before the failure propagates.
      input.close();
    }
  }
  Assert.assertTrue("Found = " + found + ", Expected = "  +
      expected + ", Left = " + bodies.size() + " " + bodies,
        bodies.size() == 0);
}
 
Example 12
Source File: PathUtils.java    From datafu with Apache License 2.0 5 votes vote down vote up
/**
 * Gets the schema from a given Avro data file.
 * 
 * @param fs the filesystem
 * @param path path to get schema from
 * @return The schema read from the data file's metadata.
 * @throws IOException if the file cannot be opened or its Avro header cannot be read
 */
public static Schema getSchemaFromFile(FileSystem fs, Path path) throws IOException
{
  FSDataInputStream dataInputStream = fs.open(path);
  DataFileStream<GenericRecord> dataFileStream = null;
  try
  {
    DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    // The constructor reads the file header and may throw, so it must sit inside the try.
    dataFileStream = new DataFileStream<GenericRecord>(dataInputStream, reader);
    return dataFileStream.getSchema();
  }
  finally
  {
    if (dataFileStream != null)
    {
      // Closing the Avro stream also closes the stream it wraps.
      dataFileStream.close();
    }
    else
    {
      // Construction failed: release the raw stream directly.
      dataInputStream.close();
    }
  }
}
 
Example 13
Source File: AvroStorage.java    From spork with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the Avro schema found at the specified locations. Each entry of {@code p} is
 * expanded as a glob pattern; the first file found by a depth-first search over the
 * matches supplies the schema.
 *
 * @param p locations (files, directories or glob patterns) to search
 * @param job Hadoop job object whose configuration is used to obtain the file system
 * @return an Avro Schema object derived from the file that was found
 * @throws IOException if no paths are given, nothing matches, no file is found under the
 *     matches, or the file cannot be read as an Avro data file
 */
public Schema getAvroSchema(final Path[] p, final Job job) throws IOException {
  if (p == null || p.length == 0) {
    throw new IOException("No input paths were specified.");
  }

  GenericDatumReader<Object> avroReader = new GenericDatumReader<Object>();
  ArrayList<FileStatus> statusList = new ArrayList<FileStatus>();
  FileSystem fs = FileSystem.get(p[0].toUri(), job.getConfiguration());
  for (Path temp : p) {
    // globStatus returns null for a non-glob path that does not exist; treat it as
    // "no match" instead of failing with a NullPointerException.
    FileStatus[] matches = fs.globStatus(temp);
    if (matches != null) {
      for (FileStatus tempf : matches) {
        statusList.add(tempf);
      }
    }
  }

  // Arrays.toString renders the actual paths; Path[].toString() would only print the
  // array's identity hash.
  if (statusList.isEmpty()) {
    throw new IOException("No path matches pattern " + java.util.Arrays.toString(p));
  }
  FileStatus[] statusArray = statusList.toArray(new FileStatus[statusList.size()]);

  Path filePath = Utils.depthFirstSearchForFile(statusArray, fs);

  if (filePath == null) {
    throw new IOException("No path matches pattern " + java.util.Arrays.toString(p));
  }

  InputStream hdfsInputStream = fs.open(filePath);
  DataFileStream<Object> avroDataStream = null;
  try {
    // The constructor reads the file header and may throw, e.g. for a non-Avro file.
    avroDataStream = new DataFileStream<Object>(hdfsInputStream, avroReader);
    return avroDataStream.getSchema();
  } finally {
    if (avroDataStream != null) {
      // Closing the Avro stream also closes the stream it wraps.
      avroDataStream.close();
    } else {
      hdfsInputStream.close();
    }
  }
}
 
Example 14
Source File: Schemas.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the schema stored in the header of an Avro data file read from {@code in}.
 *
 * <p>If the Avro stream was created it is closed before this method returns, whether or
 * not reading succeeded.
 *
 * @param in a stream positioned at the start of an Avro data file
 * @return the schema stored in the file header
 * @throws IOException if the header cannot be read, or closing fails after a successful read
 */
public static Schema fromAvro(InputStream in) throws IOException {
  DataFileStream<GenericRecord> avroStream = null;
  // Cleared only once the schema is in hand; while set, Closeables.close is told to
  // swallow a close-time IOException so it cannot hide the original failure.
  boolean swallowCloseFailure = true;

  try {
    final GenericDatumReader<GenericRecord> recordReader = new GenericDatumReader<GenericRecord>();
    avroStream = new DataFileStream<>(in, recordReader);
    final Schema headerSchema = avroStream.getSchema();
    swallowCloseFailure = false;
    return headerSchema;
  } finally {
    Closeables.close(avroStream, swallowCloseFailure);
  }
}
 
Example 15
Source File: AvroUtils.java    From ml-ease with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the schema from an Avro data file.
 * 
 * @param conf The JobConf. NOTE(review): currently unused; the file system is resolved
 *     with a fresh {@code Configuration} instead. Confirm whether that is intended.
 * @param path The path to the data file.
 * @return The schema read from the data file's metadata.
 * @throws IOException if the file cannot be opened or its Avro header cannot be read
 */
public static Schema getSchemaFromFile(JobConf conf, Path path) throws IOException
{
  FileSystem fs = path.getFileSystem(new Configuration());
  FSDataInputStream dataInputStream = fs.open(path);
  DataFileStream<GenericRecord> dataFileStream = null;
  try
  {
    DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    dataFileStream = new DataFileStream<GenericRecord>(dataInputStream, reader);
    return dataFileStream.getSchema();
  }
  finally
  {
    // Previously the stream was never closed, leaking one file handle per call.
    if (dataFileStream != null)
    {
      // Closing the Avro stream also closes the stream it wraps.
      dataFileStream.close();
    }
    else
    {
      // Construction failed: release the raw stream directly.
      dataInputStream.close();
    }
  }
}
 
Example 16
Source File: SplitAvro.java    From localization_nifi with Apache License 2.0 4 votes vote down vote up
/**
 * Prepares this writer for output: creates a datum writer for the reader's schema and a
 * binary encoder over {@code out}.
 *
 * @param reader the Avro data file stream whose schema the datum writer will use
 * @param codec not used by this implementation
 * @param out the stream the binary encoder writes to
 * @throws IOException declared by the overridden method
 */
@Override
public void init(final DataFileStream<GenericRecord> reader, final String codec, final OutputStream out) throws IOException {
    writer = new GenericDatumWriter<>(reader.getSchema());
    // Passing null as the second argument means no existing encoder is offered for reuse.
    encoder = EncoderFactory.get().binaryEncoder(out, null);
}
 
Example 17
Source File: TestExecuteSQLRecord.java    From nifi with Apache License 2.0 4 votes vote down vote up
/**
 * Verifies that BLOB, CLOB and NCLOB columns are written correctly when
 * {@code ExecuteSQLRecord} emits its result through an {@code AvroRecordSetWriter}.
 * A one-row table is created, selected, and the resulting flow file is read back as an
 * Avro data file to check the {@code IMAGE}, {@code WORDS} and {@code NATWORDS} fields.
 */
@Test
public void testWriteLOBsToAvro() throws Exception {
    // Register and enable the database connection pool service used by the processor.
    final DBCPService dbcp = new DBCPServiceSimpleImpl("h2");
    final Map<String, String> dbcpProperties = new HashMap<>();

    runner = TestRunners.newTestRunner(ExecuteSQLRecord.class);
    runner.addControllerService("dbcp", dbcp, dbcpProperties);
    runner.enableControllerService(dbcp);
    runner.setProperty(AbstractExecuteSQL.DBCP_SERVICE, "dbcp");

    // remove previous test database, if any
    final File dbLocation = new File(DB_LOCATION);
    dbLocation.delete();

    // load test data to database
    final Connection con = ((DBCPService) runner.getControllerService("dbcp")).getConnection();
    Statement stmt = con.createStatement();

    try {
        stmt.execute("drop table TEST_NULL_INT");
    } catch (final SQLException sqle) {
        // Deliberately ignored: the drop is best-effort cleanup (presumably the table
        // may not exist yet on a fresh database).
    }

    // One row: a 4-byte BLOB (0xDEADBEEF), a CLOB and an NCLOB alongside integer columns.
    stmt.execute("create table TEST_NULL_INT (id integer not null, val1 integer, val2 integer, image blob(1K), words clob(1K), "
            + "natwords nclob(1K), constraint my_pk primary key (id))");
    stmt.execute("insert into TEST_NULL_INT (id, val1, val2, image, words, natwords) VALUES (0, NULL, 1, CAST (X'DEADBEEF' AS BLOB), "
            + "CAST ('Hello World' AS CLOB), CAST ('I am an NCLOB' AS NCLOB))");

    // Run the processor without an incoming connection, writing records through an Avro
    // record writer that inherits the record schema.
    runner.setIncomingConnection(false);
    runner.setProperty(AbstractExecuteSQL.SQL_SELECT_QUERY, "select * from TEST_NULL_INT");
    AvroRecordSetWriter recordWriter = new AvroRecordSetWriter();
    runner.addControllerService("writer", recordWriter);
    runner.setProperty(recordWriter, SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaAccessUtils.INHERIT_RECORD_SCHEMA);
    runner.setProperty(ExecuteSQLRecord.RECORD_WRITER_FACTORY, "writer");
    runner.enableControllerService(recordWriter);
    runner.run();

    // Exactly one flow file, reporting one result row, must reach the success relationship.
    runner.assertAllFlowFilesTransferred(AbstractExecuteSQL.REL_SUCCESS, 1);
    MockFlowFile flowFile = runner.getFlowFilesForRelationship(AbstractExecuteSQL.REL_SUCCESS).get(0);
    flowFile.assertAttributeEquals(AbstractExecuteSQL.RESULT_ROW_COUNT, "1");

    // Read the flow file content back as an Avro data file and take its first record.
    ByteArrayInputStream bais = new ByteArrayInputStream(flowFile.toByteArray());
    final DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(bais, new GenericDatumReader<>());
    final Schema avroSchema = dataFileStream.getSchema();
    GenericData.setStringType(avroSchema, GenericData.StringType.String);
    final GenericRecord avroRecord = dataFileStream.next();

    // The BLOB must come back as a ByteBuffer holding the four inserted bytes.
    Object imageObj = avroRecord.get("IMAGE");
    assertNotNull(imageObj);
    assertTrue(imageObj instanceof ByteBuffer);
    assertArrayEquals(new byte[]{(byte) 0xDE, (byte) 0xAD, (byte) 0xBE, (byte) 0xEF}, ((ByteBuffer) imageObj).array());

    // The CLOB must come back as a Utf8 value with the inserted text.
    Object wordsObj = avroRecord.get("WORDS");
    assertNotNull(wordsObj);
    assertTrue(wordsObj instanceof Utf8);
    assertEquals("Hello World", wordsObj.toString());

    // The NCLOB must come back as a Utf8 value with the inserted text.
    Object natwordsObj = avroRecord.get("NATWORDS");
    assertNotNull(natwordsObj);
    assertTrue(natwordsObj instanceof Utf8);
    assertEquals("I am an NCLOB", natwordsObj.toString());
}
 
Example 18
Source File: WholeFileTransformerProcessor.java    From datacollector with Apache License 2.0 4 votes vote down vote up
/**
 * Converts every record of an Avro data file to Parquet, writing the result to a
 * temporary file. The Parquet writer is configured from {@code jobConfig.avroParquetConfig}
 * (compression codec, row group size, page size, dictionary page size, max padding size)
 * and uses the schema embedded in the Avro file.
 *
 * @param sourceFileName the source Avro file name, used for logging and error reporting
 * @param fileReader the {@link org.apache.avro.file.DataFileStream} Avro file reader
 * @param tempParquetFile the {@link java.nio.file.Path} temporary parquet file path
 * @throws StageException if reading the Avro records or writing the Parquet file fails
 *     with an {@code IOException}; the error carries the file name and the number of
 *     records written so far
 */
private void writeParquet(String sourceFileName, DataFileStream<GenericRecord> fileReader, Path tempParquetFile) throws StageException {
  final Schema avroSchema = fileReader.getSchema();
  // Declared outside the try block because the catch clause and the final log both report it.
  long recordCount = 0;

  LOG.debug("Start reading input file : {}", sourceFileName);
  try {
    // Collect the Parquet writer settings from the stage configuration.
    final Configuration parquetConf = new Configuration();
    parquetConf.set(
        AvroParquetConstants.COMPRESSION_CODEC_NAME,
        compressionElEval.eval(variables, jobConfig.avroParquetConfig.compressionCodec, String.class)
    );
    parquetConf.setInt(AvroParquetConstants.ROW_GROUP_SIZE, jobConfig.avroParquetConfig.rowGroupSize);
    parquetConf.setInt(AvroParquetConstants.PAGE_SIZE, jobConfig.avroParquetConfig.pageSize);
    parquetConf.setInt(AvroParquetConstants.DICTIONARY_PAGE_SIZE, jobConfig.avroParquetConfig.dictionaryPageSize);
    parquetConf.setInt(AvroParquetConstants.MAX_PADDING_SIZE, jobConfig.avroParquetConfig.maxPaddingSize);

    // Build the writer targeting the temporary file.
    final org.apache.hadoop.fs.Path target = new org.apache.hadoop.fs.Path(tempParquetFile.toString());
    ParquetWriter.Builder builder = AvroToParquetConverterUtil.initializeWriter(target, avroSchema, parquetConf);
    parquetWriter = builder.build();

    // Copy the records one by one, counting each successful write.
    while (fileReader.hasNext()) {
      parquetWriter.write(fileReader.next());
      recordCount++;
    }
    parquetWriter.close();

  } catch (IOException ex) {
    throw new TransformerStageCheckedException(Errors.CONVERT_08, sourceFileName, recordCount, ex);
  }
  LOG.debug("Finished writing {} records to {}", recordCount, tempParquetFile.getFileName());
}
 
Example 19
Source File: TestPutHiveStreaming.java    From localization_nifi with Apache License 2.0 4 votes vote down vote up
/**
 * Enqueues a single Avro flow file containing three user records and verifies that
 * {@code PutHiveStreaming} transfers one flow file to success whose content keeps the
 * original schema and holds exactly three records.
 */
@Test
public void onTriggerMultipleRecords() throws Exception {
    runner.setProperty(PutHiveStreaming.METASTORE_URI, "thrift://localhost:9083");
    runner.setProperty(PutHiveStreaming.DB_NAME, "default");
    runner.setProperty(PutHiveStreaming.TABLE_NAME, "users");
    runner.setProperty(PutHiveStreaming.TXNS_PER_BATCH, "2");
    runner.setValidateExpressionUsage(false);

    // Plain maps instead of double-brace initialization, which would create an anonymous
    // HashMap subclass per record, each holding a reference to this test instance.
    final Map<String, Object> user1 = new HashMap<>();
    user1.put("name", "Joe");
    user1.put("favorite_number", 146);

    final Map<String, Object> user2 = new HashMap<>();
    user2.put("name", "Mary");
    user2.put("favorite_number", 42);

    final Map<String, Object> user3 = new HashMap<>();
    user3.put("name", "Matt");
    user3.put("favorite_number", 3);

    runner.enqueue(createAvroRecord(Arrays.asList(user1, user2, user3)));
    runner.run();

    runner.assertTransferCount(PutHiveStreaming.REL_SUCCESS, 1);
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(PutHiveStreaming.REL_SUCCESS).get(0);
    assertNotNull(resultFlowFile);
    assertEquals("3", resultFlowFile.getAttribute(PutHiveStreaming.HIVE_STREAMING_RECORD_COUNT_ATTR));
    final DataFileStream<GenericRecord> reader = new DataFileStream<>(
            new ByteArrayInputStream(resultFlowFile.toByteArray()),
            new GenericDatumReader<GenericRecord>());

    Schema schema = reader.getSchema();

    // Verify that the schema is preserved; assertEquals reports both schemas on failure.
    assertEquals(new Schema.Parser().parse(new File("src/test/resources/user.avsc")), schema);

    // Verify the records are intact. We can't guarantee order so check the total number and non-null fields
    assertTrue(reader.hasNext());
    GenericRecord record = reader.next(null);
    assertNotNull(record.get("name"));
    assertNotNull(record.get("favorite_number"));
    assertNull(record.get("favorite_color"));
    assertNull(record.get("scale"));
    assertTrue(reader.hasNext());
    record = reader.next(record);
    assertTrue(reader.hasNext());
    reader.next(record);
    assertFalse(reader.hasNext());
}
 
Example 20
Source File: ThirdeyeAvroUtils.java    From incubator-pinot with Apache License 2.0 3 votes vote down vote up
/**
 * Extracts the Avro schema embedded in an Avro data file. The reader obtained from
 * {@code getAvroReader} is closed before the schema is returned.
 *
 * @param avroFile the Avro data file to read the schema from
 * @return the schema reported by the file's reader
 * @throws IOException if the reader cannot be obtained or closed
 */
public static Schema extractSchemaFromAvro(Path avroFile) throws IOException {
  final DataFileStream<GenericRecord> avroStream = getAvroReader(avroFile);
  final Schema embeddedSchema = avroStream.getSchema();
  avroStream.close();
  return embeddedSchema;
}