Java Code Examples for org.apache.avro.file.DataFileStream

The following examples show how to use org.apache.avro.file.DataFileStream. These examples are extracted from open source projects; the originating project and source file are noted above each example where available.
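Before the project-specific examples, here is a minimal, self-contained sketch of the typical DataFileStream read pattern. The file name "users.avro" is an illustrative assumption; any Avro container file works.

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class ReadAvroFileSketch {
  public static void main(String[] args) throws IOException {
    File avroFile = new File("users.avro");  // illustrative path, not taken from the examples below
    try (DataFileStream<GenericRecord> reader =
        new DataFileStream<>(new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>())) {
      System.out.println("Schema: " + reader.getSchema());
      GenericRecord record = null;
      while (reader.hasNext()) {
        // Reuse the record instance to limit garbage, as several examples below also do.
        record = reader.next(record);
        System.out.println(record);
      }
    }
  }
}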
Example 1
/**
 * Finds the Avro file in the input folder and returns its Avro schema.
 * @param inputPathDir Path to the input directory
 * @return Input schema
 * @throws IOException if an I/O error occurs while reading the file
 */
private Schema getSchema(Path inputPathDir)
    throws IOException {
  FileSystem fs = FileSystem.get(new Configuration());
  Schema avroSchema = null;
  for (FileStatus fileStatus : fs.listStatus(inputPathDir)) {
    if (fileStatus.isFile() && fileStatus.getPath().getName().endsWith(".avro")) {
      _logger.info("Extracting schema from " + fileStatus.getPath());
      try (DataFileStream<GenericRecord> dataStreamReader = getAvroReader(fileStatus.getPath())) {
        avroSchema = dataStreamReader.getSchema();
      }
      break;
    }
  }
  return avroSchema;
}
 
Example 2
private LongColumnPreIndexStatsCollector getTimeColumnStatsCollector(Schema schema, File localAvroFile)
    throws FileNotFoundException, IOException {
  String timeColumnName = schema.getTimeColumnName();
  FieldSpec spec =  schema.getTimeFieldSpec();
  LOGGER.info("Spec for " + timeColumnName + " is " + spec);
  LongColumnPreIndexStatsCollector timeColumnStatisticsCollector = new LongColumnPreIndexStatsCollector(spec.getName(), new StatsCollectorConfig(schema, null));
  LOGGER.info("StatsCollector :" + timeColumnStatisticsCollector);
  try (DataFileStream<GenericRecord> dataStream =
      new DataFileStream<GenericRecord>(new FileInputStream(localAvroFile), new GenericDatumReader<GenericRecord>())) {
    while (dataStream.hasNext()) {
      GenericRecord next = dataStream.next();
      timeColumnStatisticsCollector.collect(next.get(timeColumnName));
    }
  }
  timeColumnStatisticsCollector.seal();

  return timeColumnStatisticsCollector;
}
 
Example 3
Source Project: localization_nifi   Source File: TestSplitAvro.java    License: Apache License 2.0
@Test
public void testRecordSplitDatafileOutputWithoutMetadata() throws IOException {
    final TestRunner runner = TestRunners.newTestRunner(new SplitAvro());
    runner.setProperty(SplitAvro.TRANSFER_METADATA, "false");

    runner.enqueue(users.toByteArray());
    runner.run();

    runner.assertTransferCount(SplitAvro.REL_SPLIT, 100);
    runner.assertTransferCount(SplitAvro.REL_ORIGINAL, 1);
    runner.assertTransferCount(SplitAvro.REL_FAILURE, 0);

    runner.getFlowFilesForRelationship(SplitAvro.REL_ORIGINAL).get(0).assertAttributeEquals(FRAGMENT_COUNT.key(), "100");
    final List<MockFlowFile> flowFiles = runner.getFlowFilesForRelationship(SplitAvro.REL_SPLIT);
    checkDataFileSplitSize(flowFiles, 1, false);

    for (final MockFlowFile flowFile : flowFiles) {
        try (final ByteArrayInputStream in = new ByteArrayInputStream(flowFile.toByteArray());
             final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY1));
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY2));
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY3));
        }
    }
}
 
Example 4
Source Project: nifi   Source File: TestSelectHive3QL.java    License: Apache License 2.0
private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }

        return recordsFromStream;
    }
}
 
Example 5
Source Project: localization_nifi   Source File: PutHiveStreaming.java    License: Apache License 2.0
private void appendRecordsToFlowFile(ProcessSession session,
                                     List<HiveStreamingRecord> records,
                                     AtomicReference<FlowFile> appendFlowFile,
                                     DataFileWriter<GenericRecord> avroWriter,
                                     DataFileStream<GenericRecord> reader) throws IOException {

    appendFlowFile.set(session.append(appendFlowFile.get(), (out) -> {

        try (DataFileWriter<GenericRecord> writer = avroWriter.create(reader.getSchema(), out)) {
            for (HiveStreamingRecord sRecord : records) {
                writer.append(sRecord.getRecord());
            }
            writer.flush();
        }
    }));
}
 
Example 6
Source Project: nifi   Source File: PutHiveStreaming.java    License: Apache License 2.0
private byte[] initAvroWriter(ProcessSession session, String codec, DataFileStream<GenericRecord> reader,
                                     DataFileWriter<GenericRecord> writer, AtomicReference<FlowFile> flowFileRef) {

    writer.setCodec(CodecFactory.fromString(codec));
    // Transfer metadata (this is a subset of the incoming file)
    for (String metaKey : reader.getMetaKeys()) {
        if (!RESERVED_METADATA.contains(metaKey)) {
            writer.setMeta(metaKey, reader.getMeta(metaKey));
        }
    }

    final ByteArrayOutputStream avroHeader = new ByteArrayOutputStream();
    flowFileRef.set(session.append(flowFileRef.get(), (out) -> {
        // Create writer so that records can be appended later.
        writer.create(reader.getSchema(), avroHeader);
        writer.close();

        final byte[] header = avroHeader.toByteArray();
        out.write(header);
    }));

    // Capture the Avro header byte array that is just written to the FlowFile.
    // This is needed when Avro records are appended to the same FlowFile.
    return avroHeader.toByteArray();
}
 
Example 7
private DataFileStream<Object> createNestedDataFileStream() throws Exception {
  DatumReader<Object> datumReader;
  if ( useFieldAsInputStream ) {
    datumReader = new GenericDatumReader<Object>();
    inputStream.reset();
    return new DataFileStream<Object>( inputStream, datumReader );
  }
  if ( schemaFileName != null && schemaFileName.length() > 0 ) {
    Schema schema = new Schema.Parser().parse( KettleVFS.getInputStream( schemaFileName, variableSpace ) );
    datumReader = new GenericDatumReader<Object>( schema );
  } else {
    datumReader = new GenericDatumReader<Object>();
  }
  FileObject fileObject = KettleVFS.getFileObject( fileName, variableSpace );
  if ( fileObject.isFile() ) {
    this.inputStream = fileObject.getContent().getInputStream();
    return new DataFileStream<>( inputStream, datumReader );
  } else {
    FileObject[] avroFiles = fileObject.findFiles( new FileExtensionSelector( "avro" ) );
    if ( !Utils.isEmpty( avroFiles ) ) {
      this.inputStream = avroFiles[ 0 ].getContent().getInputStream();
      return new DataFileStream<>( inputStream, datumReader );
    }
    return null;
  }
}
 
Example 8
Source Project: nifi   Source File: TestJdbcCommonConvertToAvro.java    License: Apache License 2.0
@Test
public void testConvertToAvroStreamForNumbers() throws SQLException, IOException {
    final ResultSetMetaData metadata = mock(ResultSetMetaData.class);
    when(metadata.getColumnCount()).thenReturn(1);
    when(metadata.getColumnType(1)).thenReturn(testParams.sqlType);
    when(metadata.isSigned(1)).thenReturn(testParams.signed);
    when(metadata.getPrecision(1)).thenReturn(testParams.precision);
    when(metadata.getColumnName(1)).thenReturn("t_int");
    when(metadata.getTableName(1)).thenReturn("table");

    final ResultSet rs = JdbcCommonTestUtils.resultSetReturningMetadata(metadata);

    final int ret = 0;
    when(rs.getObject(Mockito.anyInt())).thenReturn(ret);

    final InputStream instream = JdbcCommonTestUtils.convertResultSetToAvroInputStream(rs);

    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (final DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(instream, datumReader)) {
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            record = dataFileReader.next(record);
            assertEquals(Integer.toString(ret), record.get("t_int").toString());
        }
    }
}
 
Example 9
Source Project: Cubert   Source File: AvroStorageUtils.java    License: Apache License 2.0
/**
 * This method is called by {@link #getAvroSchema}. The default implementation
 * returns the schema of an Avro file, or the schema of the last file in a first-level
 * directory (one that contains no sub-directories).
 *
 * @param path  path of a file or first-level directory
 * @param fs  file system
 * @return avro schema
 * @throws IOException if an I/O error occurs while reading the file
 */
public static Schema getSchema(Path path, FileSystem fs) throws IOException {
    /* get path of the last file */
    Path lastFile = AvroStorageUtils.getLast(path, fs);
    if (lastFile == null) {
        return null;
    }

    /* read in file and obtain schema */
    GenericDatumReader<Object> avroReader = new GenericDatumReader<Object>();
    try (InputStream hdfsInputStream = fs.open(lastFile);
        DataFileStream<Object> avroDataStream = new DataFileStream<Object>(hdfsInputStream, avroReader)) {
        return avroDataStream.getSchema();
    }
}
 
Example 10
Source Project: nifi   Source File: TestSplitAvro.java    License: Apache License 2.0
@Test
public void testRecordSplitDatafileOutputWithoutMetadata() throws IOException {
    final TestRunner runner = TestRunners.newTestRunner(new SplitAvro());
    runner.setProperty(SplitAvro.TRANSFER_METADATA, "false");

    runner.enqueue(users.toByteArray());
    runner.run();

    runner.assertTransferCount(SplitAvro.REL_SPLIT, 100);
    runner.assertTransferCount(SplitAvro.REL_ORIGINAL, 1);
    runner.assertTransferCount(SplitAvro.REL_FAILURE, 0);

    runner.getFlowFilesForRelationship(SplitAvro.REL_ORIGINAL).get(0).assertAttributeEquals(FRAGMENT_COUNT.key(), "100");
    final List<MockFlowFile> flowFiles = runner.getFlowFilesForRelationship(SplitAvro.REL_SPLIT);
    checkDataFileSplitSize(flowFiles, 1, false);

    for (final MockFlowFile flowFile : flowFiles) {
        try (final ByteArrayInputStream in = new ByteArrayInputStream(flowFile.toByteArray());
             final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY1));
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY2));
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY3));
        }
    }
}
 
Example 11
Source Project: nifi   Source File: TestSelectHive_1_1QL.java    License: Apache License 2.0
private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }

        return recordsFromStream;
    }
}
 
Example 12
Source Project: nifi   Source File: TestSelectHiveQL.java    License: Apache License 2.0
private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }

        return recordsFromStream;
    }
}
 
Example 13
Source Project: nifi   Source File: TestJdbcCommon.java    License: Apache License 2.0
@Test
public void testConvertToAvroStreamForShort() throws SQLException, IOException {
    final ResultSetMetaData metadata = mock(ResultSetMetaData.class);
    when(metadata.getColumnCount()).thenReturn(1);
    when(metadata.getColumnType(1)).thenReturn(Types.TINYINT);
    when(metadata.getColumnName(1)).thenReturn("t_int");
    when(metadata.getTableName(1)).thenReturn("table");

    final ResultSet rs = JdbcCommonTestUtils.resultSetReturningMetadata(metadata);

    final short s = 25;
    when(rs.getObject(Mockito.anyInt())).thenReturn(s);

    final InputStream instream = JdbcCommonTestUtils.convertResultSetToAvroInputStream(rs);

    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (final DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(instream, datumReader)) {
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            record = dataFileReader.next(record);
            assertEquals(Short.toString(s), record.get("t_int").toString());
        }
    }
}
 
Example 14
Source Project: datafu   Source File: AvroDateRangeMetadata.java    License: Apache License 2.0
/**
 * Reads the date range from the metadata stored in an Avro file.
 * 
 * @param fs file system to access path
 * @param path path to get date range for
 * @return date range
 * @throws IOException if an I/O error occurs while reading the file
 */
public static DateRange getOutputFileDateRange(FileSystem fs, Path path) throws IOException
{
  path = fs.listStatus(path, PathUtils.nonHiddenPathFilter)[0].getPath();
  FSDataInputStream dataInputStream = fs.open(path);
  DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(dataInputStream, reader);
  
  try
  {
    return new DateRange(new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_START))),
                         new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_END))));
  }
  finally
  {
    dataFileStream.close();
    dataInputStream.close();
  }
}
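For context, the METADATA_DATE_START and METADATA_DATE_END entries read above via getMetaString would have been written into the file header with DataFileWriter.setMeta before any records were appended. Below is a minimal hedged sketch of that writer side; the key names "date.start" and "date.end" and the helper method are illustrative assumptions, not datafu's actual constants.

import java.io.File;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class WriteDateRangeMetadataSketch {
  public static void writeWithDateRange(Schema schema, File outputFile, long startMillis, long endMillis)
      throws IOException {
    try (DataFileWriter<GenericRecord> writer =
        new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
      // Metadata must be set before create(); a reader can later fetch it with getMetaString().
      writer.setMeta("date.start", Long.toString(startMillis));  // assumed key name
      writer.setMeta("date.end", Long.toString(endMillis));      // assumed key name
      writer.create(schema, outputFile);
      // append GenericRecords here with writer.append(record)
    }
  }
}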
 
Example 15
Source Project: datafu   Source File: Examples.java    License: Apache License 2.0
private Long loadMemberCount(Path path, String timestamp) throws IOException
{
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(path, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(path,timestamp + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      GenericRecord r = dataFileStream.next();
      Long count = (Long)((GenericRecord)r.get("value")).get("count");   
      Assert.assertNotNull(count);       
      System.out.println("found count: " + count);
      return count;
    }
    finally
    {
      dataFileStream.close();
    }
  }
  throw new RuntimeException("found no data");
}
 
Example 16
Source Project: nifi   Source File: QueryDatabaseTableTest.java    License: Apache License 2.0
private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }

        return recordsFromStream;
    }
}
 
Example 17
Source Project: presto   Source File: AvroRowDecoder.java    License: Apache License 2.0
private void closeQuietly(DataFileStream<GenericRecord> stream)
{
    try {
        if (stream != null) {
            stream.close();
        }
    }
    catch (IOException ignored) {
    }
}
 
Example 18
public PentahoAvroRecordReader( DataFileStream<GenericRecord> nativeAvroRecordReader,
                                Schema avroSchema, List<? extends IAvroInputField> fields ) {
  this.nativeAvroRecordReader = nativeAvroRecordReader;
  this.avroSchema = avroSchema;
  this.legacySchema = isLegacySchema( avroSchema );
  this.fields = fields;
}
 
Example 19
Source Project: localization_nifi   Source File: TestJdbcCommon.java    License: Apache License 2.0
@Test
public void testConvertToBytes() throws ClassNotFoundException, SQLException, IOException {
    final Statement st = con.createStatement();
    st.executeUpdate("insert into restaurants values (1, 'Irifunes', 'San Mateo')");
    st.executeUpdate("insert into restaurants values (2, 'Estradas', 'Daly City')");
    st.executeUpdate("insert into restaurants values (3, 'Prime Rib House', 'San Francisco')");

    final ResultSet resultSet = st.executeQuery("select R.*, ROW_NUMBER() OVER () as rownr from restaurants R");

    final ByteArrayOutputStream outStream = new ByteArrayOutputStream();
    JdbcCommon.convertToAvroStream(resultSet, outStream, false);

    final byte[] serializedBytes = outStream.toByteArray();
    assertNotNull(serializedBytes);
    System.out.println("Avro serialized result size in bytes: " + serializedBytes.length);

    st.close();

    // Deserialize bytes to records
    final InputStream instream = new ByteArrayInputStream(serializedBytes);

    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (final DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(instream, datumReader)) {
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            System.out.println(record);
        }
    }
}
 
Example 20
Source Project: nifi   Source File: TestWriteAvroResultWithSchema.java    License: Apache License 2.0
@Override
protected GenericRecord readRecord(final InputStream in, final Schema schema) throws IOException {
    final DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(in, new GenericDatumReader<>());
    final Schema avroSchema = dataFileStream.getSchema();
    GenericData.setStringType(avroSchema, StringType.String);
    final GenericRecord avroRecord = dataFileStream.next();

    return avroRecord;
}
 
Example 21
Source Project: incubator-pinot   Source File: AvroUtils.java    License: Apache License 2.0
/**
 * Get the Avro file reader for the given file.
 */
public static DataFileStream<GenericRecord> getAvroReader(File avroFile)
    throws IOException {
  if (avroFile.getName().endsWith(".gz")) {
    return new DataFileStream<>(new GZIPInputStream(new FileInputStream(avroFile)), new GenericDatumReader<>());
  } else {
    return new DataFileStream<>(new FileInputStream(avroFile), new GenericDatumReader<>());
  }
}
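A brief hedged usage sketch of this helper, counting the records in a possibly gzip-compressed Avro file; the surrounding method is hypothetical and only illustrates the call pattern. Note that the extension check handles whole-file gzip compression, which is separate from Avro's internal block codecs.

private static long countRecords(File avroFile)
    throws IOException {
  long count = 0;
  // AvroUtils.getAvroReader is the helper shown in Example 21 above.
  try (DataFileStream<GenericRecord> reader = AvroUtils.getAvroReader(avroFile)) {
    while (reader.hasNext()) {
      reader.next();
      count++;
    }
  }
  return count;
}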
 
Example 22
FakePartitionLevelConsumer(int partition, StreamConfig streamConfig) {

    // TODO: this logic can move to a FakeStreamProducer instead of being inside the Consumer
    File tempDir = new File(FileUtils.getTempDirectory(), getClass().getSimpleName());
    File outputDir = new File(tempDir, String.valueOf(partition));

    int offset = 0;

    try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream(65536)) {
      File avroFile = unpackAvroTarFile(outputDir).get(0);

      int numPartitions = FakeStreamConfigUtils.getNumPartitions(streamConfig);

      try (DataFileStream<GenericRecord> reader = AvroUtils.getAvroReader(avroFile)) {
        BinaryEncoder binaryEncoder = new EncoderFactory().directBinaryEncoder(outputStream, null);
        GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(reader.getSchema());

        int recordNumber = 0;
        for (GenericRecord genericRecord : reader) {
          if (getPartitionNumber(recordNumber++, numPartitions) != partition) {
            continue;
          }
          outputStream.reset();

          datumWriter.write(genericRecord, binaryEncoder);
          binaryEncoder.flush();

          byte[] bytes = outputStream.toByteArray();
          // contiguous offsets
          messageOffsets.add(offset++);
          messageBytes.add(bytes);
        }
      }
    } catch (Exception e) {
      LOGGER.error("Could not create {}", FakePartitionLevelConsumer.class.getName(), e);
    } finally {
      FileUtils.deleteQuietly(outputDir);
    }
  }
 
Example 23
Source Project: nifi   Source File: EmbeddedAvroSchemaAccessStrategy.java    License: Apache License 2.0
@Override
public RecordSchema getSchema(Map<String, String> variables, final InputStream contentStream, final RecordSchema readSchema) throws SchemaNotFoundException, IOException {
    final DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(contentStream, new GenericDatumReader<GenericRecord>());
    final Schema avroSchema = dataFileStream.getSchema();
    final RecordSchema recordSchema = AvroTypeUtil.createSchema(avroSchema);
    return recordSchema;
}
 
Example 24
Source Project: localization_nifi   Source File: SplitAvro.java    License: Apache License 2.0
@Override
public void init(final DataFileStream<GenericRecord> reader, final String codec, final OutputStream out) throws IOException {
    writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>());

    if (transferMetadata) {
        for (String metaKey : reader.getMetaKeys()) {
            if (!RESERVED_METADATA.contains(metaKey)) {
                writer.setMeta(metaKey, reader.getMeta(metaKey));
            }
        }
    }

    writer.setCodec(CodecFactory.fromString(codec));
    writer.create(reader.getSchema(), out);
}
 
Example 25
Source Project: incubator-pinot   Source File: BlocksTest.java    License: Apache License 2.0
@BeforeClass
public static void before()
    throws Exception {
  final String filePath = TestUtils.getFileFromResourceUrl(BlocksTest.class.getClassLoader().getResource(AVRO_DATA));
  if (INDEX_DIR.exists()) {
    FileUtils.deleteQuietly(INDEX_DIR);
  }

  // System.out.println(INDEX_DIR.getAbsolutePath());
  final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);

  final SegmentGeneratorConfig config = SegmentTestUtils
      .getSegmentGenSpecWithSchemAndProjectedColumns(new File(filePath), INDEX_DIR, "daysSinceEpoch", TimeUnit.DAYS,
          "test");
  config.setTimeColumnName("daysSinceEpoch");
  driver.init(config);
  driver.build();

  final DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(new File(filePath));
  final org.apache.avro.Schema avroSchema = avroReader.getSchema();
  final String[] columns = new String[avroSchema.getFields().size()];
  int i = 0;
  for (final Field f : avroSchema.getFields()) {
    columns[i] = f.name();
    i++;
  }
}
 
Example 26
Source Project: nifi   Source File: TestAvroTypeUtil.java    License: Apache License 2.0
@Test
public void testMapWithNullSchema() throws IOException {

    Schema recursiveSchema = new Schema.Parser().parse(getClass().getResourceAsStream("schema.json"));

    // Make sure the following doesn't throw an exception
    RecordSchema recordASchema = AvroTypeUtil.createSchema(recursiveSchema.getTypes().get(0));

    // check the fix with the proper file
    try (DataFileStream<GenericRecord> r = new DataFileStream<>(getClass().getResourceAsStream("data.avro"),
            new GenericDatumReader<>())) {
        GenericRecord n = r.next();
        AvroTypeUtil.convertAvroRecordToMap(n, recordASchema, StandardCharsets.UTF_8);
    }
}
 
Example 27
Source Project: nifi   Source File: SplitAvro.java    License: Apache License 2.0
@Override
public void init(final DataFileStream<GenericRecord> reader, final String codec, final OutputStream out) throws IOException {
    writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>());

    if (transferMetadata) {
        for (String metaKey : reader.getMetaKeys()) {
            if (!RESERVED_METADATA.contains(metaKey)) {
                writer.setMeta(metaKey, reader.getMeta(metaKey));
            }
        }
    }

    writer.setCodec(CodecFactory.fromString(codec));
    writer.create(reader.getSchema(), out);
}
 
Example 28
Source Project: nifi   Source File: TestPutHiveStreaming.java    License: Apache License 2.0
private void assertOutputAvroRecords(List<Map<String, Object>> expectedRecords, MockFlowFile resultFlowFile) throws IOException {
    assertEquals(String.valueOf(expectedRecords.size()), resultFlowFile.getAttribute(PutHiveStreaming.HIVE_STREAMING_RECORD_COUNT_ATTR));

    final DataFileStream<GenericRecord> reader = new DataFileStream<>(
            new ByteArrayInputStream(resultFlowFile.toByteArray()),
            new GenericDatumReader<GenericRecord>());

    Schema schema = reader.getSchema();

    // Verify that the schema is preserved
    assertTrue(schema.equals(new Schema.Parser().parse(new File("src/test/resources/user.avsc"))));

    GenericRecord record = null;
    for (Map<String, Object> expectedRecord : expectedRecords) {
        assertTrue(reader.hasNext());
        record = reader.next(record);
        final String name = record.get("name").toString();
        final Integer favorite_number = (Integer) record.get("favorite_number");
        assertNotNull(name);
        assertNotNull(favorite_number);
        assertNull(record.get("favorite_color"));
        assertNull(record.get("scale"));

        assertEquals(expectedRecord.get("name"), name);
        assertEquals(expectedRecord.get("favorite_number"), favorite_number);
    }
    assertFalse(reader.hasNext());
}
 
Example 29
private void setValidationConfigs(Job job, Path path)
    throws IOException {
  SegmentsValidationAndRetentionConfig validationConfig = _tableConfig.getValidationConfig();

  // TODO: Serialize and deserialize validation config by creating toJson and fromJson
  // If this is an append use case, check that one time unit is contained in one file. If there is more than one,
  // the job should be disabled, as we should not resize for these use cases. Therefore, set the time column name
  // and value here.
  if (validationConfig.getSegmentPushType().equalsIgnoreCase("APPEND")) {
    job.getConfiguration().set(InternalConfigConstants.IS_APPEND, "true");
    String timeColumnName = validationConfig.getTimeColumnName();
    job.getConfiguration().set(InternalConfigConstants.TIME_COLUMN_CONFIG, timeColumnName);
    if (timeColumnName != null) {
      DateTimeFieldSpec dateTimeFieldSpec = _pinotTableSchema.getSpecForTimeColumn(timeColumnName);
      if (dateTimeFieldSpec != null) {
        DateTimeFormatSpec formatSpec = new DateTimeFormatSpec(dateTimeFieldSpec.getFormat());
        job.getConfiguration()
            .set(InternalConfigConstants.SEGMENT_TIME_TYPE, formatSpec.getColumnUnit().toString());
        job.getConfiguration()
            .set(InternalConfigConstants.SEGMENT_TIME_FORMAT, formatSpec.getTimeFormat().toString());
        job.getConfiguration()
            .set(InternalConfigConstants.SEGMENT_TIME_SDF_PATTERN, formatSpec.getSDFPattern());
      }
    }
    job.getConfiguration()
        .set(InternalConfigConstants.SEGMENT_PUSH_FREQUENCY, validationConfig.getSegmentPushFrequency());
    try (DataFileStream<GenericRecord> dataStreamReader = getAvroReader(path)) {
      job.getConfiguration()
          .set(InternalConfigConstants.TIME_COLUMN_VALUE, dataStreamReader.next().get(timeColumnName).toString());
    }
  }
}
 
Example 30
/**
 * Returns the Avro file reader.
 * @param is the {@link java.io.InputStream} of the source Avro file
 * @param sourceFileName the source Avro file name
 * @return a {@link DataFileStream} reader over the input stream
 * @throws StageException if the stream cannot be opened as an Avro data file
 */
private DataFileStream<GenericRecord> getFileReader(InputStream is, String sourceFileName) throws StageException {
  try {
    DatumReader<GenericRecord> reader = new GenericDatumReader<>();
    DataFileStream<GenericRecord> fileReader = new DataFileStream<>(is, reader);
    return fileReader;
  } catch (IOException ex) {
    throw new TransformerStageCheckedException(Errors.CONVERT_11, sourceFileName, ex);
  }
}