org.apache.avro.file.DataFileStream Java Examples

The following examples show how to use org.apache.avro.file.DataFileStream. They are drawn from a variety of open source projects; the source file, project, and license are noted above each example.
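
Most examples follow the same basic pattern: wrap an InputStream in a DataFileStream together with a DatumReader, then iterate over the decoded records. A minimal, self-contained sketch (the file name is illustrative):

import java.io.FileInputStream;
import java.io.IOException;

import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class ReadAvroExample {
    public static void main(String[] args) throws IOException {
        // The schema is read from the container file's header.
        try (DataFileStream<GenericRecord> stream = new DataFileStream<>(
                new FileInputStream("records.avro"), new GenericDatumReader<>())) {
            System.out.println("schema: " + stream.getSchema());
            GenericRecord record = null;
            while (stream.hasNext()) {
                record = stream.next(record); // reuse the record object to limit allocation
                System.out.println(record);
            }
        }
    }
}
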
Example #1
Source File: AvroStorageUtils.java    From Cubert with Apache License 2.0
/**
 * This method is called by {@link #getAvroSchema}. The default implementation
 * returns the schema of a single Avro file, or, for a first-level directory
 * (one that contains no sub-directories), the schema of its last file.
 *
 * @param path  path of a file or first-level directory
 * @param fs  file system
 * @return avro schema
 * @throws IOException if the file cannot be read
 */
public static Schema getSchema(Path path, FileSystem fs) throws IOException {
    /* get the path of the last file */
    Path lastFile = AvroStorageUtils.getLast(path, fs);
    if (lastFile == null) {
        return null;
    }

    /* read the file header and obtain the schema; try-with-resources closes
       both streams even if the DataFileStream constructor throws */
    GenericDatumReader<Object> avroReader = new GenericDatumReader<Object>();
    try (InputStream hdfsInputStream = fs.open(lastFile);
         DataFileStream<Object> avroDataStream = new DataFileStream<Object>(hdfsInputStream, avroReader)) {
        return avroDataStream.getSchema();
    }
}
 
Example #2
Source File: PutHiveStreaming.java    From localization_nifi with Apache License 2.0
private void appendRecordsToFlowFile(ProcessSession session,
                                     List<HiveStreamingRecord> records,
                                     AtomicReference<FlowFile> appendFlowFile,
                                     DataFileWriter<GenericRecord> avroWriter,
                                     DataFileStream<GenericRecord> reader) throws IOException {

    appendFlowFile.set(session.append(appendFlowFile.get(), (out) -> {

        try (DataFileWriter<GenericRecord> writer = avroWriter.create(reader.getSchema(), out)) {
            for (HiveStreamingRecord sRecord : records) {
                writer.append(sRecord.getRecord());
            }
            writer.flush();
        }
    }));
}
 
Example #3
Source File: TestSelectHive3QL.java    From nifi with Apache License 2.0
private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }

        return recordsFromStream;
    }
}
 
Example #4
Source File: TestSplitAvro.java    From localization_nifi with Apache License 2.0
@Test
public void testRecordSplitDatafileOutputWithoutMetadata() throws IOException {
    final TestRunner runner = TestRunners.newTestRunner(new SplitAvro());
    runner.setProperty(SplitAvro.TRANSFER_METADATA, "false");

    runner.enqueue(users.toByteArray());
    runner.run();

    runner.assertTransferCount(SplitAvro.REL_SPLIT, 100);
    runner.assertTransferCount(SplitAvro.REL_ORIGINAL, 1);
    runner.assertTransferCount(SplitAvro.REL_FAILURE, 0);

    runner.getFlowFilesForRelationship(SplitAvro.REL_ORIGINAL).get(0).assertAttributeEquals(FRAGMENT_COUNT.key(), "100");
    final List<MockFlowFile> flowFiles = runner.getFlowFilesForRelationship(SplitAvro.REL_SPLIT);
    checkDataFileSplitSize(flowFiles, 1, false);

    for (final MockFlowFile flowFile : flowFiles) {
        try (final ByteArrayInputStream in = new ByteArrayInputStream(flowFile.toByteArray());
             final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY1));
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY2));
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY3));
        }
    }
}
 
Example #5
Source File: TestSelectHive_1_1QL.java    From nifi with Apache License 2.0
private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }

        return recordsFromStream;
    }
}
 
Example #6
Source File: TestSelectHiveQL.java    From nifi with Apache License 2.0
private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }

        return recordsFromStream;
    }
}
 
Example #7
Source File: TestJdbcCommon.java    From nifi with Apache License 2.0
@Test
public void testConvertToAvroStreamForShort() throws SQLException, IOException {
    final ResultSetMetaData metadata = mock(ResultSetMetaData.class);
    when(metadata.getColumnCount()).thenReturn(1);
    when(metadata.getColumnType(1)).thenReturn(Types.TINYINT);
    when(metadata.getColumnName(1)).thenReturn("t_int");
    when(metadata.getTableName(1)).thenReturn("table");

    final ResultSet rs = JdbcCommonTestUtils.resultSetReturningMetadata(metadata);

    final short s = 25;
    when(rs.getObject(Mockito.anyInt())).thenReturn(s);

    final InputStream instream = JdbcCommonTestUtils.convertResultSetToAvroInputStream(rs);

    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (final DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(instream, datumReader)) {
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            record = dataFileReader.next(record);
            assertEquals(Short.toString(s), record.get("t_int").toString());
        }
    }
}
 
Example #8
Source File: PutHiveStreaming.java    From nifi with Apache License 2.0
private byte[] initAvroWriter(ProcessSession session, String codec, DataFileStream<GenericRecord> reader,
                                     DataFileWriter<GenericRecord> writer, AtomicReference<FlowFile> flowFileRef) {

    writer.setCodec(CodecFactory.fromString(codec));
    // Transfer metadata (this is a subset of the incoming file)
    for (String metaKey : reader.getMetaKeys()) {
        if (!RESERVED_METADATA.contains(metaKey)) {
            writer.setMeta(metaKey, reader.getMeta(metaKey));
        }
    }

    final ByteArrayOutputStream avroHeader = new ByteArrayOutputStream();
    flowFileRef.set(session.append(flowFileRef.get(), (out) -> {
        // Create writer so that records can be appended later.
        writer.create(reader.getSchema(), avroHeader);
        writer.close();

        final byte[] header = avroHeader.toByteArray();
        out.write(header);
    }));

    // Capture the Avro header byte array that was just written to the FlowFile.
    // This is needed when Avro records are appended to the same FlowFile.
    return avroHeader.toByteArray();
}
 
Example #9
Source File: SegmentCreationPhaseMapReduceJob.java    From incubator-pinot with Apache License 2.0
private LongColumnPreIndexStatsCollector getTimeColumnStatsCollector(Schema schema, File localAvroFile)
    throws IOException {
  String timeColumnName = schema.getTimeColumnName();
  FieldSpec spec = schema.getTimeFieldSpec();
  LOGGER.info("Spec for " + timeColumnName + " is " + spec);
  LongColumnPreIndexStatsCollector timeColumnStatisticsCollector =
      new LongColumnPreIndexStatsCollector(spec.getName(), new StatsCollectorConfig(schema, null));
  LOGGER.info("StatsCollector: " + timeColumnStatisticsCollector);
  // try-with-resources ensures the stream is closed even if collection fails
  try (DataFileStream<GenericRecord> dataStream =
      new DataFileStream<GenericRecord>(new FileInputStream(localAvroFile), new GenericDatumReader<GenericRecord>())) {
    while (dataStream.hasNext()) {
      GenericRecord next = dataStream.next();
      timeColumnStatisticsCollector.collect(next.get(timeColumnName));
    }
  }
  timeColumnStatisticsCollector.seal();

  return timeColumnStatisticsCollector;
}
 
Example #10
Source File: AvroDateRangeMetadata.java    From datafu with Apache License 2.0
/**
 * Reads the date range from the metadata stored in an Avro file.
 * 
 * @param fs file system to access path
 * @param path path to get date range for
 * @return date range
 * @throws IOException if the file or its metadata cannot be read
 */
public static DateRange getOutputFileDateRange(FileSystem fs, Path path) throws IOException
{
  path = fs.listStatus(path, PathUtils.nonHiddenPathFilter)[0].getPath();
  FSDataInputStream dataInputStream = fs.open(path);
  DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(dataInputStream, reader);
  
  try
  {
    return new DateRange(new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_START))),
                         new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_END))));
  }
  finally
  {
    dataFileStream.close();
    dataInputStream.close();
  }
}
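
The date-range metadata read above must have been embedded when the file was written; DataFileWriter.setMeta is the writing-side counterpart. A minimal sketch (the key names, schema, and timestamp values are illustrative, not datafu's actual constants):

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class WriteDateRangeMetadata {
    public static void main(String[] args) throws IOException {
        Schema schema = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"Event\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}");
        try (DataFileWriter<GenericRecord> writer =
                 new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
            // Metadata must be set before create(); it is stored in the file header.
            writer.setMeta("date.start", "1577836800000"); // example epoch millis
            writer.setMeta("date.end", "1577923200000");
            writer.create(schema, new File("events.avro"));
            GenericRecord record = new GenericData.Record(schema);
            record.put("id", 1L);
            writer.append(record);
        }
    }
}
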
 
Example #11
Source File: Examples.java    From datafu with Apache License 2.0
private Long loadMemberCount(Path path, String timestamp) throws IOException
{
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(path, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(path,timestamp + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      GenericRecord r = dataFileStream.next();
      Long count = (Long)((GenericRecord)r.get("value")).get("count");   
      Assert.assertNotNull(count);       
      System.out.println("found count: " + count);
      return count;
    }
    finally
    {
      dataFileStream.close();
    }
  }
  throw new RuntimeException("found no data");
}
 
Example #12
Source File: HadoopSegmentPreprocessingJob.java    From incubator-pinot with Apache License 2.0
/**
 * Finds the first Avro file in the input folder and returns its Avro schema.
 * @param inputPathDir Path to input directory
 * @return Input schema, or null if no Avro file is found
 * @throws IOException if an I/O error occurs while accessing the directory
 */
private Schema getSchema(Path inputPathDir)
    throws IOException {
  FileSystem fs = FileSystem.get(new Configuration());
  Schema avroSchema = null;
  for (FileStatus fileStatus : fs.listStatus(inputPathDir)) {
    if (fileStatus.isFile() && fileStatus.getPath().getName().endsWith(".avro")) {
      _logger.info("Extracting schema from " + fileStatus.getPath());
      // read the schema from the file that was actually found
      try (DataFileStream<GenericRecord> dataStreamReader = getAvroReader(fileStatus.getPath())) {
        avroSchema = dataStreamReader.getSchema();
      }
      break;
    }
  }
  return avroSchema;
}
 
Example #13
Source File: PentahoAvroInputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
private DataFileStream<Object> createNestedDataFileStream() throws Exception {
  DatumReader<Object> datumReader;
  if ( useFieldAsInputStream ) {
    datumReader = new GenericDatumReader<Object>();
    inputStream.reset();
    return new DataFileStream<Object>( inputStream, datumReader );
  }
  if ( schemaFileName != null && schemaFileName.length() > 0 ) {
    Schema schema = new Schema.Parser().parse( KettleVFS.getInputStream( schemaFileName, variableSpace ) );
    datumReader = new GenericDatumReader<Object>( schema );
  } else {
    datumReader = new GenericDatumReader<Object>();
  }
  FileObject fileObject = KettleVFS.getFileObject( fileName, variableSpace );
  if ( fileObject.isFile() ) {
    this.inputStream = fileObject.getContent().getInputStream();
    return new DataFileStream<>( inputStream, datumReader );
  } else {
    FileObject[] avroFiles = fileObject.findFiles( new FileExtensionSelector( "avro" ) );
    if ( !Utils.isEmpty( avroFiles ) ) {
      this.inputStream = avroFiles[ 0 ].getContent().getInputStream();
      return new DataFileStream<>( inputStream, datumReader );
    }
    return null;
  }
}
 
Example #14
Source File: TestSplitAvro.java    From nifi with Apache License 2.0
@Test
public void testRecordSplitDatafileOutputWithoutMetadata() throws IOException {
    final TestRunner runner = TestRunners.newTestRunner(new SplitAvro());
    runner.setProperty(SplitAvro.TRANSFER_METADATA, "false");

    runner.enqueue(users.toByteArray());
    runner.run();

    runner.assertTransferCount(SplitAvro.REL_SPLIT, 100);
    runner.assertTransferCount(SplitAvro.REL_ORIGINAL, 1);
    runner.assertTransferCount(SplitAvro.REL_FAILURE, 0);

    runner.getFlowFilesForRelationship(SplitAvro.REL_ORIGINAL).get(0).assertAttributeEquals(FRAGMENT_COUNT.key(), "100");
    final List<MockFlowFile> flowFiles = runner.getFlowFilesForRelationship(SplitAvro.REL_SPLIT);
    checkDataFileSplitSize(flowFiles, 1, false);

    for (final MockFlowFile flowFile : flowFiles) {
        try (final ByteArrayInputStream in = new ByteArrayInputStream(flowFile.toByteArray());
             final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY1));
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY2));
            Assert.assertFalse(reader.getMetaKeys().contains(META_KEY3));
        }
    }
}
 
Example #15
Source File: TestJdbcCommonConvertToAvro.java    From nifi with Apache License 2.0
@Test
public void testConvertToAvroStreamForNumbers() throws SQLException, IOException {
    final ResultSetMetaData metadata = mock(ResultSetMetaData.class);
    when(metadata.getColumnCount()).thenReturn(1);
    when(metadata.getColumnType(1)).thenReturn(testParams.sqlType);
    when(metadata.isSigned(1)).thenReturn(testParams.signed);
    when(metadata.getPrecision(1)).thenReturn(testParams.precision);
    when(metadata.getColumnName(1)).thenReturn("t_int");
    when(metadata.getTableName(1)).thenReturn("table");

    final ResultSet rs = JdbcCommonTestUtils.resultSetReturningMetadata(metadata);

    final int ret = 0;
    when(rs.getObject(Mockito.anyInt())).thenReturn(ret);

    final InputStream instream = JdbcCommonTestUtils.convertResultSetToAvroInputStream(rs);

    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (final DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(instream, datumReader)) {
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            record = dataFileReader.next(record);
            assertEquals(Integer.toString(ret), record.get("t_int").toString());
        }
    }
}
 
Example #16
Source File: QueryDatabaseTableTest.java    From nifi with Apache License 2.0
private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        long recordsFromStream = 0;
        while (dataFileReader.hasNext()) {
            // Reuse record object by passing it to next(). This saves us from
            // allocating and garbage collecting many objects for files with
            // many items.
            record = dataFileReader.next(record);
            recordsFromStream += 1;
        }

        return recordsFromStream;
    }
}
 
Example #17
Source File: TestHDFSCompressedDataStream.java    From mt-flume with Apache License 2.0
@Test
public void testGzipDurabilityWithSerializer() throws Exception {
  Context context = new Context();
  context.put("serializer", "AVRO_EVENT");

  HDFSCompressedDataStream writer = new HDFSCompressedDataStream();
  writer.configure(context);

  writer.open(fileURI, factory.getCodec(new Path(fileURI)),
      SequenceFile.CompressionType.BLOCK);

  String[] bodies = { "yarf!", "yarfing!" };
  writeBodies(writer, bodies);

  int found = 0;
  int expected = bodies.length;
  List<String> expectedBodies = Lists.newArrayList(bodies);

  GZIPInputStream cmpIn = new GZIPInputStream(new FileInputStream(file));
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  DataFileStream<GenericRecord> avroStream =
      new DataFileStream<GenericRecord>(cmpIn, reader);
  GenericRecord record = new GenericData.Record(avroStream.getSchema());
  while (avroStream.hasNext()) {
    avroStream.next(record);
    CharsetDecoder decoder = Charsets.UTF_8.newDecoder();
    String bodyStr = decoder.decode((ByteBuffer) record.get("body"))
        .toString();
    expectedBodies.remove(bodyStr);
    found++;
  }
  avroStream.close();
  cmpIn.close();

  Assert.assertTrue("Found = " + found + ", Expected = " + expected
      + ", Left = " + expectedBodies.size() + " " + expectedBodies,
      expectedBodies.size() == 0);
}
 
Example #18
Source File: AvroStockFileRead.java    From hiped2 with Apache License 2.0
public static void dumpStream(InputStream is) throws IOException {
  DataFileStream<Stock> reader =
      new DataFileStream<Stock>(
          is,
          new SpecificDatumReader<Stock>(Stock.class));

  for (Stock a : reader) {
    System.out.println(ToStringBuilder.reflectionToString(a,
        ToStringStyle.SIMPLE_STYLE
    ));
  }

  IOUtils.closeStream(is);
  IOUtils.closeStream(reader);
}
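
The specific-record read above assumes a file written with the same generated class. For context, a hedged sketch of the writing side (the class and method names here are hypothetical; Stock is assumed to be an Avro-generated SpecificRecord, as in the read example):

import java.io.File;
import java.io.IOException;

import org.apache.avro.file.DataFileWriter;
import org.apache.avro.specific.SpecificDatumWriter;

public class StockFileWriteSketch {
    public static void writeStocks(File file, Iterable<Stock> stocks) throws IOException {
        try (DataFileWriter<Stock> writer =
                 new DataFileWriter<Stock>(new SpecificDatumWriter<Stock>(Stock.class))) {
            // The schema comes from the generated class itself.
            writer.create(Stock.getClassSchema(), file);
            for (Stock stock : stocks) {
                writer.append(stock);
            }
        }
    }
}
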
 
Example #19
Source File: AvroUtils.java    From incubator-pinot with Apache License 2.0
/**
 * Given an Avro data file, a map from column to field type, and a time unit, return the equivalent Pinot schema.
 *
 * @param avroDataFile Avro data file
 * @param fieldTypeMap Map from column to field type
 * @param timeUnit Time unit
 * @return Pinot schema
 */
public static Schema getPinotSchemaFromAvroDataFile(File avroDataFile,
    @Nullable Map<String, FieldSpec.FieldType> fieldTypeMap, @Nullable TimeUnit timeUnit)
    throws IOException {
  try (DataFileStream<GenericRecord> reader = getAvroReader(avroDataFile)) {
    org.apache.avro.Schema avroSchema = reader.getSchema();
    return getPinotSchemaFromAvroSchema(avroSchema, fieldTypeMap, timeUnit);
  }
}
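
Both fieldTypeMap and timeUnit are @Nullable, so a quick conversion can pass null for each and accept the defaults derived from the Avro schema. A hypothetical call:

// Hypothetical usage: null fieldTypeMap and timeUnit accept Avro-derived defaults.
Schema pinotSchema = AvroUtils.getPinotSchemaFromAvroDataFile(new File("events.avro"), null, null);
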
 
Example #20
Source File: PartitionPreservingJoinTests.java    From datafu with Apache License 2.0
private HashMap<Long,ImpressionClick> loadOutputCounts(String timestamp) throws IOException
{
  HashMap<Long,ImpressionClick> counts = new HashMap<Long,ImpressionClick>();
  FileSystem fs = getFileSystem();
  String nestedPath = getNestedPathFromTimestamp(timestamp);
  Assert.assertTrue(fs.exists(new Path(_outputPath, nestedPath)));
  for (FileStatus stat : fs.globStatus(new Path(_outputPath,nestedPath + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      while (dataFileStream.hasNext())
      {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Integer impressions = (Integer)((GenericRecord)r.get("value")).get("impressions");    
        Integer clicks = (Integer)((GenericRecord)r.get("value")).get("clicks");         
        Assert.assertFalse(counts.containsKey(memberId));
        ImpressionClick data = new ImpressionClick();
        data.clicks = clicks;
        data.impressions = impressions;
        counts.put(memberId, data);
      }
    }
    finally
    {
      dataFileStream.close();
    }
  }
  return counts;
}
 
Example #21
Source File: ParquetUtils.java    From incubator-pinot with Apache License 2.0
/**
 * Get the Avro file reader for the given file.
 */
public static DataFileStream<GenericRecord> getAvroReader(File avroFile)
    throws IOException {
  if (avroFile.getName().endsWith(".gz")) {
    return new DataFileStream<>(new GZIPInputStream(new FileInputStream(avroFile)), new GenericDatumReader<>());
  } else {
    return new DataFileStream<>(new FileInputStream(avroFile), new GenericDatumReader<>());
  }
}
 
Example #22
Source File: PartitionPreservingCollapsingIntegrationTests.java    From datafu with Apache License 2.0
private HashMap<Long,Long> loadOutputCounts(Path path, String timestamp) throws IOException
{
  HashMap<Long,Long> counts = new HashMap<Long,Long>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(path, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(path,timestamp + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      while (dataFileStream.hasNext())
      {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Long count = (Long)((GenericRecord)r.get("value")).get("count");        
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    }
    finally
    {
      dataFileStream.close();
    }
  }
  return counts;
}
 
Example #23
Source File: Schemas.java    From parquet-mr with Apache License 2.0
public static Schema fromAvro(InputStream in) throws IOException {
  GenericDatumReader<GenericRecord> datumReader =
      new GenericDatumReader<GenericRecord>();
  DataFileStream<GenericRecord> stream = null;
  boolean threw = true;

  try {
    stream = new DataFileStream<>(in, datumReader);
    Schema schema = stream.getSchema();
    threw = false;
    return schema;
  } finally {
    Closeables.close(stream, threw);
  }
}
 
Example #24
Source File: AvroUtils.java    From ml-ease with Apache License 2.0
/**
 * Loads the schema from an Avro data file.
 * 
 * @param conf The JobConf.
 * @param path The path to the data file.
 * @return The schema read from the data file's metadata.
 * @throws IOException if the file cannot be opened or read
 */
public static Schema getSchemaFromFile(JobConf conf, Path path) throws IOException
{
  FileSystem fs = path.getFileSystem(conf); // use the passed-in conf rather than a fresh Configuration
  try (FSDataInputStream dataInputStream = fs.open(path);
       DataFileStream<GenericRecord> dataFileStream =
           new DataFileStream<GenericRecord>(dataInputStream, new GenericDatumReader<GenericRecord>()))
  {
    return dataFileStream.getSchema();
  }
}
 
Example #25
Source File: AvroDump.java    From hiped2 with Apache License 2.0
public static void readFromAvro(InputStream is) throws IOException {
  DataFileStream<Object> reader =
      new DataFileStream<Object>(
          is, new GenericDatumReader<Object>());
  for (Object o : reader) {
    System.out.println(o);
  }
  IOUtils.closeStream(is);
  IOUtils.closeStream(reader);
}
 
Example #26
Source File: TestJdbcCommon.java    From nifi with Apache License 2.0
@Test
public void testConvertToAvroStreamForUnsignedIntegerWithPrecision10() throws SQLException, IOException {
    final String mockColumnName = "t_int";
    final ResultSetMetaData metadata = mock(ResultSetMetaData.class);
    when(metadata.getColumnCount()).thenReturn(1);
    when(metadata.getColumnType(1)).thenReturn(Types.INTEGER);
    when(metadata.isSigned(1)).thenReturn(false);
    when(metadata.getPrecision(1)).thenReturn(10);
    when(metadata.getColumnName(1)).thenReturn(mockColumnName);
    when(metadata.getTableName(1)).thenReturn("table");

    final ResultSet rs = JdbcCommonTestUtils.resultSetReturningMetadata(metadata);

    final Long ret = 0L;
    when(rs.getObject(Mockito.anyInt())).thenReturn(ret);

    final InputStream instream = JdbcCommonTestUtils.convertResultSetToAvroInputStream(rs);

    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (final DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(instream, datumReader)) {
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            record = dataFileReader.next(record);
            assertEquals(Long.toString(ret), record.get(mockColumnName).toString());
        }
    }
}
 
Example #27
Source File: GeoWaveAvroIngestTest.java    From geowave with Apache License 2.0
private boolean validate(final URL file) {
  try (DataFileStream<AvroSimpleFeatureCollection> ds =
      new DataFileStream<>(
          file.openStream(),
          new SpecificDatumReader<AvroSimpleFeatureCollection>(
              AvroSimpleFeatureCollection.getClassSchema()))) {
    if (ds.getHeader() != null) {
      return true;
    }
  } catch (final IOException e) {
    // Do nothing for now
  }

  return false;
}
 
Example #28
Source File: AvroFileReader.java    From ml-ease with Apache License 2.0
public <T> void build(String filePath, AvroConsumer<T> builder) throws IOException
{
  
  List<Path> paths = getPaths(filePath);

  for (Path path: paths)
  {
    DataFileStream<Object> stream = null;
    try
    {
      stream = getAvroDataStream(path);
      while (stream.hasNext())
      {
        builder.consume(stream.next());
      }
    }
    finally
    {
      if (stream != null)
      {
        stream.close();
      }
    }
  }
  
  builder.done();
}
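
The AvroConsumer interface itself is not shown here. From the call sites above it needs at least a per-record consume method and a done callback; a hypothetical counting consumer under that assumed shape (the real ml-ease interface may differ):

// Hypothetical consumer matching the call sites above: consume(...) is invoked
// once per record from DataFileStream.next(), done() once all files are read.
public class CountingConsumer implements AvroConsumer<Long> {
    private long count = 0;

    @Override
    public void consume(Object record) {
        count++; // each record is a decoded Avro datum
    }

    @Override
    public void done() {
        System.out.println("consumed " + count + " records");
    }
}
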
 
Example #29
Source File: RegressionAdmmTrain.java    From ml-ease with Apache License 2.0
private void computeU(JobConf conf, String uPath, String uplusxPath, Map<String, LinearModel> z) throws IOException
{
  AvroHdfsFileWriter<GenericRecord> writer =
      new AvroHdfsFileWriter<GenericRecord>(conf, uPath, LinearModelAvro.SCHEMA$);
  DataFileWriter<GenericRecord> recordwriter = writer.get();
  // read u+x
  for (Path path : Util.findPartFiles(conf, new Path(uplusxPath)))
  {
    // close each part-file stream when it has been fully consumed
    try (DataFileStream<Object> stream = AvroUtils.getAvroDataStream(conf, path))
    {
      while (stream.hasNext())
      {
        GenericData.Record record = (GenericData.Record) stream.next();
        String partitionID = Util.getStringAvro(record, "key", false);
        if (record.get("uplusx") != null)
        {
          String lambda = Util.getLambda(partitionID);
          LinearModel newu =
              new LinearModel(LibLinearDataset.INTERCEPT_NAME, (List<?>) record.get("uplusx"));
          newu.linearCombine(1.0, -1.0, z.get(lambda));
          GenericData.Record newvaluemap =
              new GenericData.Record(LinearModelAvro.SCHEMA$);
          List<?> modellist = newu.toAvro(LibLinearDataset.INTERCEPT_NAME);
          newvaluemap.put("key", partitionID);
          newvaluemap.put("model", modellist);
          recordwriter.append(newvaluemap);
        }
      }
    }
  }
  recordwriter.close();
}
 
Example #30
Source File: AvroStockAvgFileRead.java    From hiped2 with Apache License 2.0
public static void readFromAvro(InputStream is) throws IOException {
  DataFileStream<StockAvg> reader =          // open a stream of generated StockAvg records
      new DataFileStream<StockAvg>(
          is,
          new SpecificDatumReader<StockAvg>(StockAvg.class));

  for (StockAvg a : reader) {                // iterate over every record in the file
    System.out.println(ToStringBuilder.reflectionToString(a,
        ToStringStyle.SHORT_PREFIX_STYLE
    ));
  }

  IOUtils.closeStream(is);
  IOUtils.closeStream(reader);
}