Java Code Examples for org.apache.avro.file.FileReader

The following examples show how to use org.apache.avro.file.FileReader. They are extracted from open source projects; where available, the source project, source file, and license are noted above each example.
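
Before the project examples, here is a minimal, self-contained sketch of the pattern most of them share: open a FileReader with DataFileReader.openReader, read the schema, iterate the records, and close the reader. FileReader extends both Iterator and Iterable, so either iteration style works. The file name and the printing below are placeholders, not taken from any project in this list.

import java.io.File;
import java.io.IOException;

import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.FileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;

public class FileReaderSketch {
  public static void main(String[] args) throws IOException {
    File input = new File("records.avro"); // hypothetical input file
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    // openReader inspects the file header and returns a suitable FileReader implementation
    try (FileReader<GenericRecord> reader = DataFileReader.openReader(input, datumReader)) {
      System.out.println("schema: " + reader.getSchema());
      while (reader.hasNext()) {      // FileReader is an Iterator<GenericRecord>,
        GenericRecord record = reader.next();
        System.out.println(record);   // so records can be streamed one at a time
      }
    }
  }
}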
Example 1
Source Project: iceberg   Source File: AvroIterable.java    License: Apache License 2.0
@Override
public CloseableIterator<D> iterator() {
  FileReader<D> fileReader = initMetadata(newFileReader());

  if (start != null) {
    fileReader = new AvroRangeIterator<>(fileReader, start, end);
  }

  addCloseable(fileReader);

  if (reuseContainers) {
    return new AvroReuseIterator<>(fileReader);
  }

  return CloseableIterator.withClose(fileReader);
}
 
Example 2
Source Project: iceberg   Source File: AvroIterable.java    License: Apache License 2.0
@Override
public Iterator<D> iterator() {
  FileReader<D> reader = initMetadata(newFileReader());

  if (start != null) {
    reader = new AvroRangeIterator<>(reader, start, end);
  }

  if (reuseContainers) {
    return new AvroReuseIterator<>(reader);
  }

  addCloseable(reader);

  return reader;
}
 
Example 3
Source Project: datacollector   Source File: AvroToOrcRecordConverter.java    License: Apache License 2.0
public void convert(SeekableInput avroInputFile, Path orcOutputFile) throws IOException {
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  try (FileReader<GenericRecord> fileReader = DataFileReader.openReader(avroInputFile, reader)) {
    Schema avroSchema = fileReader.getSchema();

    initializeWriter(avroSchema, orcOutputFile);

    while (fileReader.hasNext()) {
      GenericRecord record = fileReader.next();

      addAvroRecord(record);
    }

    closeWriter();
  }
}
 
Example 4
@Test(groups = {"ignore"})
public void testRowLevelPolicy()
    throws Exception {
  State state = new State();
  state.setProp(ConfigurationKeys.ROW_LEVEL_POLICY_LIST, "org.apache.gobblin.qualitychecker.TestRowLevelPolicy");
  state.setProp(ConfigurationKeys.ROW_LEVEL_POLICY_LIST_TYPE, "FAIL");

  RowLevelPolicyChecker checker =
      new RowLevelPolicyCheckerBuilderFactory().newPolicyCheckerBuilder(state, -1).build();
  RowLevelPolicyCheckResults results = new RowLevelPolicyCheckResults();

  FileReader<GenericRecord> fileReader = openFile(state);

  for (GenericRecord datum : fileReader) {
    Assert.assertTrue(checker.executePolicies(datum, results));
  }
}
 
Example 5
@Test(groups = {"ignore"})
public void testWriteToErrFile()
    throws Exception {
  State state = new State();
  state.setProp(ConfigurationKeys.ROW_LEVEL_POLICY_LIST, "org.apache.gobblin.qualitychecker.TestRowLevelPolicyFail");
  state.setProp(ConfigurationKeys.ROW_LEVEL_POLICY_LIST_TYPE, "ERR_FILE");
  state.setProp(ROW_LEVEL_ERR_FILE, TestConstants.TEST_ERR_FILE);
  state.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, TestConstants.TEST_FS_URI);

  RowLevelPolicyChecker checker =
      new RowLevelPolicyCheckerBuilderFactory().newPolicyCheckerBuilder(state, -1).build();
  RowLevelPolicyCheckResults results = new RowLevelPolicyCheckResults();

  FileReader<GenericRecord> fileReader = openFile(state);

  for (GenericRecord datum : fileReader) {
    Assert.assertFalse(checker.executePolicies(datum, results));
  }

  FileSystem fs = FileSystem.get(new URI(TestConstants.TEST_FS_URI), new Configuration());
  Path outputPath = new Path(TestConstants.TEST_ERR_FILE,
      state.getProp(ConfigurationKeys.ROW_LEVEL_POLICY_LIST).replaceAll("\\.", "-") + ".err");
  Assert.assertTrue(fs.exists(outputPath));
  fs.delete(new Path(TestConstants.TEST_ERR_FILE), true);
}
 
Example 6
Source Project: celos   Source File: AvroToJsonConverter.java    License: Apache License 2.0
@Override
public FixFile convert(TestRun testRun, FixFile ff) throws IOException {
    byte[] bytes = IOUtils.toByteArray(ff.getContent());
    if (bytes.length == 0) {
        return ff;
    }
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    GenericDatumReader<Object> reader = new GenericDatumReader<>();
    FileReader<Object> fileReader =  DataFileReader.openReader(new SeekableByteArrayInput(bytes), reader);
    try {
        Schema schema = fileReader.getSchema();
        DatumWriter<Object> writer = new GenericDatumWriter<>(schema);
        JsonEncoder encoder = EncoderFactory.get().jsonEncoder(schema, os);

        for (Object datum : fileReader) {
            writer.write(datum, encoder);
        }
        encoder.flush();
    } finally {
        fileReader.close();
    }
    return new FixFile(new ByteArrayInputStream(os.toByteArray()));
}
 
Example 7
Source Project: kite   Source File: AvroMorphlineTest.java    License: Apache License 2.0
private void runTweetContainer(String morphlineConfigFile, String[] fieldNames) throws Exception {
  File file = new File(RESOURCES_DIR + "/test-documents/sample-statuses-20120906-141433-medium.avro");
  morphline = createMorphline(morphlineConfigFile);    
  for (int j = 0; j < 3; j++) { // also test reuse of objects and low level avro buffers
    Record record = new Record();
    byte[] body = Files.toByteArray(file);    
    record.put(Fields.ATTACHMENT_BODY, body);
    collector.reset();
    startSession();
    Notifications.notifyBeginTransaction(morphline);
    assertTrue(morphline.process(record));
    assertEquals(1, collector.getNumStartEvents());
    assertEquals(2104, collector.getRecords().size());
    
    FileReader<GenericData.Record> reader = new DataFileReader(file, new GenericDatumReader());
    int i = 0;
    while (reader.hasNext()) {
      Record actual = collector.getRecords().get(i);
      GenericData.Record expected = reader.next();
      assertTweetEquals(expected, actual, fieldNames, i);
      i++;
    }    
    assertEquals(collector.getRecords().size(), i);
  }
}
 
Example 8
Source Project: Flink-CEPplus   Source File: AvroRecordInputFormatTest.java    License: Apache License 2.0
/**
 * This test validates proper serialization with specific (generated POJO) types.
 */
@Test
public void testDeserializeToSpecificType() throws IOException {

	DatumReader<User> datumReader = new SpecificDatumReader<>(userSchema);

	try (FileReader<User> dataFileReader = DataFileReader.openReader(testFile, datumReader)) {
		User rec = dataFileReader.next();

		// check if record has been read correctly
		assertNotNull(rec);
		assertEquals("name not equal", TEST_NAME, rec.get("name").toString());
		assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), rec.get("type_enum").toString());

		// now serialize it with our framework:
		ExecutionConfig ec = new ExecutionConfig();
		TypeInformation<User> te = TypeExtractor.createTypeInfo(User.class);

		assertEquals(AvroTypeInfo.class, te.getClass());
		TypeSerializer<User> tser = te.createSerializer(ec);

		ByteArrayOutputStream out = new ByteArrayOutputStream();
		try (DataOutputViewStreamWrapper outView = new DataOutputViewStreamWrapper(out)) {
			tser.serialize(rec, outView);
		}

		User newRec;
		try (DataInputViewStreamWrapper inView = new DataInputViewStreamWrapper(
				new ByteArrayInputStream(out.toByteArray()))) {
			newRec = tser.deserialize(inView);
		}

		// check if it is still the same
		assertNotNull(newRec);
		assertEquals("name not equal", TEST_NAME, newRec.getName().toString());
		assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), newRec.getTypeEnum().toString());
	}
}
 
Example 9
Source Project: flink   Source File: AvroRecordInputFormatTest.java    License: Apache License 2.0
/**
 * This test validates proper serialization with specific (generated POJO) types.
 */
@Test
public void testDeserializeToSpecificType() throws IOException {

	DatumReader<User> datumReader = new SpecificDatumReader<>(userSchema);

	try (FileReader<User> dataFileReader = DataFileReader.openReader(testFile, datumReader)) {
		User rec = dataFileReader.next();

		// check if record has been read correctly
		assertNotNull(rec);
		assertEquals("name not equal", TEST_NAME, rec.get("name").toString());
		assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), rec.get("type_enum").toString());

		// now serialize it with our framework:
		ExecutionConfig ec = new ExecutionConfig();
		TypeInformation<User> te = TypeExtractor.createTypeInfo(User.class);

		assertEquals(AvroTypeInfo.class, te.getClass());
		TypeSerializer<User> tser = te.createSerializer(ec);

		ByteArrayOutputStream out = new ByteArrayOutputStream();
		try (DataOutputViewStreamWrapper outView = new DataOutputViewStreamWrapper(out)) {
			tser.serialize(rec, outView);
		}

		User newRec;
		try (DataInputViewStreamWrapper inView = new DataInputViewStreamWrapper(
				new ByteArrayInputStream(out.toByteArray()))) {
			newRec = tser.deserialize(inView);
		}

		// check if it is still the same
		assertNotNull(newRec);
		assertEquals("name not equal", TEST_NAME, newRec.getName().toString());
		assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), newRec.getTypeEnum().toString());
	}
}
 
Example 10
Source Project: iceberg   Source File: AvroIterable.java    License: Apache License 2.0
AvroRangeIterator(FileReader<D> reader, long start, long end) {
  this.reader = reader;
  this.end = end;

  try {
    reader.sync(start);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to find sync past position %d", start);
  }
}
 
Example 11
/**
 * Get the schema of AVRO files stored in a directory
 */
public static Schema getAvroSchema(Path path, Configuration conf)
    throws IOException {
  FileSystem fs = path.getFileSystem(conf);
  Path fileToTest;
  if (fs.isDirectory(path)) {
    FileStatus[] fileStatuses = fs.listStatus(path, new PathFilter() {
      @Override
      public boolean accept(Path p) {
        String name = p.getName();
        return !name.startsWith("_") && !name.startsWith(".");
      }
    });
    if (fileStatuses.length == 0) {
      return null;
    }
    fileToTest = fileStatuses[0].getPath();
  } else {
    fileToTest = path;
  }

  SeekableInput input = new FsInput(fileToTest, conf);
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);

  Schema result = fileReader.getSchema();
  fileReader.close();
  return result;
}
 
Example 12
private boolean checkAvroFileForLine(FileSystem fs, Path p, List<Integer> record)
    throws IOException {
  SeekableInput in = new FsInput(p, new Configuration());
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
  FileReader<GenericRecord> reader = DataFileReader.openReader(in, datumReader);
  reader.sync(0);

  while (reader.hasNext()) {
    if (valueMatches(reader.next(), record)) {
      return true;
    }
  }

  return false;
}
 
Example 13
Source Project: streamx   Source File: AvroFileReader.java    License: Apache License 2.0
@Override
public Schema getSchema(Configuration conf, Path path) throws IOException {
  SeekableInput input = new FsInput(path, conf);
  DatumReader<Object> reader = new GenericDatumReader<>();
  FileReader<Object> fileReader = DataFileReader.openReader(input, reader);
  org.apache.avro.Schema schema = fileReader.getSchema();
  fileReader.close();
  return avroData.toConnectSchema(schema);
}
 
Example 14
Source Project: streamx   Source File: AvroFileReader.java    License: Apache License 2.0
@Override
public Collection<Object> readData(Configuration conf, Path path) throws IOException {
  ArrayList<Object> collection = new ArrayList<>();
  SeekableInput input = new FsInput(path, conf);
  DatumReader<Object> reader = new GenericDatumReader<>();
  FileReader<Object> fileReader = DataFileReader.openReader(input, reader);
  for (Object object: fileReader) {
    collection.add(object);
  }
  fileReader.close();
  return collection;
}
 
Example 15
Source Project: hudi   Source File: TimelineMetadataUtils.java    License: Apache License 2.0
public static <T extends SpecificRecordBase> T deserializeAvroMetadata(byte[] bytes, Class<T> clazz)
    throws IOException {
  DatumReader<T> reader = new SpecificDatumReader<>(clazz);
  FileReader<T> fileReader = DataFileReader.openReader(new SeekableByteArrayInput(bytes), reader);
  ValidationUtils.checkArgument(fileReader.hasNext(), "Could not deserialize metadata of type " + clazz);
  return fileReader.next();
}
 
Example 16
Source Project: beam   Source File: FakeJobService.java    License: Apache License 2.0
private List<TableRow> readAvroTableRows(String filename, TableSchema tableSchema)
    throws IOException {
  List<TableRow> tableRows = Lists.newArrayList();
  FileReader<GenericRecord> dfr =
      DataFileReader.openReader(new File(filename), new GenericDatumReader<>());

  while (dfr.hasNext()) {
    GenericRecord record = dfr.next(null);
    tableRows.add(BigQueryUtils.convertGenericRecordToTableRow(record, tableSchema));
  }
  return tableRows;
}
 
Example 17
Source Project: datacollector   Source File: ClusterHdfsSource.java    License: Apache License 2.0
private List<Map.Entry> previewAvroBatch(FileStatus fileStatus, int batchSize) throws IOException {
  int previewCount = previewBuffer.size();
  Path filePath = fileStatus.getPath();
  SeekableInput input = new FsInput(filePath, hadoopConf);
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  List<Map.Entry> batch = new ArrayList<>();
  try (FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader)) {
    int count = 0;
    while (fileReader.hasNext() && batch.size() < batchSize && previewCount < batchSize) {
      GenericRecord datum = fileReader.next();
      ByteArrayOutputStream out = new ByteArrayOutputStream();
      DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>
          (datum.getSchema()));
      try {
        dataFileWriter.create(datum.getSchema(), out);
        dataFileWriter.append(datum);
      } finally {
        dataFileWriter.close();
        out.close();
      }
      batch.add(new Pair(filePath.toUri().getPath() + "::" + count, out.toByteArray()));
      count++;
      previewCount++;
    }
  }
  return batch;
}
 
Example 18
Source Project: incubator-gobblin   Source File: AvroUtilsTest.java    License: Apache License 2.0
public static List<GenericRecord> getRecordFromFile(String path)
    throws IOException {
  Configuration config = new Configuration();
  SeekableInput input = new FsInput(new Path(path), config);
  DatumReader<GenericRecord> reader1 = new GenericDatumReader<>();
  FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader1);
  List<GenericRecord> records = new ArrayList<>();
  for (GenericRecord datum : fileReader) {
    records.add(datum);
  }
  fileReader.close();
  return records;
}
 
Example 19
Source Project: incubator-gobblin   Source File: TestAvroExtractor.java    License: Apache License 2.0
public static List<GenericRecord> getRecordFromFile(String path)
    throws IOException {
  Configuration config = new Configuration();
  SeekableInput input = new FsInput(new Path(path), config);
  DatumReader<GenericRecord> reader1 = new GenericDatumReader<>();
  FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader1);
  List<GenericRecord> records = new ArrayList<>();
  for (GenericRecord datum : fileReader) {
    records.add(datum);
  }
  fileReader.close();
  return records;
}
 
Example 20
protected AvroAsTextRecordReaderCopy(FileReader<T> reader, FileSplit split)
        throws IOException {

    this.reader = reader;
    reader.sync(split.getStart());                    // sync to start
    this.start = reader.tell();
    this.end = split.getStart() + split.getLength();
}
 
Example 21
protected AvroAsTextRecordReader(FileReader<T> reader, FileSplit split)
        throws IOException {

    this.reader = reader;
    reader.sync(split.getStart());                    // sync to start
    this.start = reader.tell();
    this.end = split.getStart() + split.getLength();
    tsv = new GenericDataTSV();
}
 
Example 22
private static Stream<GenericRecord> listRecords(final Path avroFile) {
    final GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    logger.debug("Reading records from new Avro file: {}", avroFile);
    try (final FileReader<GenericRecord> fileReader = DataFileReader.openReader(avroFile.toFile(), datumReader)) {
        final ImmutableList<GenericRecord> records = ImmutableList.copyOf(fileReader.iterator());
        logger.info("Read {} record(s) from new Avro file: {}", records.size(), avroFile);
        return records.stream();
    } catch (final IOException e) {
        throw new UncheckedIOException("Error reading records from file: " + avroFile, e);
    }
}
 
Example 23
Source Project: Flink-CEPplus   Source File: AvroRecordInputFormatTest.java    License: Apache License 2.0
/**
 * Test if the Flink serialization is able to properly process GenericData.Record types.
 * Usually users of Avro generate classes (POJOs) from Avro schemas.
 * However, if generated classes are not available, one can also use GenericData.Record.
 * It is an untyped key-value record which is using a schema to validate the correctness of the data.
 *
 * <p>It is not recommended to use GenericData.Record with Flink. Use generated POJOs instead.
 */
@Test
public void testDeserializeToGenericType() throws IOException {
	DatumReader<GenericData.Record> datumReader = new GenericDatumReader<>(userSchema);

	try (FileReader<GenericData.Record> dataFileReader = DataFileReader.openReader(testFile, datumReader)) {
		// initialize Record by reading it from disk (that's easier than creating it by hand)
		GenericData.Record rec = new GenericData.Record(userSchema);
		dataFileReader.next(rec);

		// check if record has been read correctly
		assertNotNull(rec);
		assertEquals("name not equal", TEST_NAME, rec.get("name").toString());
		assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), rec.get("type_enum").toString());
		assertEquals(null, rec.get("type_long_test")); // it is null for the first record.

		// now serialize it with our framework:
		TypeInformation<GenericData.Record> te = TypeExtractor.createTypeInfo(GenericData.Record.class);

		ExecutionConfig ec = new ExecutionConfig();
		assertEquals(GenericTypeInfo.class, te.getClass());

		Serializers.recursivelyRegisterType(te.getTypeClass(), ec, new HashSet<>());

		TypeSerializer<GenericData.Record> tser = te.createSerializer(ec);
		assertEquals(1, ec.getDefaultKryoSerializerClasses().size());
		assertTrue(
				ec.getDefaultKryoSerializerClasses().containsKey(Schema.class) &&
						ec.getDefaultKryoSerializerClasses().get(Schema.class).equals(AvroKryoSerializerUtils.AvroSchemaSerializer.class));

		ByteArrayOutputStream out = new ByteArrayOutputStream();
		try (DataOutputViewStreamWrapper outView = new DataOutputViewStreamWrapper(out)) {
			tser.serialize(rec, outView);
		}

		GenericData.Record newRec;
		try (DataInputViewStreamWrapper inView = new DataInputViewStreamWrapper(
				new ByteArrayInputStream(out.toByteArray()))) {
			newRec = tser.deserialize(inView);
		}

		// check if it is still the same
		assertNotNull(newRec);
		assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), newRec.get("type_enum").toString());
		assertEquals("name not equal", TEST_NAME, newRec.get("name").toString());
		assertEquals(null, newRec.get("type_long_test"));
	}
}
 
Example 24
Source Project: flink   Source File: AvroRecordInputFormatTest.java    License: Apache License 2.0
/**
 * Test if the Flink serialization is able to properly process GenericData.Record types.
 * Usually users of Avro generate classes (POJOs) from Avro schemas.
 * However, if generated classes are not available, one can also use GenericData.Record.
 * It is an untyped key-value record which is using a schema to validate the correctness of the data.
 *
 * <p>It is not recommended to use GenericData.Record with Flink. Use generated POJOs instead.
 */
@Test
public void testDeserializeToGenericType() throws IOException {
	DatumReader<GenericData.Record> datumReader = new GenericDatumReader<>(userSchema);

	try (FileReader<GenericData.Record> dataFileReader = DataFileReader.openReader(testFile, datumReader)) {
		// initialize Record by reading it from disk (that's easier than creating it by hand)
		GenericData.Record rec = new GenericData.Record(userSchema);
		dataFileReader.next(rec);

		// check if record has been read correctly
		assertNotNull(rec);
		assertEquals("name not equal", TEST_NAME, rec.get("name").toString());
		assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), rec.get("type_enum").toString());
		assertEquals(null, rec.get("type_long_test")); // it is null for the first record.

		// now serialize it with our framework:
		TypeInformation<GenericData.Record> te = TypeExtractor.createTypeInfo(GenericData.Record.class);

		ExecutionConfig ec = new ExecutionConfig();
		assertEquals(GenericTypeInfo.class, te.getClass());

		Serializers.recursivelyRegisterType(te.getTypeClass(), ec, new HashSet<>());

		TypeSerializer<GenericData.Record> tser = te.createSerializer(ec);
		assertEquals(1, ec.getDefaultKryoSerializerClasses().size());
		assertTrue(
				ec.getDefaultKryoSerializerClasses().containsKey(Schema.class) &&
						ec.getDefaultKryoSerializerClasses().get(Schema.class).equals(AvroKryoSerializerUtils.AvroSchemaSerializer.class));

		ByteArrayOutputStream out = new ByteArrayOutputStream();
		try (DataOutputViewStreamWrapper outView = new DataOutputViewStreamWrapper(out)) {
			tser.serialize(rec, outView);
		}

		GenericData.Record newRec;
		try (DataInputViewStreamWrapper inView = new DataInputViewStreamWrapper(
				new ByteArrayInputStream(out.toByteArray()))) {
			newRec = tser.deserialize(inView);
		}

		// check if it is still the same
		assertNotNull(newRec);
		assertEquals("enum not equal", TEST_ENUM_COLOR.toString(), newRec.get("type_enum").toString());
		assertEquals("name not equal", TEST_NAME, newRec.get("name").toString());
		assertEquals(null, newRec.get("type_long_test"));
	}
}
 
Example 25
Source Project: iceberg   Source File: AvroIterable.java    License: Apache License 2.0
AvroReuseIterator(FileReader<D> reader) {
  this.reader = reader;
}
 
Example 26
Source Project: datacollector   Source File: AvroConversionBaseMapper.java    License: Apache License 2.0
@Override
protected void map(String input, String output, Context context) throws IOException, InterruptedException {
  FileSystem fs = FileSystem.get(context.getConfiguration());
  Configuration conf = context.getConfiguration();

  LOG.info("Converting input file: {}", input);
  LOG.info("Output directory: {}", output);
  Path inputPath = new Path(input);
  Path outputDir = new Path(output);
  fs.mkdirs(outputDir);

  Path tempFile = new Path(outputDir, getTempFilePrefix() + inputPath.getName());
  if(fs.exists(tempFile)) {
    if(conf.getBoolean(AvroConversionCommonConstants.OVERWRITE_TMP_FILE, false)) {
      fs.delete(tempFile, true);
    } else {
      throw new IOException("Temporary file " + tempFile + " already exists.");
    }
  }
  LOG.info("Using temp file: {}", tempFile);

  // Output file is the same as input except of dropping .avro extension if it exists and appending .parquet or .orc
  String outputFileName = inputPath.getName().replaceAll("\\.avro$", "") + getOutputFileSuffix();
  Path finalFile = new Path(outputDir, outputFileName);
  LOG.info("Final path will be: {}", finalFile);

  // Avro reader
  SeekableInput seekableInput = new FsInput(inputPath, conf);
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  FileReader<GenericRecord> fileReader = DataFileReader.openReader(seekableInput, reader);
  Schema avroSchema = fileReader.getSchema();

  initializeWriter(tempFile, avroSchema, conf, context);

  LOG.info("Started reading input file");
  long recordCount = 0;
  try {
    while (fileReader.hasNext()) {
      GenericRecord record = fileReader.next();
      handleAvroRecord(record);

      context.getCounter(Counters.PROCESSED_RECORDS).increment(1);
      recordCount++;
    }
  } catch (Exception e) {
    // Various random stuff can happen while converting, so we wrap the underlying exception with more details
    String message = String.format(
        "Exception at offset %d (record %d): %s",
        fileReader.tell(),
        recordCount,
        e.toString()
    );
    throw new IOException(message, e);
  }
  LOG.info("Done reading input file");
  closeWriter();

  LOG.info("Moving temporary file {} to final destination {}", tempFile, finalFile);
  fs.rename(tempFile, finalFile);

  if(!context.getConfiguration().getBoolean(AvroConversionCommonConstants.KEEP_INPUT_FILE, false)) {
    LOG.info("Removing input file", inputPath);
    fs.delete(inputPath, true);
  }

  LOG.info("Done converting input file into output directory {}", output);
}
 