org.apache.avro.file.DataFileConstants Java Examples

The following examples show how to use org.apache.avro.file.DataFileConstants. Each example is drawn from an open-source project; the source file, project, and license are noted above each snippet.
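As a quick orientation before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the file name and the one-field schema are made up for illustration). It shows the typical role of DataFileConstants: its codec-name constants feed CodecFactory.fromString(...) when writing an Avro container file, and its CODEC metadata key reads the codec name back.

import org.apache.avro.Schema;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileConstants;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

import java.io.File;
import java.io.IOException;

public class DataFileConstantsDemo {
	public static void main(String[] args) throws IOException {
		Schema schema = new Schema.Parser().parse(
			"{\"type\":\"record\",\"name\":\"Entry\",\"fields\":["
				+ "{\"name\":\"id\",\"type\":\"long\"}]}");
		File file = new File("entries.avro");

		// Write a container file compressed with the codec named by DataFileConstants.
		try (DataFileWriter<GenericRecord> writer =
				new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
			writer.setCodec(CodecFactory.fromString(DataFileConstants.DEFLATE_CODEC));
			writer.create(schema, file);
			GenericRecord record = new GenericData.Record(schema);
			record.put("id", 42L);
			writer.append(record);
		}

		// Read the codec name back from the file's metadata under the CODEC key.
		try (DataFileReader<GenericRecord> reader =
				new DataFileReader<>(file, new GenericDatumReader<GenericRecord>())) {
			System.out.println(reader.getMetaString(DataFileConstants.CODEC)); // expected: "deflate"
		}
	}
}

Running this should write a deflate-compressed container file and print "deflate", the same metadata value several of the tests below assert on.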
Example #1
Source File: AvroKeyValueSinkWriter.java    From Flink-CEPplus with Apache License 2.0
private CodecFactory getCompressionCodec(Map<String, String> conf) {
	if (getBoolean(conf, CONF_COMPRESS, false)) {
		int deflateLevel = getInt(conf, CONF_DEFLATE_LEVEL, CodecFactory.DEFAULT_DEFLATE_LEVEL);
		int xzLevel = getInt(conf, CONF_XZ_LEVEL, CodecFactory.DEFAULT_XZ_LEVEL);

		String outputCodec = conf.get(CONF_COMPRESS_CODEC);

		if (DataFileConstants.DEFLATE_CODEC.equals(outputCodec)) {
			return CodecFactory.deflateCodec(deflateLevel);
		} else if (DataFileConstants.XZ_CODEC.equals(outputCodec)) {
			return CodecFactory.xzCodec(xzLevel);
		} else {
			return CodecFactory.fromString(outputCodec);
		}
	}
	return CodecFactory.nullCodec();
}
 
Example #2
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testCreateFromMetadata() throws Exception {
  List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);
  String codec = DataFileConstants.NULL_CODEC;
  String filename =
      generateTestFile(
          codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);
  Metadata fileMeta = FileSystems.matchSingleFileSpec(filename);

  AvroSource<GenericRecord> source = AvroSource.from(fileMeta);
  AvroSource<Bird> sourceWithSchema = source.withSchema(Bird.class);
  AvroSource<Bird> sourceWithSchemaWithMinBundleSize = sourceWithSchema.withMinBundleSize(1234);

  assertEquals(FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, source.getMode());
  assertEquals(FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, sourceWithSchema.getMode());
  assertEquals(
      FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, sourceWithSchemaWithMinBundleSize.getMode());
}
 
Example #3
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testReadMetadataWithCodecs() throws Exception {
  // Test reading files generated using all codecs.
  String[] codecs = {
    DataFileConstants.NULL_CODEC,
    DataFileConstants.BZIP2_CODEC,
    DataFileConstants.DEFLATE_CODEC,
    DataFileConstants.SNAPPY_CODEC,
    DataFileConstants.XZ_CODEC
  };
  List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);

  for (String codec : codecs) {
    String filename =
        generateTestFile(
            codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);

    Metadata fileMeta = FileSystems.matchSingleFileSpec(filename);
    AvroMetadata metadata = AvroSource.readMetadataFromFile(fileMeta.resourceId());
    assertEquals(codec, metadata.getCodec());
  }
}
 
Example #4
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testParseFn() throws Exception {
  List<Bird> expected = createRandomRecords(100);
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);

  AvroSource<Bird> source =
      AvroSource.from(filename)
          .withParseFn(
              input ->
                  new Bird(
                      (long) input.get("number"),
                      input.get("species").toString(),
                      input.get("quality").toString(),
                      (long) input.get("quantity")),
              AvroCoder.of(Bird.class));
  List<Bird> actual = SourceTestUtils.readFromSource(source, null);
  assertThat(actual, containsInAnyOrder(expected.toArray()));
}
 
Example #5
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testSchemaStringIsInterned() throws Exception {
  List<Bird> birds = createRandomRecords(100);
  String filename =
      generateTestFile(
          "tmp.avro",
          birds,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);
  Metadata fileMetadata = FileSystems.matchSingleFileSpec(filename);
  String schema = AvroSource.readMetadataFromFile(fileMetadata.resourceId()).getSchemaString();
  // Add "" to the schema to make sure it is not interned.
  AvroSource<GenericRecord> sourceA = AvroSource.from(filename).withSchema("" + schema);
  AvroSource<GenericRecord> sourceB = AvroSource.from(filename).withSchema("" + schema);
  assertSame(sourceA.getReaderSchemaString(), sourceB.getReaderSchemaString());

  // Ensure that deserialization still goes through interning
  AvroSource<GenericRecord> sourceC = SerializableUtils.clone(sourceB);
  assertSame(sourceA.getReaderSchemaString(), sourceC.getReaderSchemaString());
}
 
Example #6
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testSchemaUpdate() throws Exception {
  List<Bird> birds = createRandomRecords(100);
  String filename =
      generateTestFile(
          "tmp.avro",
          birds,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);

  AvroSource<FancyBird> source = AvroSource.from(filename).withSchema(FancyBird.class);
  List<FancyBird> actual = SourceTestUtils.readFromSource(source, null);

  List<FancyBird> expected = new ArrayList<>();
  for (Bird bird : birds) {
    expected.add(
        new FancyBird(
            bird.number, bird.species, bird.quality, bird.quantity, null, "MAXIMUM OVERDRIVE"));
  }

  assertThat(actual, containsInAnyOrder(expected.toArray()));
}
 
Example #7
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testCreationWithSchema() throws Exception {
  List<Bird> expected = createRandomRecords(100);
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);

  // Create a source with a schema object
  Schema schema = ReflectData.get().getSchema(Bird.class);
  AvroSource<GenericRecord> source = AvroSource.from(filename).withSchema(schema);
  List<GenericRecord> records = SourceTestUtils.readFromSource(source, null);
  assertEqualsWithGeneric(expected, records);

  // Create a source with a JSON schema
  String schemaString = ReflectData.get().getSchema(Bird.class).toString();
  source = AvroSource.from(filename).withSchema(schemaString);
  records = SourceTestUtils.readFromSource(source, null);
  assertEqualsWithGeneric(expected, records);
}
 
Example #8
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testMultipleFiles() throws Exception {
  String baseName = "tmp-";
  List<Bird> expected = new ArrayList<>();
  for (int i = 0; i < 10; i++) {
    List<Bird> contents = createRandomRecords(DEFAULT_RECORD_COUNT / 10);
    expected.addAll(contents);
    generateTestFile(
        baseName + i,
        contents,
        SyncBehavior.SYNC_DEFAULT,
        0,
        AvroCoder.of(Bird.class),
        DataFileConstants.NULL_CODEC);
  }

  AvroSource<Bird> source =
      AvroSource.from(new File(tmpFolder.getRoot().toString(), baseName + "*").toString())
          .withSchema(Bird.class);
  List<Bird> actual = SourceTestUtils.readFromSource(source, null);
  assertThat(actual, containsInAnyOrder(expected.toArray()));
}
 
Example #9
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testSplitAtFractionExhaustive() throws Exception {
  // A small-sized input is sufficient, because the test verifies that splitting is non-vacuous.
  List<FixedRecord> expected = createFixedRecords(20);
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_REGULAR,
          5,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  SourceTestUtils.assertSplitAtFractionExhaustive(source, null);
}
 
Example #10
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testGetCurrentFromUnstartedReader() throws Exception {
  List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_DEFAULT,
          1000,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  try (BlockBasedSource.BlockBasedReader<FixedRecord> reader =
      (BlockBasedSource.BlockBasedReader<FixedRecord>) source.createReader(null)) {
    assertEquals(null, reader.getCurrentBlock());

    expectedException.expect(NoSuchElementException.class);
    expectedException.expectMessage("No block has been successfully read from");
    reader.getCurrent();
  }
}
 
Example #11
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testGetProgressFromUnstartedReader() throws Exception {
  List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_DEFAULT,
          1000,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);
  File file = new File(filename);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  try (BoundedSource.BoundedReader<FixedRecord> reader = source.createReader(null)) {
    assertEquals(Double.valueOf(0.0), reader.getFractionConsumed());
  }

  List<? extends BoundedSource<FixedRecord>> splits = source.split(file.length() / 3, null);
  for (BoundedSource<FixedRecord> subSource : splits) {
    try (BoundedSource.BoundedReader<FixedRecord> reader = subSource.createReader(null)) {
      assertEquals(Double.valueOf(0.0), reader.getFractionConsumed());
    }
  }
}
 
Example #12
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testReadWithDifferentCodecs() throws Exception {
  // Test reading files generated using all codecs.
  String[] codecs = {
    DataFileConstants.NULL_CODEC,
    DataFileConstants.BZIP2_CODEC,
    DataFileConstants.DEFLATE_CODEC,
    DataFileConstants.SNAPPY_CODEC,
    DataFileConstants.XZ_CODEC,
  };
  // As Avro's default block size is 64KB, write 64K records to ensure at least one full block.
  // We could make this smaller than 64KB assuming each record is at least B bytes, but then the
  // test could silently stop testing the failure condition from BEAM-422.
  List<Bird> expected = createRandomRecords(1 << 16);

  for (String codec : codecs) {
    String filename =
        generateTestFile(
            codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);
    AvroSource<Bird> source = AvroSource.from(filename).withSchema(Bird.class);
    List<Bird> actual = SourceTestUtils.readFromSource(source, null);
    assertThat(expected, containsInAnyOrder(actual.toArray()));
  }
}
 
Example #13
Source File: AvroSource.java    From beam with Apache License 2.0
AvroBlock(byte[] data, long numRecords, Mode<T> mode, String writerSchemaString, String codec)
    throws IOException {
  this.mode = mode;
  this.numRecords = numRecords;
  checkNotNull(writerSchemaString, "writerSchemaString");
  Schema writerSchema = internOrParseSchemaString(writerSchemaString);
  Schema readerSchema =
      internOrParseSchemaString(
          MoreObjects.firstNonNull(mode.readerSchemaString, writerSchemaString));

  this.reader = mode.createReader(writerSchema, readerSchema);

  if (codec.equals(DataFileConstants.NULL_CODEC)) {
    // Avro can read from a byte[] using a more efficient implementation.  If the input is not
    // compressed, pass the data in directly.
    this.decoder = DecoderFactory.get().binaryDecoder(data, null);
  } else {
    this.decoder = DecoderFactory.get().binaryDecoder(decodeAsInputStream(data, codec), null);
  }
}
 
Example #14
Source File: AvroSource.java    From beam with Apache License 2.0
/**
 * Decodes a byte array as an InputStream. The byte array may be compressed using some codec.
 * Reads from the returned stream will result in decompressed bytes.
 *
 * <p>This supports the same codecs as Avro's {@link CodecFactory}, namely those defined in
 * {@link DataFileConstants}.
 *
 * <ul>
 *   <li>"snappy" : Google's Snappy compression
 *   <li>"deflate" : deflate compression
 *   <li>"bzip2" : Bzip2 compression
 *   <li>"xz" : xz compression
 *   <li>"null" (the string, not the value): Uncompressed data
 * </ul>
 */
private static InputStream decodeAsInputStream(byte[] data, String codec) throws IOException {
  ByteArrayInputStream byteStream = new ByteArrayInputStream(data);
  switch (codec) {
    case DataFileConstants.SNAPPY_CODEC:
      return new SnappyCompressorInputStream(byteStream, 1 << 16 /* Avro uses 64KB blocks */);
    case DataFileConstants.DEFLATE_CODEC:
      // nowrap == true: Do not expect ZLIB header or checksum, as Avro does not write them.
      Inflater inflater = new Inflater(true);
      return new InflaterInputStream(byteStream, inflater);
    case DataFileConstants.XZ_CODEC:
      return new XZCompressorInputStream(byteStream);
    case DataFileConstants.BZIP2_CODEC:
      return new BZip2CompressorInputStream(byteStream);
    case DataFileConstants.NULL_CODEC:
      return byteStream;
    default:
      throw new IllegalArgumentException("Unsupported codec: " + codec);
  }
}
 
Example #15
Source File: AvroKeyValueSinkWriter.java    From flink with Apache License 2.0
private CodecFactory getCompressionCodec(Map<String, String> conf) {
	if (getBoolean(conf, CONF_COMPRESS, false)) {
		int deflateLevel = getInt(conf, CONF_DEFLATE_LEVEL, CodecFactory.DEFAULT_DEFLATE_LEVEL);
		int xzLevel = getInt(conf, CONF_XZ_LEVEL, CodecFactory.DEFAULT_XZ_LEVEL);

		String outputCodec = conf.get(CONF_COMPRESS_CODEC);

		if (DataFileConstants.DEFLATE_CODEC.equals(outputCodec)) {
			return CodecFactory.deflateCodec(deflateLevel);
		} else if (DataFileConstants.XZ_CODEC.equals(outputCodec)) {
			return CodecFactory.xzCodec(xzLevel);
		} else {
			return CodecFactory.fromString(outputCodec);
		}
	}
	return CodecFactory.nullCodec();
}
 
Example #16
Source File: LobAvroImportTestCase.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
/**
 * Import blob data that is smaller than the inline lob limit and compressed with the
 * deflate codec. The blob data should be encoded and saved as Avro bytes.
 * @throws IOException
 * @throws SQLException
 */
public void testBlobCompressedAvroImportInline()
    throws IOException, SQLException {
  String [] types = { getBlobType() };
  String expectedVal = "This is short BLOB data";
  String [] vals = { getBlobInsertStr(expectedVal) };

  createTableWithColTypes(types, vals);

  runImport(getArgv("--compression-codec", CodecMap.DEFLATE));

  Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
  DataFileReader<GenericRecord> reader = read(outputFile);
  GenericRecord record = reader.next();

  // Verify that the data block of the Avro file is compressed with deflate
  // codec.
  assertEquals(CodecMap.DEFLATE,
      reader.getMetaString(DataFileConstants.CODEC));

  // Verify that all columns are imported correctly.
  ByteBuffer buf = (ByteBuffer) record.get(getColName(0));
  String returnVal = new String(buf.array());

  assertEquals(getColName(0), expectedVal, returnVal);
}
 
Example #17
Source File: AvroKeyValueSinkWriterTest.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testDuplicate() {
	Map<String, String> properties = new HashMap<>();
	Schema keySchema = Schema.create(Schema.Type.STRING);
	Schema valueSchema = Schema.create(Schema.Type.STRING);
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

	AvroKeyValueSinkWriter<String, String> writer = new AvroKeyValueSinkWriter(properties);
	writer.setSyncOnFlush(true);
	AvroKeyValueSinkWriter<String, String> other = writer.duplicate();

	assertTrue(StreamWriterBaseComparator.equals(writer, other));

	writer.setSyncOnFlush(false);
	assertFalse(StreamWriterBaseComparator.equals(writer, other));
}
 
Example #18
Source File: AvroKeyValueSinkWriter.java    From flink with Apache License 2.0
private CodecFactory getCompressionCodec(Map<String, String> conf) {
	if (getBoolean(conf, CONF_COMPRESS, false)) {
		int deflateLevel = getInt(conf, CONF_DEFLATE_LEVEL, CodecFactory.DEFAULT_DEFLATE_LEVEL);
		int xzLevel = getInt(conf, CONF_XZ_LEVEL, CodecFactory.DEFAULT_XZ_LEVEL);

		String outputCodec = conf.get(CONF_COMPRESS_CODEC);

		if (DataFileConstants.DEFLATE_CODEC.equals(outputCodec)) {
			return CodecFactory.deflateCodec(deflateLevel);
		} else if (DataFileConstants.XZ_CODEC.equals(outputCodec)) {
			return CodecFactory.xzCodec(xzLevel);
		} else {
			return CodecFactory.fromString(outputCodec);
		}
	}
	return CodecFactory.nullCodec();
}
 
Example #19
Source File: AvroKeyValueSinkWriterTest.java    From flink with Apache License 2.0
@Test
public void testDuplicate() {
	Map<String, String> properties = new HashMap<>();
	Schema keySchema = Schema.create(Schema.Type.STRING);
	Schema valueSchema = Schema.create(Schema.Type.STRING);
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

	AvroKeyValueSinkWriter<String, String> writer = new AvroKeyValueSinkWriter(properties);
	writer.setSyncOnFlush(true);
	AvroKeyValueSinkWriter<String, String> other = writer.duplicate();

	assertTrue(StreamWriterBaseComparator.equals(writer, other));

	writer.setSyncOnFlush(false);
	assertFalse(StreamWriterBaseComparator.equals(writer, other));
}
 
Example #20
Source File: AvroKeyValueSinkWriterTest.java    From flink with Apache License 2.0
@Test
public void testDuplicate() {
	Map<String, String> properties = new HashMap<>();
	Schema keySchema = Schema.create(Schema.Type.STRING);
	Schema valueSchema = Schema.create(Schema.Type.STRING);
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

	AvroKeyValueSinkWriter<String, String> writer = new AvroKeyValueSinkWriter(properties);
	writer.setSyncOnFlush(true);
	AvroKeyValueSinkWriter<String, String> other = writer.duplicate();

	assertTrue(StreamWriterBaseComparator.equals(writer, other));

	writer.setSyncOnFlush(false);
	assertFalse(StreamWriterBaseComparator.equals(writer, other));
}
 
Example #21
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testSplitAtFraction() throws Exception {
  // A reduced dataset is enough here.
  List<FixedRecord> expected = createFixedRecords(DEFAULT_RECORD_COUNT);
  // Create an AvroSource where each block is 1/10th of the total set of records.
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_REGULAR,
          DEFAULT_RECORD_COUNT / 10 /* max records per block */,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);
  File file = new File(filename);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  List<? extends BoundedSource<FixedRecord>> splits = source.split(file.length() / 3, null);
  for (BoundedSource<FixedRecord> subSource : splits) {
    int items = SourceTestUtils.readFromSource(subSource, null).size();
    // Shouldn't split while unstarted.
    SourceTestUtils.assertSplitAtFractionFails(subSource, 0, 0.0, null);
    SourceTestUtils.assertSplitAtFractionFails(subSource, 0, 0.7, null);
    SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, 1, 0.7, null);
    SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(
        subSource, DEFAULT_RECORD_COUNT / 100, 0.7, null);
    SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(
        subSource, DEFAULT_RECORD_COUNT / 10, 0.1, null);
    SourceTestUtils.assertSplitAtFractionFails(
        subSource, DEFAULT_RECORD_COUNT / 10 + 1, 0.1, null);
    SourceTestUtils.assertSplitAtFractionFails(subSource, DEFAULT_RECORD_COUNT / 3, 0.3, null);
    SourceTestUtils.assertSplitAtFractionFails(subSource, items, 0.9, null);
    SourceTestUtils.assertSplitAtFractionFails(subSource, items, 1.0, null);
    SourceTestUtils.assertSplitAtFractionSucceedsAndConsistent(subSource, items, 0.999, null);
  }
}
 
Example #22
Source File: Purge.java    From Cubert with Apache License 2.0
private DataFileWriter<GenericRecord> createDataFileWriter(DataFileReader<GenericRecord> dataFileReader) throws IllegalArgumentException,
        IOException
{
    Schema schema = dataFileReader.getSchema();
    DatumWriter<GenericRecord> datumWriter =
            new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> writer =
            new DataFileWriter<GenericRecord>(datumWriter);

    // Get the codec of the reader
    String codecStr = dataFileReader.getMetaString(DataFileConstants.CODEC);
    int level = conf.getInt("avro.mapred.deflate.level", 1);
    String codecName = conf.get("avro.output.codec", codecStr);
    CodecFactory factory =
            codecName.equals("deflate") ? CodecFactory.deflateCodec(level)
                    : CodecFactory.fromString(codecName);

    // Set the codec of the writer
    writer.setCodec(factory);

    writer.setSyncInterval(conf.getInt("avro.mapred.sync.interval",
                                       Math.max(conf.getInt("io.file.buffer.size",
                                                            16000), 16000)));

    writer.create(schema,
                  new Path(tempFileName).getFileSystem(conf)
                                        .create(new Path(tempFileName)));
    return writer;
}
 
Example #23
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testProgressEmptySource() throws Exception {
  // 0 records, at most 2 per block.
  List<FixedRecord> records = Collections.emptyList();
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_REGULAR,
          2,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  try (BoundedSource.BoundedReader<FixedRecord> readerOrig = source.createReader(null)) {
    assertThat(readerOrig, Matchers.instanceOf(BlockBasedReader.class));
    BlockBasedReader<FixedRecord> reader = (BlockBasedReader<FixedRecord>) readerOrig;

    // before starting
    assertEquals(0.0, reader.getFractionConsumed(), 1e-6);
    assertEquals(0, reader.getSplitPointsConsumed());
    assertEquals(BoundedReader.SPLIT_POINTS_UNKNOWN, reader.getSplitPointsRemaining());

    // confirm empty
    assertFalse(reader.start());

    // after reading empty source
    assertEquals(0, reader.getSplitPointsConsumed());
    assertEquals(0, reader.getSplitPointsRemaining());
    assertEquals(1.0, reader.getFractionConsumed(), 1e-6);
  }
}
 
Example #24
Source File: WriterUtils.java    From incubator-gobblin with Apache License 2.0
/**
 * Creates a {@link CodecFactory} based on the specified codec name and deflate level. If codecName is absent, then
 * a {@link CodecFactory#deflateCodec(int)} is returned. Otherwise the codecName is converted into a
 * {@link CodecFactory} via the {@link CodecFactory#fromString(String)} method.
 *
 * @param codecName the name of the codec to use (e.g. deflate, snappy, xz, etc.).
 * @param deflateLevel must be an integer from [0-9], and is only applicable if the codecName is "deflate".
 * @return a {@link CodecFactory}.
 */
public static CodecFactory getCodecFactory(Optional<String> codecName, Optional<String> deflateLevel) {
  if (!codecName.isPresent()) {
    return CodecFactory.deflateCodec(ConfigurationKeys.DEFAULT_DEFLATE_LEVEL);
  } else if (codecName.get().equalsIgnoreCase(DataFileConstants.DEFLATE_CODEC)) {
    if (!deflateLevel.isPresent()) {
      return CodecFactory.deflateCodec(ConfigurationKeys.DEFAULT_DEFLATE_LEVEL);
    }
    return CodecFactory.deflateCodec(Integer.parseInt(deflateLevel.get()));
  } else {
    return CodecFactory.fromString(codecName.get().toLowerCase());
  }
}
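
A possible call site for this helper (a sketch only; it assumes the Optional type in the signature is Guava's com.google.common.base.Optional, and the deflate level "7" is an arbitrary illustration):

// Deflate with an explicit level; the level string is parsed inside the helper.
CodecFactory deflate = WriterUtils.getCodecFactory(
    Optional.of(DataFileConstants.DEFLATE_CODEC), Optional.of("7"));
// Any other codec name is passed straight to CodecFactory.fromString; the level is ignored.
CodecFactory snappy = WriterUtils.getCodecFactory(
    Optional.of(DataFileConstants.SNAPPY_CODEC), Optional.<String>absent());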
 
Example #25
Source File: ImportTransform.java    From DataflowTemplates with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c) {
  KV<String, String> kv = c.element();

  String schema = null;
  ResourceId resourceId = FileSystems.matchNewResource(kv.getValue(), false);
  try (InputStream stream = Channels.newInputStream(FileSystems.open(resourceId))) {
    BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(stream, null);
    byte[] magic = new byte[DataFileConstants.MAGIC.length];
    decoder.readFixed(magic);
    if (!Arrays.equals(magic, DataFileConstants.MAGIC)) {
      throw new IOException("Missing Avro file signature: " + kv.getValue());
    }

    // Read the metadata to find the codec and schema.
    ByteBuffer valueBuffer = ByteBuffer.allocate(512);
    long numRecords = decoder.readMapStart();
    while (numRecords > 0 && schema == null) {
      for (long recordIndex = 0; recordIndex < numRecords; recordIndex++) {
        String key = decoder.readString();
        // readBytes() clears the buffer and returns a buffer where:
        // - position is the start of the bytes read
        // - limit is the end of the bytes read
        valueBuffer = decoder.readBytes(valueBuffer);
        byte[] bytes = new byte[valueBuffer.remaining()];
        valueBuffer.get(bytes);
        if (key.equals(DataFileConstants.SCHEMA)) {
          schema = new String(bytes, "UTF-8");
          break;
        }
      }
      numRecords = decoder.mapNext();
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  c.output(KV.of(kv.getKey(), schema));
}
 
Example #26
Source File: TestExecuteSQL.java    From nifi with Apache License 2.0
@Test
public void testCompression() throws SQLException, CompressorException, IOException {
    // remove previous test database, if any
    final File dbLocation = new File(DB_LOCATION);
    dbLocation.delete();

    // load test data to database
    final Connection con = ((DBCPService) runner.getControllerService("dbcp")).getConnection();
    Statement stmt = con.createStatement();

    try {
        stmt.execute("drop table TEST_NULL_INT");
    } catch (final SQLException sqle) {
    }

    stmt.execute("create table TEST_NULL_INT (id integer not null, val1 integer, val2 integer, constraint my_pk primary key (id))");

    stmt.execute("insert into TEST_NULL_INT (id, val1, val2) VALUES (0, NULL, 1)");
    stmt.execute("insert into TEST_NULL_INT (id, val1, val2) VALUES (1, 1, 1)");

    runner.setIncomingConnection(false);
    runner.setProperty(ExecuteSQL.COMPRESSION_FORMAT, AvroUtil.CodecType.BZIP2.name());
    runner.setProperty(ExecuteSQL.SQL_SELECT_QUERY, "SELECT * FROM TEST_NULL_INT");
    runner.run();

    runner.assertAllFlowFilesTransferred(ExecuteSQL.REL_SUCCESS, 1);

    MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExecuteSQL.REL_SUCCESS).get(0);

    try (DataFileStream<GenericRecord> dfs = new DataFileStream<>(new ByteArrayInputStream(flowFile.toByteArray()), new GenericDatumReader<GenericRecord>())) {
        assertEquals(AvroUtil.CodecType.BZIP2.name().toLowerCase(), dfs.getMetaString(DataFileConstants.CODEC).toLowerCase());
    }
}
 
Example #27
Source File: AvroHdfsFileSink.java    From components with Apache License 2.0
@Override
protected void mergeOutput(FileSystem fs, String sourceFolder, String targetFile) throws IOException {
    try (DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>())) {
        FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
        Schema schema = null;
        String inputCodec = null;
        OutputStream output = new BufferedOutputStream(fs.create(new Path(targetFile)));
        for (FileStatus sourceStatus : sourceStatuses) {
            try (DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(
                    new BufferedInputStream(fs.open(sourceStatus.getPath())), new GenericDatumReader<GenericRecord>())) {

                if (schema == null) {
                    schema = reader.getSchema();
                    for (String key : reader.getMetaKeys()) {
                        if (!DataFileWriter.isReservedMeta(key)) {
                            writer.setMeta(key, reader.getMeta(key));
                        }
                    }
                    inputCodec = reader.getMetaString(DataFileConstants.CODEC);
                    if (inputCodec == null) {
                        inputCodec = DataFileConstants.NULL_CODEC;
                    }
                    writer.setCodec(CodecFactory.fromString(inputCodec));
                    writer.create(schema, output);
                }
                writer.appendAllFrom(reader, false);
            }
        }
    }
}
 
Example #28
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testReadSchemaString() throws Exception {
  List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);
  String codec = DataFileConstants.NULL_CODEC;
  String filename =
      generateTestFile(
          codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);
  Metadata fileMeta = FileSystems.matchSingleFileSpec(filename);
  AvroMetadata metadata = AvroSource.readMetadataFromFile(fileMeta.resourceId());
  // By default, parse validates the schema, which is what we want.
  Schema schema = new Schema.Parser().parse(metadata.getSchemaString());
  assertEquals(4, schema.getFields().size());
}
 
Example #29
Source File: AvroHdfsFileSink.java    From components with Apache License 2.0
@Override
protected void configure(Job job, KV<AvroKey<IndexedRecord>, NullWritable> sample) {
    super.configure(job, sample);
    AvroKey<IndexedRecord> k = sample.getKey();
    AvroJob.setOutputKeySchema(job, k.datum().getSchema());
    FileOutputFormat.setCompressOutput(job, true);
    job.getConfiguration().set(AvroJob.CONF_OUTPUT_CODEC, DataFileConstants.SNAPPY_CODEC);
}
 
Example #30
Source File: BucketingSinkTest.java    From flink with Apache License 2.0
/**
 * This tests {@link AvroKeyValueSinkWriter}
 * with non-rolling output and with compression.
 */
@Test
public void testNonRollingAvroKeyValueWithCompressionWriter() throws Exception {
	final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";

	final int numElements = 20;

	Map<String, String> properties = new HashMap<>();
	Schema keySchema = Schema.create(Schema.Type.INT);
	Schema valueSchema = Schema.create(Schema.Type.STRING);
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

	BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
		.setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties))
		.setBucketer(new BasePathBucketer<Tuple2<Integer, String>>())
		.setPartPrefix(PART_PREFIX)
		.setPendingPrefix("")
		.setPendingSuffix("");

	OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness =
		createTestSink(sink, 1, 0);

	testHarness.setProcessingTime(0L);

	testHarness.setup();
	testHarness.open();

	for (int i = 0; i < numElements; i++) {
		testHarness.processElement(new StreamRecord<>(Tuple2.of(
			i, "message #" + Integer.toString(i)
		)));
	}

	testHarness.close();

	GenericData.setStringType(valueSchema, GenericData.StringType.String);
	Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);

	FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));

	SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
	DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
	for (int i = 0; i < numElements; i++) {
		AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry =
			new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
		int key = wrappedEntry.getKey();
		Assert.assertEquals(i, key);
		String value = wrappedEntry.getValue();
		Assert.assertEquals("message #" + i, value);
	}

	dataFileStream.close();
	inStream.close();
}