Java Code Examples for org.apache.avro.file.DataFileConstants#NULL_CODEC

The following examples show how to use org.apache.avro.file.DataFileConstants#NULL_CODEC. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: AvroSource.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Decodes a byte array as an InputStream. The byte array may be compressed using some codec.
 * Reads from the returned stream will result in decompressed bytes.
 *
 * <p>This supports the same codecs as Avro's {@link CodecFactory}, namely those defined in
 * {@link DataFileConstants}.
 *
 * <ul>
 *   <li>"snappy" : Google's Snappy compression
 *   <li>"deflate" : deflate compression
 *   <li>"bzip2" : Bzip2 compression
 *   <li>"xz" : xz compression
 *   <li>"null" (the string, not the value): Uncompressed data
 * </ul>
 *
 * <p>For "deflate", closing the returned stream also releases the {@link Inflater} created here.
 *
 * @param data the (possibly compressed) bytes to decode
 * @param codec the codec name, one of the strings listed above
 * @return a stream over {@code data} that yields decompressed bytes
 * @throws IOException if a decompressing stream cannot be set up over {@code data}
 * @throws IllegalArgumentException if {@code codec} is not one of the supported names
 */
private static InputStream decodeAsInputStream(byte[] data, String codec) throws IOException {
  ByteArrayInputStream byteStream = new ByteArrayInputStream(data);
  switch (codec) {
    case DataFileConstants.SNAPPY_CODEC:
      return new SnappyCompressorInputStream(byteStream, 1 << 16 /* Avro uses 64KB blocks */);
    case DataFileConstants.DEFLATE_CODEC:
      // nowrap == true: Do not expect ZLIB header or checksum, as Avro does not write them.
      Inflater inflater = new Inflater(true);
      // InflaterInputStream only ends an Inflater it created itself; one passed in by the caller
      // is left alone on close(). End it explicitly so its native memory is not held until GC.
      return new InflaterInputStream(byteStream, inflater) {
        @Override
        public void close() throws IOException {
          try {
            super.close();
          } finally {
            inflater.end();
          }
        }
      };
    case DataFileConstants.XZ_CODEC:
      return new XZCompressorInputStream(byteStream);
    case DataFileConstants.BZIP2_CODEC:
      return new BZip2CompressorInputStream(byteStream);
    case DataFileConstants.NULL_CODEC:
      return byteStream;
    default:
      throw new IllegalArgumentException("Unsupported codec: " + codec);
  }
}
 
Example 2
Source File: AvroSourceTest.java    From beam with Apache License 2.0 6 votes vote down vote up
@Test
public void testReadWithDifferentCodecs() throws Exception {
  // Avro's default block size is 64KB, so writing 64K records guarantees at least one full
  // block. A smaller count might suffice if every record had a known minimum size, but then the
  // test could silently stop exercising the failure condition from BEAM-422.
  List<Bird> written = createRandomRecords(1 << 16);

  // Files produced with each of these codecs must be readable.
  String[] allCodecs =
      new String[] {
        DataFileConstants.NULL_CODEC,
        DataFileConstants.BZIP2_CODEC,
        DataFileConstants.DEFLATE_CODEC,
        DataFileConstants.SNAPPY_CODEC,
        DataFileConstants.XZ_CODEC,
      };

  for (int i = 0; i < allCodecs.length; i++) {
    String codecName = allCodecs[i];
    // The codec name doubles as the test file's name.
    String path =
        generateTestFile(
            codecName,
            written,
            SyncBehavior.SYNC_DEFAULT,
            0,
            AvroCoder.of(Bird.class),
            codecName);
    AvroSource<Bird> birdSource = AvroSource.from(path).withSchema(Bird.class);
    List<Bird> readBack = SourceTestUtils.readFromSource(birdSource, null);
    assertThat(written, containsInAnyOrder(readBack.toArray()));
  }
}
 
Example 3
Source File: AvroSourceTest.java    From beam with Apache License 2.0 6 votes vote down vote up
@Test
public void testReadMetadataWithCodecs() throws Exception {
  List<Bird> written = createRandomRecords(DEFAULT_RECORD_COUNT);

  // The codec recorded in the file header must match the one used to write the file, for every
  // codec.
  String[] allCodecs =
      new String[] {
        DataFileConstants.NULL_CODEC,
        DataFileConstants.BZIP2_CODEC,
        DataFileConstants.DEFLATE_CODEC,
        DataFileConstants.SNAPPY_CODEC,
        DataFileConstants.XZ_CODEC
      };

  for (int i = 0; i < allCodecs.length; i++) {
    String codecName = allCodecs[i];
    String path =
        generateTestFile(
            codecName,
            written,
            SyncBehavior.SYNC_DEFAULT,
            0,
            AvroCoder.of(Bird.class),
            codecName);

    Metadata matched = FileSystems.matchSingleFileSpec(path);
    AvroMetadata header = AvroSource.readMetadataFromFile(matched.resourceId());
    assertEquals(codecName, header.getCodec());
  }
}
 
Example 4
Source File: AvroSourceTest.java    From beam with Apache License 2.0 6 votes vote down vote up
@Test
public void testCreateFromMetadata() throws Exception {
  // Write an uncompressed test file and resolve it to a single file's metadata.
  List<Bird> written = createRandomRecords(DEFAULT_RECORD_COUNT);
  String path =
      generateTestFile(
          DataFileConstants.NULL_CODEC,
          written,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);
  Metadata matched = FileSystems.matchSingleFileSpec(path);

  // A source built from metadata, and every source derived from it, must report the
  // single-file-or-subrange mode.
  AvroSource<GenericRecord> fromMetadata = AvroSource.from(matched);
  AvroSource<Bird> typed = fromMetadata.withSchema(Bird.class);
  AvroSource<Bird> typedWithMinBundle = typed.withMinBundleSize(1234);

  assertEquals(FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, fromMetadata.getMode());
  assertEquals(FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, typed.getMode());
  assertEquals(FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, typedWithMinBundle.getMode());
}
 
Example 5
Source File: AvroSourceTest.java    From beam with Apache License 2.0 5 votes vote down vote up
@Test
public void testReadSchemaString() throws Exception {
  // Write an uncompressed test file and read its header back.
  List<Bird> written = createRandomRecords(DEFAULT_RECORD_COUNT);
  String path =
      generateTestFile(
          DataFileConstants.NULL_CODEC,
          written,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);
  Metadata matched = FileSystems.matchSingleFileSpec(path);
  AvroMetadata header = AvroSource.readMetadataFromFile(matched.resourceId());

  // Parsing validates the schema by default, which is what this test relies on.
  Schema.Parser parser = new Schema.Parser();
  Schema parsed = parser.parse(header.getSchemaString());
  assertEquals(4, parsed.getFields().size());
}
 
Example 6
Source File: AvroHdfsFileSink.java    From components with Apache License 2.0 5 votes vote down vote up
/**
 * Merges every Avro file found under {@code sourceFolder} into a single Avro file at
 * {@code targetFile}.
 *
 * <p>The schema, the non-reserved metadata entries and the codec of the merged file are taken
 * from the first source file; a missing codec entry is treated as the "null" (uncompressed)
 * codec. The data blocks of every source file are then appended without recompression.
 *
 * @param fs the file system holding both the source folder and the target file
 * @param sourceFolder the folder whose sub-files are merged
 * @param targetFile the path of the merged file to create
 * @throws IOException if listing, reading or writing any of the files fails
 */
@Override
protected void mergeOutput(FileSystem fs, String sourceFolder, String targetFile) throws IOException {
    FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
    OutputStream output = new BufferedOutputStream(fs.create(new Path(targetFile)));
    // The writer only closes the stream once create(...) has succeeded. Until then (no source
    // files, or a failure while reading the first header) this method must close it itself.
    boolean outputHandedToWriter = false;
    try (DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>())) {
        Schema schema = null;
        for (FileStatus sourceStatus : sourceStatuses) {
            try (DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(
                    new BufferedInputStream(fs.open(sourceStatus.getPath())), new GenericDatumReader<GenericRecord>())) {

                if (schema == null) {
                    // First file: it defines the schema, user metadata and codec of the output.
                    schema = reader.getSchema();
                    for (String key : reader.getMetaKeys()) {
                        if (!DataFileWriter.isReservedMeta(key)) {
                            writer.setMeta(key, reader.getMeta(key));
                        }
                    }
                    String inputCodec = reader.getMetaString(DataFileConstants.CODEC);
                    if (inputCodec == null) {
                        inputCodec = DataFileConstants.NULL_CODEC;
                    }
                    writer.setCodec(CodecFactory.fromString(inputCodec));
                    writer.create(schema, output);
                    outputHandedToWriter = true;
                }
                // recompress == false: append the source blocks as they are.
                writer.appendAllFrom(reader, false);
            }
        }
    } finally {
        if (!outputHandedToWriter) {
            output.close();
        }
    }
}
 
Example 7
Source File: AvroSource.java    From beam with Apache License 2.0 4 votes vote down vote up
/**
 * Extracts the {@link AvroMetadata} (sync marker, codec and schema string) from the header of an
 * Avro file.
 *
 * <p>The header parsed here is that of an Avro <a
 * href="https://avro.apache.org/docs/1.7.7/spec.html#Object+Container+Files">Object Container
 * File</a>.
 *
 * @throws IOException if the file is an invalid format.
 */
@VisibleForTesting
static AvroMetadata readMetadataFromFile(ResourceId fileResource) throws IOException {
  String codecName = null;
  String schemaJson = null;
  byte[] sync;
  try (InputStream in = Channels.newInputStream(FileSystems.open(fileResource))) {
    BinaryDecoder headerDecoder = DecoderFactory.get().binaryDecoder(in, null);

    // Container file header layout (see
    // https://avro.apache.org/docs/1.7.7/spec.html#Object+Container+Files):
    //   1. a four-byte magic number,
    //   2. the file metadata, including schema and codec, encoded as a map,
    //   3. the file's 16-byte sync marker.

    // 1. Magic number.
    byte[] signature = new byte[DataFileConstants.MAGIC.length];
    headerDecoder.readFixed(signature);
    if (!Arrays.equals(signature, DataFileConstants.MAGIC)) {
      throw new IOException("Missing Avro file signature: " + fileResource);
    }

    // 2. Metadata map, read block by block; only the codec and schema entries are kept.
    ByteBuffer scratch = ByteBuffer.allocate(512);
    for (long blockSize = headerDecoder.readMapStart();
        blockSize > 0;
        blockSize = headerDecoder.mapNext()) {
      for (long left = blockSize; left > 0; left--) {
        String key = headerDecoder.readString();
        // readBytes() clears the buffer and hands back one whose position marks the start of
        // the bytes read and whose limit marks their end.
        scratch = headerDecoder.readBytes(scratch);
        byte[] value = new byte[scratch.remaining()];
        scratch.get(value);
        if (DataFileConstants.CODEC.equals(key)) {
          codecName = new String(value, StandardCharsets.UTF_8);
        } else if (DataFileConstants.SCHEMA.equals(key)) {
          schemaJson = new String(value, StandardCharsets.UTF_8);
        }
      }
    }

    // 3. Sync marker.
    sync = new byte[DataFileConstants.SYNC_SIZE];
    headerDecoder.readFixed(sync);
  }
  checkState(schemaJson != null, "No schema present in Avro file metadata %s", fileResource);
  // A header without a codec entry is treated as the "null" codec.
  return new AvroMetadata(
      sync, codecName == null ? DataFileConstants.NULL_CODEC : codecName, schemaJson);
}