org.apache.hadoop.io.compress.BZip2Codec Java Examples

The following examples show how to use org.apache.hadoop.io.compress.BZip2Codec. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: BitcoinFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0

6 votes

@Test
 public void readBitcoinTransactionInputFormatBzip2Compressed() throws IOException, InterruptedException {
Configuration conf = new Configuration(defaultConf);
   Job job = Job.getInstance(conf);
   CompressionCodec bzip2 = new BZip2Codec();
   ReflectionUtils.setConf(bzip2, conf);
   ClassLoader classLoader = getClass().getClassLoader();
   String fileName="version4comp.blk.bz2";
   String fileNameBlock=classLoader.getResource("testdata/"+fileName).getFile();	
   Path file = new Path(fileNameBlock);
   FileInputFormat.setInputPaths(job, file);
   BitcoinTransactionFileInputFormat format = new BitcoinTransactionFileInputFormat();
   List<InputSplit> splits = format.getSplits(job);
   TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
   assertEquals( 1, splits.size(),"Only one split generated for compressed block");
   	RecordReader<BytesWritable, BitcoinTransaction> reader = format.createRecordReader(splits.get(0), context);
assertNotNull( reader,"Format returned  null RecordReader");
reader.initialize(splits.get(0),context);
int transactCount=0;
while (reader.nextKeyValue()) {
	transactCount++;
}
	assertEquals( 936, transactCount,"Compressed block must have at least 936 transactions");
reader.close();
 }

Example #2

Source File: BitcoinFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0

6 votes

@Test
 public void readBitcoinRawBlockInputFormatBzip2Compressed() throws IOException, InterruptedException {
Configuration conf = new Configuration(defaultConf);
   Job job = Job.getInstance(conf);
   CompressionCodec bzip2 = new BZip2Codec();
   ReflectionUtils.setConf(bzip2, conf);
   ClassLoader classLoader = getClass().getClassLoader();
   String fileName="version4comp.blk.bz2";
   String fileNameBlock=classLoader.getResource("testdata/"+fileName).getFile();	
   Path file = new Path(fileNameBlock);
   FileInputFormat.setInputPaths(job, file);
   BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
   List<InputSplit> splits = format.getSplits(job);
   TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
   assertEquals( 1, splits.size(),"Only one split generated for compressed block");
   	RecordReader<BytesWritable, BytesWritable> reader = format.createRecordReader(splits.get(0), context);
assertNotNull( reader,"Format returned  null RecordReader");
reader.initialize(splits.get(0),context);
BytesWritable key = new BytesWritable();	
BytesWritable block = new BytesWritable();
assertTrue( reader.nextKeyValue(),"Input Split for block version contains at least one block");
block=reader.getCurrentValue();
assertEquals( 998039, block.getLength(),"Compressed block must have a size of 998.039 bytes");
   	assertFalse( reader.nextKeyValue(),"No further blocks in compressed block");
reader.close();
 }

Example #3

Source File: BitcoinFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0

6 votes

@Test
 public void readBitcoinTransactionInputFormatBzip2Compressed() throws IOException {
     JobConf job = new JobConf(defaultConf);
   CompressionCodec bzip2 = new BZip2Codec();
   ReflectionUtils.setConf(bzip2, job);
   ClassLoader classLoader = getClass().getClassLoader();
   String fileName="version4comp.blk.bz2";
   String fileNameBlock=classLoader.getResource("testdata/"+fileName).getFile();	
   Path file = new Path(fileNameBlock);
   FileInputFormat.setInputPaths(job, file);
   BitcoinTransactionFileInputFormat format = new BitcoinTransactionFileInputFormat();
   format.configure(job);
   InputSplit[] inputSplits = format.getSplits(job,1);
   assertEquals( 1, inputSplits.length,"Only one split generated for compressed block");
   	RecordReader<BytesWritable, BitcoinTransaction> reader = format.getRecordReader(inputSplits[0], job, reporter);
assertNotNull( reader,"Format returned  null RecordReader");
BytesWritable key = new BytesWritable();	
BitcoinTransaction transaction = new BitcoinTransaction();
int transactCount=0;
while (reader.next(key,transaction)) {
	transactCount++;
}
	assertEquals( 936, transactCount,"Compressed block must have at least 936 transactions");
reader.close();
 }

Example #4

Source File: BitcoinFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0

6 votes

@Test
 public void readBitcoinRawBlockInputFormatBzip2Compressed() throws IOException {
   JobConf job = new JobConf(defaultConf);
   CompressionCodec bzip2 = new BZip2Codec();
   ReflectionUtils.setConf(bzip2, job);
   ClassLoader classLoader = getClass().getClassLoader();
   String fileName="version4comp.blk.bz2";
   String fileNameBlock=classLoader.getResource("testdata/"+fileName).getFile();	
   Path file = new Path(fileNameBlock);
   FileInputFormat.setInputPaths(job, file);
   BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
   format.configure(job);
   InputSplit[] inputSplits = format.getSplits(job,1);
   assertEquals( 1, inputSplits.length,"Only one split generated for compressed block");
   	RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
assertNotNull( reader,"Format returned  null RecordReader");
BytesWritable key = new BytesWritable();	
BytesWritable block = new BytesWritable();
assertTrue( reader.next(key,block),"Input Split for block version contains at least one block");
assertEquals( 998039, block.getLength(),"Compressed block must have a size of 998.039 bytes");
BytesWritable emptyKey = new BytesWritable();
   	BytesWritable emptyBlock = new BytesWritable();
   	assertFalse( reader.next(emptyKey,emptyBlock),"No further blocks in compressed block");
reader.close();
 }

Example #5

Source File: S3SelectPushdown.java From presto with Apache License 2.0

5 votes

public static boolean isCompressionCodecSupported(InputFormat<?, ?> inputFormat, Path path)
{
    if (inputFormat instanceof TextInputFormat) {
        return getCompressionCodec((TextInputFormat) inputFormat, path)
                .map(codec -> (codec instanceof GzipCodec) || (codec instanceof BZip2Codec))
                .orElse(false); // TODO (https://github.com/prestosql/presto/issues/2475) fix S3 Select when file not compressed
    }

    return false;
}

Example #6

Source File: MultiStorage.java From spork with Apache License 2.0

5 votes

@Override
public void setStoreLocation(String location, Job job) throws IOException {
  job.getConfiguration().set(MRConfiguration.TEXTOUTPUTFORMAT_SEPARATOR, "");
  FileOutputFormat.setOutputPath(job, new Path(location));
  if (comp == Compression.bz2 || comp == Compression.bz) {
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job,  BZip2Codec.class);
  } else if (comp == Compression.gz) {
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  }
}

Example #7

Source File: PigStorage.java From spork with Apache License 2.0

5 votes

private void setCompression(Path path, Job job) {
 	String location=path.getName();
    if (location.endsWith(".bz2") || location.endsWith(".bz")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job,  BZip2Codec.class);
    }  else if (location.endsWith(".gz")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    } else {
        FileOutputFormat.setCompressOutput( job, false);
    }
}

Example #8

Source File: TestExtractor.java From sqoop-on-spark with Apache License 2.0

5 votes

@DataProvider(name="test-hdfs-extractor")
public static Object[][] data() {
  List<Object[]> parameters = new ArrayList<Object[]>();
  for (Class<?> compressionClass : new Class<?>[]{null, DefaultCodec.class, BZip2Codec.class}) {
    for (Object outputFileType : new Object[]{TEXT_FILE, SEQUENCE_FILE}) {
      parameters.add(new Object[]{outputFileType, compressionClass});
    }
  }
  return parameters.toArray(new Object[0][]);
}

Example #9

Source File: TestPartitioner.java From sqoop-on-spark with Apache License 2.0

5 votes

@DataProvider(name="test-hdfs-partitioner")
public static Object[][] data() {
  List<Object[]> parameters = new ArrayList<Object[]>();
  for (Class<?> compressionClass : new Class<?>[]{null, DefaultCodec.class, BZip2Codec.class}) {
    for (Object outputFileType : new Object[]{TEXT_FILE, SEQUENCE_FILE}) {
      parameters.add(new Object[]{outputFileType, compressionClass});
    }
  }
  return parameters.toArray(new Object[0][]);
}

Example #10

Source File: BitcoinFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0

5 votes

@Test
 public void readBitcoinBlockInputFormatBzip2Compressed() throws IOException, InterruptedException {
Configuration conf = new Configuration(defaultConf);
   Job job = Job.getInstance(conf);
   CompressionCodec bzip2 = new BZip2Codec();
   ReflectionUtils.setConf(bzip2, conf);
   ClassLoader classLoader = getClass().getClassLoader();
   String fileName="version4comp.blk.bz2";
   String fileNameBlock=classLoader.getResource("testdata/"+fileName).getFile();	
   Path file = new Path(fileNameBlock);
   FileInputFormat.setInputPaths(job, file);
   BitcoinBlockFileInputFormat format = new BitcoinBlockFileInputFormat();
   List<InputSplit> splits = format.getSplits(job);
   TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
   assertEquals( 1, splits.size(),"Only one split generated for compressed block");
   	RecordReader<BytesWritable, BitcoinBlock> reader = format.createRecordReader(splits.get(0), context);
assertNotNull( reader,"Format returned  null RecordReader");
reader.initialize(splits.get(0),context);
BytesWritable key = new BytesWritable();	
BitcoinBlock block = new BitcoinBlock();
assertTrue( reader.nextKeyValue(),"Input Split for block version contains at least one block");
block=reader.getCurrentValue();
assertEquals( 936, block.getTransactions().size(),"Compressed block must have at least 936 transactions");
assertEquals( 4, block.getTransactions().get(0).getListOfInputs().get(0).getTxInScript().length,"Compressed block must contain exactly 936 transactions of which the first has one input and script length 4");
assertEquals( 2, block.getTransactions().get(0).getListOfOutputs().size(),"Compressed block must contain exactly 936 transactions of which the first has two outputs");
assertEquals( 25, block.getTransactions().get(0).getListOfOutputs().get(0).getTxOutScript().length,"Compressed block must contain exactly 936 transactions of which the first has two output and the first output script length 25");
   	assertFalse( reader.nextKeyValue(),"No further blocks in compressed block");
reader.close();
 }

Example #11

Source File: BitcoinFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0

5 votes

@Test
 public void readBitcoinBlockInputFormatBzip2Compressed() throws IOException {
   JobConf job = new JobConf(defaultConf);
   CompressionCodec bzip2 = new BZip2Codec();
   ReflectionUtils.setConf(bzip2, job);
   ClassLoader classLoader = getClass().getClassLoader();
   String fileName="version4comp.blk.bz2";
   String fileNameBlock=classLoader.getResource("testdata/"+fileName).getFile();	
   Path file = new Path(fileNameBlock);
   FileInputFormat.setInputPaths(job, file);
   BitcoinBlockFileInputFormat format = new BitcoinBlockFileInputFormat();
   format.configure(job);
   InputSplit[] inputSplits = format.getSplits(job,1);
   assertEquals( 1, inputSplits.length,"Only one split generated for compressed block");
   	RecordReader<BytesWritable, BitcoinBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
assertNotNull( reader,"Format returned  null RecordReader");
BytesWritable key = new BytesWritable();	
BitcoinBlock block = new BitcoinBlock();
assertTrue( reader.next(key,block),"Input Split for block version contains at least one block");
assertEquals( 936, block.getTransactions().size(),"Compressed block must have at least 936 transactions");
assertEquals( 4, block.getTransactions().get(0).getListOfInputs().get(0).getTxInScript().length,"Compressed block must contain exactly 936 transactions of which the first has one input and script length 4");
assertEquals( 2, block.getTransactions().get(0).getListOfOutputs().size(),"Compressed block must contain exactly 936 transactions of which the first has two outputs");
assertEquals( 25, block.getTransactions().get(0).getListOfOutputs().get(0).getTxOutScript().length,"Compressed block must contain exactly 936 transactions of which the first has two output and the first output script length 25");
BytesWritable emptyKey = new BytesWritable();
   	BitcoinBlock emptyBlock = new BitcoinBlock();
   	assertFalse( reader.next(emptyKey,emptyBlock),"No further blocks in compressed block");
reader.close();
 }

Example #12

Source File: TestLineRecordReader.java From big-c with Apache License 2.0

5 votes

@Test
public void testMultipleClose() throws IOException {
  URL testFileUrl = getClass().getClassLoader().
      getResource("recordSpanningMultipleSplits.txt.bz2");
  assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2",
      testFileUrl);
  File testFile = new File(testFileUrl.getFile());
  Path testFilePath = new Path(testFile.getAbsolutePath());
  long testFileSize = testFile.length();
  Configuration conf = new Configuration();
  conf.setInt(org.apache.hadoop.mapreduce.lib.input.
      LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

  // read the data and check whether BOM is skipped
  FileSplit split = new FileSplit(testFilePath, 0, testFileSize, null);
  LineRecordReader reader = new LineRecordReader();
  reader.initialize(split, context);

  //noinspection StatementWithEmptyBody
  while (reader.nextKeyValue()) ;
  reader.close();
  reader.close();

  BZip2Codec codec = new BZip2Codec();
  codec.setConf(conf);
  Set<Decompressor> decompressors = new HashSet<Decompressor>();
  for (int i = 0; i < 10; ++i) {
    decompressors.add(CodecPool.getDecompressor(codec));
  }
  assertEquals(10, decompressors.size());
}

Example #13

Source File: TestLineRecordReader.java From big-c with Apache License 2.0

5 votes

@Test
public void testMultipleClose() throws IOException {
  URL testFileUrl = getClass().getClassLoader().
      getResource("recordSpanningMultipleSplits.txt.bz2");
  assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2",
      testFileUrl);
  File testFile = new File(testFileUrl.getFile());
  Path testFilePath = new Path(testFile.getAbsolutePath());
  long testFileSize = testFile.length();
  Configuration conf = new Configuration();
  conf.setInt(org.apache.hadoop.mapreduce.lib.input.
      LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
  FileSplit split = new FileSplit(testFilePath, 0, testFileSize,
      (String[])null);

  LineRecordReader reader = new LineRecordReader(conf, split);
  LongWritable key = new LongWritable();
  Text value = new Text();
  //noinspection StatementWithEmptyBody
  while (reader.next(key, value)) ;
  reader.close();
  reader.close();

  BZip2Codec codec = new BZip2Codec();
  codec.setConf(conf);
  Set<Decompressor> decompressors = new HashSet<Decompressor>();
  for (int i = 0; i < 10; ++i) {
    decompressors.add(CodecPool.getDecompressor(codec));
  }
  assertEquals(10, decompressors.size());
}

Example #14

Source File: TestLineRecordReader.java From hadoop with Apache License 2.0

5 votes

@Test
public void testMultipleClose() throws IOException {
  URL testFileUrl = getClass().getClassLoader().
      getResource("recordSpanningMultipleSplits.txt.bz2");
  assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2",
      testFileUrl);
  File testFile = new File(testFileUrl.getFile());
  Path testFilePath = new Path(testFile.getAbsolutePath());
  long testFileSize = testFile.length();
  Configuration conf = new Configuration();
  conf.setInt(org.apache.hadoop.mapreduce.lib.input.
      LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

  // read the data and check whether BOM is skipped
  FileSplit split = new FileSplit(testFilePath, 0, testFileSize, null);
  LineRecordReader reader = new LineRecordReader();
  reader.initialize(split, context);

  //noinspection StatementWithEmptyBody
  while (reader.nextKeyValue()) ;
  reader.close();
  reader.close();

  BZip2Codec codec = new BZip2Codec();
  codec.setConf(conf);
  Set<Decompressor> decompressors = new HashSet<Decompressor>();
  for (int i = 0; i < 10; ++i) {
    decompressors.add(CodecPool.getDecompressor(codec));
  }
  assertEquals(10, decompressors.size());
}

Example #15

Source File: TestLineRecordReader.java From hadoop with Apache License 2.0

5 votes

@Test
public void testMultipleClose() throws IOException {
  URL testFileUrl = getClass().getClassLoader().
      getResource("recordSpanningMultipleSplits.txt.bz2");
  assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2",
      testFileUrl);
  File testFile = new File(testFileUrl.getFile());
  Path testFilePath = new Path(testFile.getAbsolutePath());
  long testFileSize = testFile.length();
  Configuration conf = new Configuration();
  conf.setInt(org.apache.hadoop.mapreduce.lib.input.
      LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
  FileSplit split = new FileSplit(testFilePath, 0, testFileSize,
      (String[])null);

  LineRecordReader reader = new LineRecordReader(conf, split);
  LongWritable key = new LongWritable();
  Text value = new Text();
  //noinspection StatementWithEmptyBody
  while (reader.next(key, value)) ;
  reader.close();
  reader.close();

  BZip2Codec codec = new BZip2Codec();
  codec.setConf(conf);
  Set<Decompressor> decompressors = new HashSet<Decompressor>();
  for (int i = 0; i < 10; ++i) {
    decompressors.add(CodecPool.getDecompressor(codec));
  }
  assertEquals(10, decompressors.size());
}

Example #16

Source File: HdfsSink2.java From sylph with Apache License 2.0

5 votes

public HdfsSink2(Hdfs2SinkConfig config)
        throws ClassNotFoundException
{
    this.batchSize = config.getBatchBufferSize();
    this.writerDir = config.getWriteDir();
    switch (config.getZipType().trim().toLowerCase()) {
        case "lzo":
            codecClass = (Class<? extends CompressionCodec>) Class.forName("com.hadoop.compression.lzo.LzopCodec");
            break;
        case "lz4":
            codecClass = Lz4Codec.class;
            break;
        case "snappy":
            codecClass = SnappyCodec.class;
            break;
        case "gzip":
            codecClass = GzipCodec.class;
            break;
        case "bzip2":
            codecClass = BZip2Codec.class;
            break;
        case "default":
            codecClass = DefaultCodec.class;
            break;
        default:
            codecClass = NoneCodec.class;
    }
}

Example #17

Source File: CodecFactory.java From pxf with Apache License 2.0

5 votes

/**
 * Determine whether a given compression codec is safe for multiple concurrent threads
 *
 * @param compCodec     the user-given COMPRESSION_CODEC, may be null
 * @param dataSource    the file that we are accessing
 * @param configuration HDFS config
 * @return true only if it's thread safe
 */
public boolean isCodecThreadSafe(String compCodec, String dataSource, Configuration configuration) {
    Class<? extends CompressionCodec> codecClass = null;
    if (compCodec == null) {
        // check for file extensions indicating bzip2 (Text only)
        // currently doesn't check for bzip2 in .avro files
        codecClass = getCodecClassByPath(configuration, dataSource);
    }
    // make sure bzip2 is not the codec
    return !( "bzip2".equalsIgnoreCase(compCodec) ||
            BZip2Codec.class.getName().equalsIgnoreCase(compCodec) ||
            (codecClass != null && BZip2Codec.class.isAssignableFrom(codecClass))
    );
}

Example #18

Source File: AbstractHadoopProcessor.java From localization_nifi with Apache License 2.0

5 votes

@Override
public String toString() {
    switch (this) {
        case NONE: return "NONE";
        case DEFAULT: return DefaultCodec.class.getName();
        case BZIP: return BZip2Codec.class.getName();
        case GZIP: return GzipCodec.class.getName();
        case LZ4: return Lz4Codec.class.getName();
        case SNAPPY: return SnappyCodec.class.getName();
        case AUTOMATIC: return "Automatically Detected";
    }
    return null;
}

Example #19

Source File: S3SelectLineRecordReader.java From presto with Apache License 2.0

5 votes

protected CompressionType getCompressionType(Path path)
{
    CompressionCodec codec = compressionCodecFactory.getCodec(path);
    if (codec == null) {
        return CompressionType.NONE;
    }
    if (codec instanceof GzipCodec) {
        return CompressionType.GZIP;
    }
    if (codec instanceof BZip2Codec) {
        return CompressionType.BZIP2;
    }
    throw new PrestoException(NOT_SUPPORTED, "Compression extension not supported for S3 Select: " + path);
}

Example #20

Source File: TestCompression.java From aliyun-maxcompute-data-collectors with Apache License 2.0

4 votes

public void testBzip2SequenceFileCompression() throws Exception {
  runSequenceFileCompressionTest(new BZip2Codec(), 4);
}

Example #21

Source File: TestCompression.java From aliyun-maxcompute-data-collectors with Apache License 2.0

4 votes

public void testBzip2TextCompression() throws IOException {
  runTextCompressionTest(new BZip2Codec(), 4);
}

Example #22

Source File: TestCreateHadoopSequenceFile.java From localization_nifi with Apache License 2.0

4 votes

@Test
public void testSequenceFileBzipCompressionCodec() throws UnsupportedEncodingException, IOException {

    controller.setProperty(AbstractHadoopProcessor.COMPRESSION_CODEC, AbstractHadoopProcessor.CompressionType.BZIP.name());
    controller.setProperty(CreateHadoopSequenceFile.COMPRESSION_TYPE, SequenceFile.CompressionType.BLOCK.name());

    File inFile = inFiles[0];
    try (FileInputStream fin = new FileInputStream(inFile) ){
        controller.enqueue(fin);
    }
    controller.run();

    List<MockFlowFile> successSeqFiles = controller.getFlowFilesForRelationship(CreateHadoopSequenceFile.RELATIONSHIP_SUCCESS);
    List<MockFlowFile> failedFlowFiles = controller.getFlowFilesForRelationship(CreateHadoopSequenceFile.RELATIONSHIP_FAILURE);

    assertEquals(0, failedFlowFiles.size());
    assertEquals(1, successSeqFiles.size());

    MockFlowFile ff = successSeqFiles.iterator().next();
    byte[] data = ff.toByteArray();


    final String magicHeader = new String(data, 0, 3, "UTF-8");
    assertEquals("SEQ", magicHeader);
    // Format of header is SEQ followed by the version (1 byte).
    // Then, the length of the Key type (1 byte), then the Key type
    // Then, the length of the Value type(1 byte), then the Value type
    final String keyType = Text.class.getCanonicalName();
    final int valueTypeStart = 3 + 1 + 1 + keyType.length() + 1;
    final int valueTypeLength = data[5 + keyType.length()];
    final String valueType = BytesWritable.class.getCanonicalName();

    assertEquals(valueType.length(), valueTypeLength);
    assertEquals(valueType, new String(data, valueTypeStart, valueType.length(), "UTF-8"));

    final int compressionIndex = 3 + 1 + 1 + keyType.length() + 1 + valueType.length();
    final int blockCompressionIndex = compressionIndex + 1;

    assertEquals(1, data[compressionIndex]);
    assertEquals(1, data[blockCompressionIndex]);

    final int codecTypeSize = data[blockCompressionIndex + 1];
    final int codecTypeStartIndex = blockCompressionIndex + 2;

    assertEquals(BZip2Codec.class.getCanonicalName(), new String(data, codecTypeStartIndex, codecTypeSize, "UTF-8"));
}

Example #23

Source File: RcFileTester.java From presto with Apache License 2.0

4 votes

@Override
Optional<String> getCodecName()
{
    return Optional.of(BZip2Codec.class.getName());
}

Example #24

Source File: TestCreateHadoopSequenceFile.java From nifi with Apache License 2.0

4 votes

@Test
public void testSequenceFileBzipCompressionCodec() throws UnsupportedEncodingException, IOException {

    controller.setProperty(AbstractHadoopProcessor.COMPRESSION_CODEC, CompressionType.BZIP.name());
    controller.setProperty(CreateHadoopSequenceFile.COMPRESSION_TYPE, SequenceFile.CompressionType.BLOCK.name());

    File inFile = inFiles[0];
    try (FileInputStream fin = new FileInputStream(inFile) ){
        controller.enqueue(fin);
    }
    controller.run();

    List<MockFlowFile> successSeqFiles = controller.getFlowFilesForRelationship(CreateHadoopSequenceFile.RELATIONSHIP_SUCCESS);
    List<MockFlowFile> failedFlowFiles = controller.getFlowFilesForRelationship(CreateHadoopSequenceFile.RELATIONSHIP_FAILURE);

    assertEquals(0, failedFlowFiles.size());
    assertEquals(1, successSeqFiles.size());

    MockFlowFile ff = successSeqFiles.iterator().next();
    byte[] data = ff.toByteArray();


    final String magicHeader = new String(data, 0, 3, "UTF-8");
    assertEquals("SEQ", magicHeader);
    // Format of header is SEQ followed by the version (1 byte).
    // Then, the length of the Key type (1 byte), then the Key type
    // Then, the length of the Value type(1 byte), then the Value type
    final String keyType = Text.class.getCanonicalName();
    final int valueTypeStart = 3 + 1 + 1 + keyType.length() + 1;
    final int valueTypeLength = data[5 + keyType.length()];
    final String valueType = BytesWritable.class.getCanonicalName();

    assertEquals(valueType.length(), valueTypeLength);
    assertEquals(valueType, new String(data, valueTypeStart, valueType.length(), "UTF-8"));

    final int compressionIndex = 3 + 1 + 1 + keyType.length() + 1 + valueType.length();
    final int blockCompressionIndex = compressionIndex + 1;

    assertEquals(1, data[compressionIndex]);
    assertEquals(1, data[blockCompressionIndex]);

    final int codecTypeSize = data[blockCompressionIndex + 1];
    final int codecTypeStartIndex = blockCompressionIndex + 2;

    assertEquals(BZip2Codec.class.getCanonicalName(), new String(data, codecTypeStartIndex, codecTypeSize, "UTF-8"));
}