org.apache.hadoop.io.compress.GzipCodec Java Examples

The following examples show how to use org.apache.hadoop.io.compress.GzipCodec. Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
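Before the project-specific examples, a minimal self-contained sketch of the core API may help: GzipCodec compresses through createOutputStream and decompresses through createInputStream. The class name and output path below are hypothetical, and the sketch assumes a local Hadoop FileSystem with the Hadoop client libraries on the classpath.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class GzipCodecRoundTrip {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    // Hypothetical output path; the ".gz" suffix matches codec.getDefaultExtension().
    Path path = new Path("/tmp/gzip-codec-demo.txt.gz");

    // Instantiate the codec via ReflectionUtils so its Configuration gets set,
    // the same way the MapReduce output formats in the examples below do.
    CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);

    // Write gzip-compressed text through the codec's output stream.
    try (Writer out = new OutputStreamWriter(
        codec.createOutputStream(fs.create(path, true)), StandardCharsets.UTF_8)) {
      out.write("hello gzip\n");
    }

    // Read it back through the codec's decompressing input stream.
    try (BufferedReader in = new BufferedReader(new InputStreamReader(
        codec.createInputStream(fs.open(path)), StandardCharsets.UTF_8))) {
      System.out.println(in.readLine()); // prints "hello gzip"
    }
  }
}

The MapReduce-oriented examples below follow the same pattern, but typically obtain the codec class from the job configuration (getOutputCompressorClass) rather than hard-coding GzipCodec.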
Example #1
Source File: BitcoinFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0
@Test
public void readBitcoinRawBlockInputFormatGzipCompressed() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  Job job = Job.getInstance(conf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, conf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.gz";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  assertEquals(1, splits.size(), "Only one split generated for compressed block");
  RecordReader<BytesWritable, BytesWritable> reader = format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  BytesWritable key = new BytesWritable();
  BytesWritable block = new BytesWritable();
  assertTrue(reader.nextKeyValue(), "Input Split for block version contains at least one block");
  block = reader.getCurrentValue();
  assertEquals(998039, block.getLength(), "Compressed block must have a size of 998.039 bytes");
  assertFalse(reader.nextKeyValue(), "No further blocks in compressed block");
  reader.close();
}
 
Example #2
Source File: TextMultiOutputFormat.java    From XLearning with Apache License 2.0
public RecordWriter<K, V> getRecordWriter(FileSystem ignored,
                                          JobConf job,
                                          String name,
                                          Progressable progress)
    throws IOException {
  boolean ignoreSeparatorOnNull = job.getBoolean("mapred.textoutputformat.ignore.separator", false);
  String keyValueSeparator = job.get("mapred.textoutputformat.separator", "\t");
  splitSize = job.getLong(MR_REDUCE_MAX_FILE_PER_FILE, SPLIT_SIZE);
  jobConf = job;
  fileName = name;
  jobProgress = progress;
  Class<? extends CompressionCodec> codecClass =
      getOutputCompressorClass(job, GzipCodec.class);
  // create the named codec
  codec = ReflectionUtils.newInstance(codecClass, job);
  FSDataOutputStream fileOut = createFile();

  return new MultiSplitRecordWriter<K, V>(new NewDataOutputStream(codec.createOutputStream(fileOut)),
      keyValueSeparator, ignoreSeparatorOnNull);

}
 
Example #3
Source File: FileReaderWriterFactoryTest.java    From secor with Apache License 2.0
public void testSequenceFileReader() throws Exception {
    setupSequenceFileReaderConfig();
    mockSequenceFileWriter(false);
    ReflectionUtil.createFileReader(mConfig.getFileReaderWriterFactory(), mLogFilePath, null, mConfig);

    // Verify that the method has been called exactly once (the default).
    // PowerMockito.verifyStatic(FileSystem.class);
    // FileSystem.get(Mockito.any(URI.class), Mockito.any(Configuration.class));

    mockSequenceFileWriter(true);
    ReflectionUtil.createFileWriter(mConfig.getFileReaderWriterFactory(), mLogFilePathGz, new GzipCodec(),
            mConfig);

    // Verify that the method has been called exactly once (the default).
    // PowerMockito.verifyStatic(FileSystem.class);
    // FileSystem.get(Mockito.any(URI.class), Mockito.any(Configuration.class));
}
 
Example #4
Source File: TestDDLBuilder.java    From incubator-tajo with Apache License 2.0
@Test
public void testBuildDDL() throws Exception {
  Schema schema = new Schema();
  schema.addColumn("name", TajoDataTypes.Type.BLOB);
  schema.addColumn("addr", TajoDataTypes.Type.TEXT);
  TableMeta meta = CatalogUtil.newTableMeta(CatalogProtos.StoreType.CSV);
  meta.putOption(CatalogConstants.CSVFILE_DELIMITER, CatalogConstants.CSVFILE_DELIMITER_DEFAULT);
  meta.putOption(CatalogConstants.COMPRESSION_CODEC, GzipCodec.class.getName());

  TableDesc desc = new TableDesc("table1", schema, meta, new Path("/table1"));

  Schema expressionSchema = new Schema();
  expressionSchema.addColumn("key", TajoDataTypes.Type.INT4);
  expressionSchema.addColumn("key2", TajoDataTypes.Type.TEXT);
  PartitionMethodDesc partitionMethod = new PartitionMethodDesc(
      "table1",
      CatalogProtos.PartitionType.COLUMN,
      "key,key2",
      expressionSchema);
  desc.setPartitionMethod(partitionMethod);

  assertEquals(FileUtil.readTextFile(new File("src/test/resources/results/testBuildDDL.result")),
      DDLBuilder.buildDDL(desc));
}
 
Example #5
Source File: TestQseqInputFormat.java    From Hadoop-BAM with MIT License
@Test
public void testGzCompressedInput() throws IOException
{
	// write gzip-compressed data
	GzipCodec codec = new GzipCodec();
	PrintWriter qseqOut = new PrintWriter( new BufferedOutputStream( codec.createOutputStream( new FileOutputStream(tempGz) ) ) );
	qseqOut.write(twoQseq);
	qseqOut.close();

	// now try to read it
	split = new FileSplit(new Path(tempGz.toURI().toString()), 0, twoQseq.length(), null);
	QseqRecordReader reader = new QseqRecordReader(conf, split);

	boolean retval = reader.next(key, fragment);
	assertTrue(retval);
	assertEquals("ERR020229:10880:1:1:1373:2042:1", key.toString());
	assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString());

	retval = reader.next(key, fragment);
	assertTrue(retval);
	assertEquals("ERR020229:10883:1:1:1796:2044:2", key.toString());
	assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString());
}
 
Example #6
Source File: FileRegistryTest.java    From secor with Apache License 2.0
private void createCompressedWriter() throws Exception {
    PowerMockito.mockStatic(FileUtil.class);

    PowerMockito.mockStatic(ReflectionUtil.class);
    FileWriter writer = Mockito.mock(FileWriter.class);
    Mockito.when(
            ReflectionUtil.createFileWriter(
                    Mockito.any(String.class),
                    Mockito.any(LogFilePath.class),
                    Mockito.any(CompressionCodec.class),
                    Mockito.any(SecorConfig.class)
            ))
            .thenReturn(writer);

    Mockito.when(writer.getLength()).thenReturn(123L);

    FileWriter createdWriter = mRegistry.getOrCreateWriter(
            mLogFilePathGz, new GzipCodec());
    assertTrue(createdWriter == writer);
}
 
Example #7
Source File: SequenceFile.java    From hadoop-gpu with Apache License 2.0
/**
 * Construct the preferred type of 'raw' SequenceFile Writer.
 * @param conf The configuration.
 * @param out The stream on top which the writer is to be constructed.
 * @param keyClass The 'key' type.
 * @param valClass The 'value' type.
 * @param compressionType The compression type.
 * @param codec The compression codec.
 * @param metadata The metadata of the file.
 * @return Returns the handle to the constructed SequenceFile Writer.
 * @throws IOException
 */
public static Writer
  createWriter(Configuration conf, FSDataOutputStream out, 
               Class keyClass, Class valClass, CompressionType compressionType,
               CompressionCodec codec, Metadata metadata)
  throws IOException {
  if ((codec instanceof GzipCodec) && 
      !NativeCodeLoader.isNativeCodeLoaded() && 
      !ZlibFactory.isNativeZlibLoaded(conf)) {
    throw new IllegalArgumentException("SequenceFile doesn't work with " +
                                       "GzipCodec without native-hadoop code!");
  }

  Writer writer = null;

  if (compressionType == CompressionType.NONE) {
    writer = new Writer(conf, out, keyClass, valClass, metadata);
  } else if (compressionType == CompressionType.RECORD) {
    writer = new RecordCompressWriter(conf, out, keyClass, valClass, codec, metadata);
  } else if (compressionType == CompressionType.BLOCK){
    writer = new BlockCompressWriter(conf, out, keyClass, valClass, codec, metadata);
  }
  
  return writer;
}
 
Example #8
Source File: TestIFile.java    From hadoop with Apache License 2.0
@Test
/**
 * Create an IFile.Writer using GzipCodec since this code does not
 * have a compressor when run via the tests (ie no native libraries).
 */
public void testIFileWriterWithCodec() throws Exception {
  Configuration conf = new Configuration();
  FileSystem localFs = FileSystem.getLocal(conf);
  FileSystem rfs = ((LocalFileSystem)localFs).getRaw();
  Path path = new Path(new Path("build/test.ifile"), "data");
  DefaultCodec codec = new GzipCodec();
  codec.setConf(conf);
  IFile.Writer<Text, Text> writer =
    new IFile.Writer<Text, Text>(conf, rfs.create(path), Text.class, Text.class,
                                 codec, null);
  writer.close();
}
 
Example #9
Source File: TestIFile.java    From hadoop with Apache License 2.0
@Test
/** Same as above but create a reader. */
public void testIFileReaderWithCodec() throws Exception {
  Configuration conf = new Configuration();
  FileSystem localFs = FileSystem.getLocal(conf);
  FileSystem rfs = ((LocalFileSystem)localFs).getRaw();
  Path path = new Path(new Path("build/test.ifile"), "data");
  DefaultCodec codec = new GzipCodec();
  codec.setConf(conf);
  FSDataOutputStream out = rfs.create(path);
  IFile.Writer<Text, Text> writer =
      new IFile.Writer<Text, Text>(conf, out, Text.class, Text.class,
                                   codec, null);
  writer.close();
  FSDataInputStream in = rfs.open(path);
  IFile.Reader<Text, Text> reader =
    new IFile.Reader<Text, Text>(conf, in, rfs.getFileStatus(path).getLen(),
        codec, null);
  reader.close();
  
  // test check sum 
  byte[] ab= new byte[100];
  int readed= reader.checksumIn.readWithChecksum(ab, 0, ab.length);
  assertEquals( readed,reader.checksumIn.getChecksum().length);
  
}
 
Example #10
Source File: TestCellBlockBuilder.java    From hbase with Apache License 2.0
/**
 * For running a few tests of methods herein.
 *
 * @param args the arguments to use for the timer test
 * @throws IOException if creating the build fails
 */
public static void main(String[] args) throws IOException {
  int count = 1024;
  int size = 10240;
  for (String arg : args) {
    if (arg.startsWith(COUNT)) {
      count = Integer.parseInt(arg.replace(COUNT, ""));
    } else if (arg.startsWith(SIZE)) {
      size = Integer.parseInt(arg.replace(SIZE, ""));
    } else {
      usage(1);
    }
  }
  CellBlockBuilder builder = new CellBlockBuilder(HBaseConfiguration.create());
  timerTests(builder, count, size, new KeyValueCodec(), null);
  timerTests(builder, count, size, new KeyValueCodec(), new DefaultCodec());
  timerTests(builder, count, size, new KeyValueCodec(), new GzipCodec());
}
 
Example #11
Source File: CompressionEmulationUtil.java    From hadoop with Apache License 2.0
/**
 * Returns a {@link OutputStream} for a file that might need 
 * compression.
 */
static OutputStream getPossiblyCompressedOutputStream(Path file, 
                                                      Configuration conf)
throws IOException {
  FileSystem fs = file.getFileSystem(conf);
  JobConf jConf = new JobConf(conf);
  if (org.apache.hadoop.mapred.FileOutputFormat.getCompressOutput(jConf)) {
    // get the codec class
    Class<? extends CompressionCodec> codecClass =
      org.apache.hadoop.mapred.FileOutputFormat
                              .getOutputCompressorClass(jConf, 
                                                        GzipCodec.class);
    // get the codec implementation
    CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);

    // add the appropriate extension
    file = file.suffix(codec.getDefaultExtension());

    if (isCompressionEmulationEnabled(conf)) {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new DataOutputStream(codec.createOutputStream(fileOut));
    }
  }
  return fs.create(file, false);
}
 
Example #12
Source File: SequenceFile.java    From hadoop-gpu with Apache License 2.0
/**
 * Construct the preferred type of 'raw' SequenceFile Writer.
 * @param out The stream on top which the writer is to be constructed.
 * @param keyClass The 'key' type.
 * @param valClass The 'value' type.
 * @param compress Compress data?
 * @param blockCompress Compress blocks?
 * @param metadata The metadata of the file.
 * @return Returns the handle to the constructed SequenceFile Writer.
 * @throws IOException
 */
private static Writer
  createWriter(Configuration conf, FSDataOutputStream out, 
               Class keyClass, Class valClass, boolean compress, boolean blockCompress,
               CompressionCodec codec, Metadata metadata)
  throws IOException {
  if (codec != null && (codec instanceof GzipCodec) && 
      !NativeCodeLoader.isNativeCodeLoaded() && 
      !ZlibFactory.isNativeZlibLoaded(conf)) {
    throw new IllegalArgumentException("SequenceFile doesn't work with " +
                                       "GzipCodec without native-hadoop code!");
  }

  Writer writer = null;

  if (!compress) {
    writer = new Writer(conf, out, keyClass, valClass, metadata);
  } else if (compress && !blockCompress) {
    writer = new RecordCompressWriter(conf, out, keyClass, valClass, codec, metadata);
  } else {
    writer = new BlockCompressWriter(conf, out, keyClass, valClass, codec, metadata);
  }
  
  return writer;
}
 
Example #13
Source File: JSONFileOutputFormat.java    From ojai with Apache License 2.0
@Override
public RecordWriter<LongWritable, Document> getRecordWriter(
    TaskAttemptContext job) throws IOException, InterruptedException {

  Configuration conf = job.getConfiguration();
  boolean isCompressed = getCompressOutput(job);
  CompressionCodec codec = null;
  String extension = "";
  if (isCompressed) {
    Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(
        job, GzipCodec.class);
    codec = ReflectionUtils.newInstance(codecClass, conf);
    extension = codec.getDefaultExtension();
  }
  Path path = getDefaultWorkFile(job, extension);
  FileSystem fs = path.getFileSystem(conf);
  FSDataOutputStream out = fs.create(path, false);
  if (!isCompressed) {
    return new JSONFileOutputRecordWriter(out);
  } else {
    return new JSONFileOutputRecordWriter(new DataOutputStream(
        codec.createOutputStream(out)));
  }
}
 
Example #14
Source File: TestIFile.java    From big-c with Apache License 2.0
@Test
/**
 * Create an IFile.Writer using GzipCodec since this code does not
 * have a compressor when run via the tests (ie no native libraries).
 */
public void testIFileWriterWithCodec() throws Exception {
  Configuration conf = new Configuration();
  FileSystem localFs = FileSystem.getLocal(conf);
  FileSystem rfs = ((LocalFileSystem)localFs).getRaw();
  Path path = new Path(new Path("build/test.ifile"), "data");
  DefaultCodec codec = new GzipCodec();
  codec.setConf(conf);
  IFile.Writer<Text, Text> writer =
    new IFile.Writer<Text, Text>(conf, rfs.create(path), Text.class, Text.class,
                                 codec, null);
  writer.close();
}
 
Example #15
Source File: TestIFile.java    From big-c with Apache License 2.0
@Test
/** Same as above but create a reader. */
public void testIFileReaderWithCodec() throws Exception {
  Configuration conf = new Configuration();
  FileSystem localFs = FileSystem.getLocal(conf);
  FileSystem rfs = ((LocalFileSystem)localFs).getRaw();
  Path path = new Path(new Path("build/test.ifile"), "data");
  DefaultCodec codec = new GzipCodec();
  codec.setConf(conf);
  FSDataOutputStream out = rfs.create(path);
  IFile.Writer<Text, Text> writer =
      new IFile.Writer<Text, Text>(conf, out, Text.class, Text.class,
                                   codec, null);
  writer.close();
  FSDataInputStream in = rfs.open(path);
  IFile.Reader<Text, Text> reader =
    new IFile.Reader<Text, Text>(conf, in, rfs.getFileStatus(path).getLen(),
        codec, null);
  reader.close();
  
  // test check sum 
  byte[] ab= new byte[100];
  int readed= reader.checksumIn.readWithChecksum(ab, 0, ab.length);
  assertEquals( readed,reader.checksumIn.getChecksum().length);
  
}
 
Example #16
Source File: CompressionEmulationUtil.java    From big-c with Apache License 2.0
/**
 * Returns a {@link OutputStream} for a file that might need 
 * compression.
 */
static OutputStream getPossiblyCompressedOutputStream(Path file, 
                                                      Configuration conf)
throws IOException {
  FileSystem fs = file.getFileSystem(conf);
  JobConf jConf = new JobConf(conf);
  if (org.apache.hadoop.mapred.FileOutputFormat.getCompressOutput(jConf)) {
    // get the codec class
    Class<? extends CompressionCodec> codecClass =
      org.apache.hadoop.mapred.FileOutputFormat
                              .getOutputCompressorClass(jConf, 
                                                        GzipCodec.class);
    // get the codec implementation
    CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);

    // add the appropriate extension
    file = file.suffix(codec.getDefaultExtension());

    if (isCompressionEmulationEnabled(conf)) {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new DataOutputStream(codec.createOutputStream(fileOut));
    }
  }
  return fs.create(file, false);
}
 
Example #17
Source File: BitcoinFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0
@Test
public void readBitcoinTransactionInputFormatGzipCompressed() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  Job job = Job.getInstance(conf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, conf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.gz";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinTransactionFileInputFormat format = new BitcoinTransactionFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  assertEquals(1, splits.size(), "Only one split generated for compressed block");
  RecordReader<BytesWritable, BitcoinTransaction> reader = format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  int transactCount = 0;
  while (reader.nextKeyValue()) {
    transactCount++;
  }
  assertEquals(936, transactCount, "Compressed block must have at least 936 transactions");
  reader.close();
}
 
Example #18
Source File: BitcoinFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0
@Test
public void readBitcoinRawBlockInputFormatGzipCompressed() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.gz";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for compressed block");
  RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  BytesWritable block = new BytesWritable();
  assertTrue(reader.next(key, block), "Input Split for block version contains at least one block");
  assertEquals(998039, block.getLength(), "Compressed block must have a size of 998.039 bytes");
  BytesWritable emptyKey = new BytesWritable();
  BytesWritable emptyBlock = new BytesWritable();
  assertFalse(reader.next(emptyKey, emptyBlock), "No further blocks in compressed block");
  reader.close();
}
 
Example #19
Source File: BitcoinFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0
@Test
public void readBitcoinTransactionInputFormatGzipCompressed() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.gz";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinTransactionFileInputFormat format = new BitcoinTransactionFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for compressed block");
  RecordReader<BytesWritable, BitcoinTransaction> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  BitcoinTransaction transaction = new BitcoinTransaction();
  int transactCount = 0;
  while (reader.next(key, transaction)) {
    transactCount++;
  }
  assertEquals(936, transactCount, "Compressed block must have at least 936 transactions");
  reader.close();
}
 
Example #20
Source File: ConfigurationHelper.java    From dkpro-c4corpus with Apache License 2.0
/**
 * Job configurator
 *
 * @param job                      job instance
 * @param jarByClass               class of the jar
 * @param mapperClass              mapper
 * @param reducerClass             reducer
 * @param commaSeparatedInputFiles input paths
 * @param outputPath               output
 * @throws IOException I/O exception
 */
public static void configureJob(Job job, Class<?> jarByClass,
        Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass,
        String commaSeparatedInputFiles, String outputPath)
        throws IOException
{
    job.setJarByClass(jarByClass);
    job.setJobName(jarByClass.getName());

    // mapper
    job.setMapperClass(mapperClass);

    // reducer
    job.setReducerClass(reducerClass);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    // prevent producing empty files
    LazyOutputFormat.setOutputFormatClass(job, WARCOutputFormat.class);

    // intermediate data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // output data
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
}
 
Example #21
Source File: FileRegistryTest.java    From secor with Apache License 2.0
public void testGetOrCreateWriterCompressed() throws Exception {
    createCompressedWriter();

    mRegistry.getOrCreateWriter(mLogFilePathGz, new GzipCodec());

    // Verify that the method has been called exactly once (the default).
    PowerMockito.verifyStatic(FileUtil.class);
    FileUtil.delete(PATH_GZ);
    PowerMockito.verifyStatic(FileUtil.class);
    FileUtil.delete(CRC_PATH);

    PowerMockito.verifyStatic(ReflectionUtil.class);
    ReflectionUtil.createFileWriter(Mockito.any(String.class),
            Mockito.any(LogFilePath.class),
            Mockito.any(CompressionCodec.class),
            Mockito.any(SecorConfig.class)
    );

    TopicPartition topicPartition = new TopicPartition("some_topic", 0);
    Collection<TopicPartition> topicPartitions = mRegistry
            .getTopicPartitions();
    assertEquals(1, topicPartitions.size());
    assertTrue(topicPartitions.contains(topicPartition));

    Collection<LogFilePath> logFilePaths = mRegistry
            .getPaths(topicPartition);
    assertEquals(1, logFilePaths.size());
    assertTrue(logFilePaths.contains(mLogFilePath));
}
 
Example #22
Source File: SequenceFile.java    From hadoop-gpu with Apache License 2.0
/**
   * Construct the preferred type of 'raw' SequenceFile Writer.
   * @param fs The configured filesystem. 
   * @param conf The configuration.
   * @param file The name of the file. 
   * @param keyClass The 'key' type.
   * @param valClass The 'value' type.
   * @param compress Compress data?
   * @param blockCompress Compress blocks?
   * @param codec The compression codec.
   * @param progress
   * @param metadata The metadata of the file.
   * @return Returns the handle to the constructed SequenceFile Writer.
   * @throws IOException
   */
  private static Writer
  createWriter(FileSystem fs, Configuration conf, Path file, 
               Class keyClass, Class valClass, 
               boolean compress, boolean blockCompress,
               CompressionCodec codec, Progressable progress, Metadata metadata)
  throws IOException {
  if (codec != null && (codec instanceof GzipCodec) && 
      !NativeCodeLoader.isNativeCodeLoaded() && 
      !ZlibFactory.isNativeZlibLoaded(conf)) {
    throw new IllegalArgumentException("SequenceFile doesn't work with " +
                                       "GzipCodec without native-hadoop code!");
  }

  Writer writer = null;

  if (!compress) {
    writer = new Writer(fs, conf, file, keyClass, valClass, progress, metadata);
  } else if (compress && !blockCompress) {
    writer = new RecordCompressWriter(fs, conf, file, keyClass, valClass, 
                                      codec, progress, metadata);
  } else {
    writer = new BlockCompressWriter(fs, conf, file, keyClass, valClass, 
                                     codec, progress, metadata);
  }
  
  return writer;
}
 
Example #23
Source File: HiveRCOutputFormat.java    From spork with Apache License 2.0
protected RCFile.Writer createRCFileWriter(TaskAttemptContext job,
                                           Text columnMetadata)
                                           throws IOException {
  Configuration conf = job.getConfiguration();

  // override compression codec if set.
  String codecOverride = conf.get(COMPRESSION_CODEC_CONF);
  if (codecOverride != null) {
    conf.setBoolean(MRConfiguration.OUTPUT_COMPRESS, true);
    conf.set(MRConfiguration.OUTPUT_COMPRESSION_CODEC, codecOverride);
  }

  CompressionCodec codec = null;
  if (getCompressOutput(job)) {
    Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
    codec = ReflectionUtils.newInstance(codecClass, conf);
  }

  Metadata metadata = null;

  String ext = conf.get(EXTENSION_OVERRIDE_CONF, DEFAULT_EXTENSION);
  Path file = getDefaultWorkFile(job, ext.equalsIgnoreCase("none") ? null : ext);

  LOG.info("writing to rcfile " + file.toString());

  return new RCFile.Writer(file.getFileSystem(conf), conf, file, job, metadata, codec);
}
 
Example #24
Source File: ExportManifestOutputFormat.java    From emr-dynamodb-connector with Apache License 2.0
@Override
public RecordWriter<K, Text> getRecordWriter(FileSystem ignored, JobConf job, String name,
    Progressable progress) throws IOException {
  String extension = "";
  Path file = FileOutputFormat.getTaskOutputPath(job, MANIFEST_FILENAME);
  FileSystem fs = file.getFileSystem(job);
  FSDataOutputStream fileOut = fs.create(file, progress);
  if (getCompressOutput(job)) {
    Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
    CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
    extension = codec.getDefaultExtension();
  }
  return new ExportManifestRecordWriter<>(fileOut, FileOutputFormat.getOutputPath(job),
      extension);
}
 
Example #25
Source File: EmoFileSystem.java    From emodb with Apache License 2.0
private EmoSplitInputStream(String table, String split)
        throws IOException {
    if (isEmptySplit(split)) {
        _rows = Iterators.emptyIterator();
    } else {
        // Get the DataStore and begin streaming the split's rows.
        CloseableDataStore dataStore = HadoopDataStoreManager.getInstance().getDataStore(_uri, _apiKey, _metricRegistry);
        _closer.register(dataStore);

        _rows = DataStoreStreaming.getSplit(dataStore, table, split, false, ReadConsistency.STRONG).iterator();
    }

    _buffer.clear();
    _buffer.limit(0);
    GzipCodec gzipCodec = new GzipCodec();
    gzipCodec.setConf(new Configuration());

    // Set up the pipes
    PipedOutputStream pipeRawToGzip = new PipedOutputStream();
    _gzipIn = new PipedInputStream(pipeRawToGzip, 10 * 1024 * 1024);
    _rawOut = gzipCodec.createOutputStream(pipeRawToGzip);
    _closer.register(_gzipIn);
    _closer.register(pipeRawToGzip);

    // Start the asynchronous buffering thread
    _bufferThread = new Thread(new Runnable() {
        @Override
        public void run() {
            streamAndCompressInput();
        }
    });
    _bufferThread.start();
}
 
Example #26
Source File: FileReaderWriterFactoryTest.java    From secor with Apache License 2.0
private void mockDelimitedTextFileWriter(boolean isCompressed) throws Exception {
    PowerMockito.mockStatic(FileSystem.class);
    FileSystem fs = Mockito.mock(FileSystem.class);
    Mockito.when(
            FileSystem.get(Mockito.any(URI.class),
                    Mockito.any(Configuration.class))).thenReturn(fs);

    Path fsPath = (!isCompressed) ? new Path(PATH) : new Path(PATH_GZ);

    GzipCodec codec = PowerMockito.mock(GzipCodec.class);
    PowerMockito.whenNew(GzipCodec.class).withNoArguments()
            .thenReturn(codec);

    FSDataInputStream fileInputStream = Mockito
            .mock(FSDataInputStream.class);
    FSDataOutputStream fileOutputStream = Mockito
            .mock(FSDataOutputStream.class);

    Mockito.when(fs.open(fsPath)).thenReturn(fileInputStream);
    Mockito.when(fs.create(fsPath)).thenReturn(fileOutputStream);

    CompressionInputStream inputStream = Mockito
            .mock(CompressionInputStream.class);
    CompressionOutputStream outputStream = Mockito
            .mock(CompressionOutputStream.class);
    Mockito.when(codec.createInputStream(Mockito.any(InputStream.class)))
            .thenReturn(inputStream);

    Mockito.when(codec.createOutputStream(Mockito.any(OutputStream.class)))
            .thenReturn(outputStream);
}
 
Example #27
Source File: SequenceFile.java    From RDFS with Apache License 2.0
/**
 * Construct the preferred type of SequenceFile Writer.
 * @param fs The configured filesystem.
 * @param conf The configuration.
 * @param name The name of the file.
 * @param keyClass The 'key' type.
 * @param valClass The 'value' type.
 * @param bufferSize buffer size for the underlying output stream.
 * @param replication replication factor for the file.
 * @param blockSize block size for the file.
 * @param compressionType The compression type.
 * @param codec The compression codec.
 * @param progress The Progressable object to track progress.
 * @param metadata The metadata of the file.
 * @return Returns the handle to the constructed SequenceFile Writer.
 * @throws IOException
 */
public static Writer
  createWriter(FileSystem fs, Configuration conf, Path name,
               Class keyClass, Class valClass, int bufferSize,
               short replication, long blockSize,
               CompressionType compressionType, CompressionCodec codec,
               Progressable progress, Metadata metadata) throws IOException {
  if ((codec instanceof GzipCodec) &&
      !NativeCodeLoader.isNativeCodeLoaded() &&
      !ZlibFactory.isNativeZlibLoaded(conf)) {
    throw new IllegalArgumentException("SequenceFile doesn't work with " +
                                       "GzipCodec without native-hadoop code!");
  }

  Writer writer = null;

  if (compressionType == CompressionType.NONE) {
    writer = new Writer(fs, conf, name, keyClass, valClass,
                        bufferSize, replication, blockSize,
                        progress, metadata);
  } else if (compressionType == CompressionType.RECORD) {
    writer = new RecordCompressWriter(fs, conf, name, keyClass, valClass,
                                      bufferSize, replication, blockSize,
                                      codec, progress, metadata);
  } else if (compressionType == CompressionType.BLOCK){
    writer = new BlockCompressWriter(fs, conf, name, keyClass, valClass,
                                     bufferSize, replication, blockSize,
                                     codec, progress, metadata);
  }

  return writer;
}
 
Example #28
Source File: TestSimpleSeekableFormatStreams.java    From RDFS with Apache License 2.0
/**
 * Test the seekForward function with truncated files.
 */
public void testTruncatedWriteAndForwardRead() throws Exception {
  testTruncatedWriteAndForwardRead(null, false);
  testTruncatedWriteAndForwardRead(GzipCodec.class, false);
  // useFileSystem = true, for testing seek using Seekable
  testTruncatedWriteAndForwardRead(null, true);
  testTruncatedWriteAndForwardRead(GzipCodec.class, true);
}
 
Example #29
Source File: PigStorage.java    From spork with Apache License 2.0
private void setCompression(Path path, Job job) {
    String location = path.getName();
    if (location.endsWith(".bz2") || location.endsWith(".bz")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (location.endsWith(".gz")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    } else {
        FileOutputFormat.setCompressOutput(job, false);
    }
}
 
Example #30
Source File: JsonORCFileReaderWriterFactory.java    From secor with Apache License 2.0
/**
 * Used for returning the compression kind used in ORC
 *
 * @param codec
 * @return
 */
private CompressionKind resolveCompression(CompressionCodec codec) {
    if (codec instanceof Lz4Codec)
        return CompressionKind.LZ4;
    else if (codec instanceof SnappyCodec)
        return CompressionKind.SNAPPY;
    // although GZip and ZLIB are not the same thing,
    // there is no better named codec for this case,
    // use hadoop Gzip codec to enable ORC ZLIB compression
    else if (codec instanceof GzipCodec)
        return CompressionKind.ZLIB;
    else
        return CompressionKind.NONE;
}