org.apache.hadoop.io.compress.CompressionCodec Java Examples

The following examples show how to use org.apache.hadoop.io.compress.CompressionCodec. Each example is taken from an open source project; the source file and license are noted above each snippet.
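
Before the project examples, here is a minimal, self-contained sketch of the typical CompressionCodec round trip. This is an illustration rather than code from any project below: the /tmp path is hypothetical, and it assumes the standard CompressionCodecFactory lookup, which resolves a codec from a file's extension.

import java.io.InputStream;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecRoundTrip {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Resolve the codec from the file extension (".gz" -> GzipCodec).
    Path compressed = new Path("/tmp/example.txt.gz"); // hypothetical path
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(compressed);

    // Write through the codec: bytes are compressed on the way out.
    try (OutputStream out = codec.createOutputStream(fs.create(compressed))) {
      out.write("hello, codec\n".getBytes("UTF-8"));
    }

    // Read back: the codec decompresses transparently.
    try (InputStream in = codec.createInputStream(fs.open(compressed))) {
      IOUtils.copyBytes(in, System.out, conf, false);
    }
  }
}
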
Example #1
Source File: Compression.java    From big-c with Apache License 2.0
public Compressor getCompressor() throws IOException {
  CompressionCodec codec = getCodec();
  if (codec != null) {
    Compressor compressor = CodecPool.getCompressor(codec);
    if (compressor != null) {
      if (compressor.finished()) {
        // Somebody returned the compressor to the CodecPool but is still
        // using it.
        LOG.warn("Compressor obtained from CodecPool already finished()");
      } else {
        if(LOG.isDebugEnabled()) {
          LOG.debug("Got a compressor: " + compressor.hashCode());
        }
      }
      // The following statement is necessary to work around bugs in Hadoop
      // 0.18 where a compressor could be referenced after being returned to
      // the codec pool.
      compressor.reset();
    }
    return compressor;
  }
  return null;
}
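
Example #1 only borrows a compressor; the counterpart is returning it to the pool when done. Below is a minimal sketch of the full borrow/use/return pattern. It is an illustration (not part of Compression.java) assuming the standard CodecPool and CompressionOutputStream APIs.

import java.io.IOException;
import java.io.OutputStream;

import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.Compressor;

public class PooledCompression {
  // Compress 'data' onto 'out', always handing the pooled compressor back.
  static void writeCompressed(CompressionCodec codec, OutputStream out, byte[] data)
      throws IOException {
    Compressor compressor = CodecPool.getCompressor(codec);
    try {
      CompressionOutputStream cOut = codec.createOutputStream(out, compressor);
      cOut.write(data);
      cOut.finish(); // flush remaining compressed bytes without closing 'out'
    } finally {
      CodecPool.returnCompressor(compressor);
    }
  }
}
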
 
Example #2
Source File: ConfigUtils.java    From incubator-tez with Apache License 2.0
public static Class<? extends CompressionCodec> getIntermediateOutputCompressorClass(
    Configuration conf, Class<DefaultCodec> defaultValue) {
  Class<? extends CompressionCodec> codecClass = defaultValue;
  String name = conf
      .get(TezJobConfig.TEZ_RUNTIME_COMPRESS_CODEC);
  if (name != null) {
    try {
      codecClass = conf.getClassByName(name).asSubclass(
          CompressionCodec.class);
    } catch (ClassNotFoundException e) {
      throw new IllegalArgumentException("Compression codec " + name
          + " was not found.", e);
    }
  }
  return codecClass;
}
 
Example #3
Source File: TestDFSIO.java    From big-c with Apache License 2.0
@Override // Mapper
public void configure(JobConf conf) {
  super.configure(conf);

  // grab compression
  String compression = getConf().get("test.io.compression.class", null);
  Class<? extends CompressionCodec> codec;

  // try to initialize codec
  try {
    codec = (compression == null) ? null : 
      Class.forName(compression).asSubclass(CompressionCodec.class);
  } catch(Exception e) {
    throw new RuntimeException("Compression codec not found: " + compression, e);
  }

  if(codec != null) {
    compressionCodec = (CompressionCodec)
        ReflectionUtils.newInstance(codec, getConf());
  }
}
 
Example #4
Source File: FileOutputFormat.java    From RDFS with Apache License 2.0
/**
 * Get the {@link CompressionCodec} for compressing the job outputs.
 * @param job the {@link Job} to look in
 * @param defaultValue the {@link CompressionCodec} to return if not set
 * @return the {@link CompressionCodec} to be used to compress the 
 *         job outputs
 * @throws IllegalArgumentException if the class was specified, but not found
 */
public static Class<? extends CompressionCodec> 
getOutputCompressorClass(JobContext job, 
                       Class<? extends CompressionCodec> defaultValue) {
  Class<? extends CompressionCodec> codecClass = defaultValue;
  Configuration conf = job.getConfiguration();
  String name = conf.get("mapred.output.compression.codec");
  if (name != null) {
    try {
      codecClass = conf.getClassByName(name).asSubclass(CompressionCodec.class);
    } catch (ClassNotFoundException e) {
      throw new IllegalArgumentException("Compression codec " + name + 
                                         " was not found.", e);
    }
  }
  return codecClass;
}
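
The getter above has a setter counterpart. The following sketch shows how a job would opt into output compression; it is an illustration using the org.apache.hadoop.mapreduce API rather than code from RDFS (older releases store the choice under the mapred.output.compression.codec key that the getter reads).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ConfigureOutputCompression {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration());
    // Turn on output compression and pick the codec that
    // getOutputCompressorClass() will later resolve.
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  }
}
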
 
Example #5
Source File: BitcoinFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0
@Test
public void readBitcoinRawBlockInputFormatGzipCompressed() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  Job job = Job.getInstance(conf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, conf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.gz";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  assertEquals(1, splits.size(), "Only one split generated for compressed block");
  RecordReader<BytesWritable, BytesWritable> reader = format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  BytesWritable key = new BytesWritable();
  BytesWritable block = new BytesWritable();
  assertTrue(reader.nextKeyValue(), "Input Split for block version contains at least one block");
  block = reader.getCurrentValue();
  assertEquals(998039, block.getLength(), "Compressed block must have a size of 998,039 bytes");
  assertFalse(reader.nextKeyValue(), "No further blocks in compressed block");
  reader.close();
}
 
Example #6
Source File: TezMerger.java    From incubator-tez with Apache License 2.0
public static <K extends Object, V extends Object>
  TezRawKeyValueIterator merge(Configuration conf, FileSystem fs,
                          Class keyClass, Class valueClass,
                          CompressionCodec codec,
                          List<Segment> segments,
                          int mergeFactor, int inMemSegments, Path tmpDir,
                          RawComparator comparator, Progressable reporter,
                          boolean sortSegments,
                          TezCounter readsCounter,
                          TezCounter writesCounter,
                          TezCounter bytesReadCounter,
                          Progress mergePhase)
    throws IOException {
  return new MergeQueue(conf, fs, segments, comparator, reporter,
                         sortSegments, codec, false).merge(keyClass, valueClass,
                                             mergeFactor, inMemSegments,
                                             tmpDir,
                                             readsCounter, writesCounter,
                                             bytesReadCounter,
                                             mergePhase);
}
 
Example #7
Source File: Merger.java    From hadoop with Apache License 2.0
public static <K extends Object, V extends Object>
RawKeyValueIterator merge(Configuration conf, FileSystem fs,
                          Class<K> keyClass, Class<V> valueClass, 
                          CompressionCodec codec,
                          Path[] inputs, boolean deleteInputs, 
                          int mergeFactor, Path tmpDir,
                          RawComparator<K> comparator, Progressable reporter,
                          Counters.Counter readsCounter,
                          Counters.Counter writesCounter,
                          Progress mergePhase)
throws IOException {
  return 
    new MergeQueue<K, V>(conf, fs, inputs, deleteInputs, codec, comparator, 
                         reporter, null,
                         TaskType.REDUCE).merge(keyClass, valueClass,
                                         mergeFactor, tmpDir,
                                         readsCounter, writesCounter, 
                                         mergePhase);
}
 
Example #8
Source File: FileOutputFormat.java    From big-c with Apache License 2.0
/**
 * Get the {@link CompressionCodec} for compressing the job outputs.
 * @param job the {@link Job} to look in
 * @param defaultValue the {@link CompressionCodec} to return if not set
 * @return the {@link CompressionCodec} to be used to compress the 
 *         job outputs
 * @throws IllegalArgumentException if the class was specified, but not found
 */
public static Class<? extends CompressionCodec> 
getOutputCompressorClass(JobContext job, 
                       Class<? extends CompressionCodec> defaultValue) {
  Class<? extends CompressionCodec> codecClass = defaultValue;
  Configuration conf = job.getConfiguration();
  String name = conf.get(FileOutputFormat.COMPRESS_CODEC);
  if (name != null) {
    try {
      codecClass = conf.getClassByName(name).asSubclass(CompressionCodec.class);
    } catch (ClassNotFoundException e) {
      throw new IllegalArgumentException("Compression codec " + name + 
                                         " was not found.", e);
    }
  }
  return codecClass;
}
 
Example #9
Source File: CompressionEmulationUtil.java    From hadoop with Apache License 2.0
/**
 * Returns a {@link OutputStream} for a file that might need 
 * compression.
 */
static OutputStream getPossiblyCompressedOutputStream(Path file, 
                                                      Configuration conf)
throws IOException {
  FileSystem fs = file.getFileSystem(conf);
  JobConf jConf = new JobConf(conf);
  if (org.apache.hadoop.mapred.FileOutputFormat.getCompressOutput(jConf)) {
    // get the codec class
    Class<? extends CompressionCodec> codecClass =
      org.apache.hadoop.mapred.FileOutputFormat
                              .getOutputCompressorClass(jConf, 
                                                        GzipCodec.class);
    // get the codec implementation
    CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);

    // add the appropriate extension
    file = file.suffix(codec.getDefaultExtension());

    if (isCompressionEmulationEnabled(conf)) {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new DataOutputStream(codec.createOutputStream(fileOut));
    }
  }
  return fs.create(file, false);
}
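
There is no matching input helper in this snippet, but the same extension-based lookup works for reading. The sketch below is a hypothetical mirror-image helper, not code from CompressionEmulationUtil.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class PossiblyCompressedInput {
  // Open 'file', decompressing if its extension maps to a registered codec.
  static InputStream open(Path file, Configuration conf) throws IOException {
    FileSystem fs = file.getFileSystem(conf);
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
    InputStream in = fs.open(file);
    return (codec == null) ? in : codec.createInputStream(in);
  }
}
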
 
Example #10
Source File: InMemoryMapOutput.java    From big-c with Apache License 2.0
public InMemoryMapOutput(Configuration conf, TaskAttemptID mapId,
                         MergeManagerImpl<K, V> merger,
                         int size, CompressionCodec codec,
                         boolean primaryMapOutput) {
  super(mapId, (long)size, primaryMapOutput);
  this.conf = conf;
  this.merger = merger;
  this.codec = codec;
  byteStream = new BoundedByteArrayOutputStream(size);
  memory = byteStream.getBuffer();
  if (codec != null) {
    decompressor = CodecPool.getDecompressor(codec);
  } else {
    decompressor = null;
  }
}
 
Example #11
Source File: Merger.java    From hadoop with Apache License 2.0
public static <K extends Object, V extends Object>
RawKeyValueIterator merge(Configuration conf, FileSystem fs,
                          Class<K> keyClass, Class<V> valueClass,
                          CompressionCodec codec,
                          List<Segment<K, V>> segments,
                          int mergeFactor, Path tmpDir,
                          RawComparator<K> comparator, Progressable reporter,
                          boolean sortSegments,
                          Counters.Counter readsCounter,
                          Counters.Counter writesCounter,
                          Progress mergePhase,
                          TaskType taskType)
    throws IOException {
  return new MergeQueue<K, V>(conf, fs, segments, comparator, reporter,
                         sortSegments, codec,
                         taskType).merge(keyClass, valueClass,
                                             mergeFactor, tmpDir,
                                             readsCounter, writesCounter,
                                             mergePhase);
}
 
Example #12
Source File: SequenceFile.java    From hadoop-gpu with Apache License 2.0
/** Create the named file with write-progress reporter. */
public BlockCompressWriter(FileSystem fs, Configuration conf, Path name,
                           Class keyClass, Class valClass,
                           int bufferSize, short replication, long blockSize,
                           CompressionCodec codec,
                           Progressable progress, Metadata metadata)
  throws IOException {
  super.init(name, conf,
             fs.create(name, true, bufferSize, replication, blockSize, progress),
             keyClass, valClass, true, codec, metadata);
  init(conf.getInt("io.seqfile.compress.blocksize", 1000000));

  initializeFileHeader();
  writeFileHeader();
  finalizeFileHeader();
}
 
Example #13
Source File: SequenceFile.java    From hadoop-gpu with Apache License 2.0
/**
 * Construct the preferred type of 'raw' SequenceFile Writer.
 * @param out The stream on top which the writer is to be constructed.
 * @param keyClass The 'key' type.
 * @param valClass The 'value' type.
 * @param compress Compress data?
 * @param blockCompress Compress blocks?
 * @param metadata The metadata of the file.
 * @return Returns the handle to the constructed SequenceFile Writer.
 * @throws IOException
 */
private static Writer
  createWriter(Configuration conf, FSDataOutputStream out, 
               Class keyClass, Class valClass, boolean compress, boolean blockCompress,
               CompressionCodec codec, Metadata metadata)
  throws IOException {
  if (codec != null && (codec instanceof GzipCodec) && 
      !NativeCodeLoader.isNativeCodeLoaded() && 
      !ZlibFactory.isNativeZlibLoaded(conf)) {
    throw new IllegalArgumentException("SequenceFile doesn't work with " +
                                       "GzipCodec without native-hadoop code!");
  }

  Writer writer = null;

  if (!compress) {
    writer = new Writer(conf, out, keyClass, valClass, metadata);
  } else if (compress && !blockCompress) {
    writer = new RecordCompressWriter(conf, out, keyClass, valClass, codec, metadata);
  } else {
    writer = new BlockCompressWriter(conf, out, keyClass, valClass, codec, metadata);
  }
  
  return writer;
}
 
Example #14
Source File: BitcoinFormatHadoopTest.java    From hadoopcryptoledger with Apache License 2.0
@Test
public void readBitcoinRawBlockInputFormatBzip2Compressed() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  Job job = Job.getInstance(conf);
  CompressionCodec bzip2 = new BZip2Codec();
  ReflectionUtils.setConf(bzip2, conf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.bz2";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  assertEquals(1, splits.size(), "Only one split generated for compressed block");
  RecordReader<BytesWritable, BytesWritable> reader = format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  BytesWritable key = new BytesWritable();
  BytesWritable block = new BytesWritable();
  assertTrue(reader.nextKeyValue(), "Input Split for block version contains at least one block");
  block = reader.getCurrentValue();
  assertEquals(998039, block.getLength(), "Compressed block must have a size of 998,039 bytes");
  assertFalse(reader.nextKeyValue(), "No further blocks in compressed block");
  reader.close();
}
 
Example #15
Source File: TestIFile.java    From incubator-tez with Apache License 2.0
private Writer writeTestFile(IFile.Writer writer, boolean rle, boolean repeatKeys,
    List<KVPair> data, CompressionCodec codec) throws IOException {
  assertNotNull(writer);

  Text previousKey = null;
  for (KVPair kvp : data) {
    if (repeatKeys && (previousKey != null && previousKey.compareTo(kvp.getKey()) == 0)) {
      //RLE is enabled in IFile when IFile.REPEAT_KEY is set
      writer.append(IFile.REPEAT_KEY, kvp.getvalue());
    } else {
      writer.append(kvp.getKey(), kvp.getvalue());
    }
    previousKey = kvp.getKey();
  }

  writer.close();

  LOG.info("Uncompressed: " + writer.getRawLength());
  LOG.info("CompressedSize: " + writer.getCompressedLength());

  return writer;
}
 
Example #16
Source File: Compression.java    From hadoop-gpu with Apache License 2.0
public Compressor getCompressor() throws IOException {
  CompressionCodec codec = getCodec();
  if (codec != null) {
    Compressor compressor = CodecPool.getCompressor(codec);
    if (compressor != null) {
      if (compressor.finished()) {
        // Somebody returned the compressor to the CodecPool but is still
        // using it.
        LOG.warn("Compressor obtained from CodecPool already finished()");
      } else {
        LOG.debug("Got a compressor: " + compressor.hashCode());
      }
      // The following statement is necessary to work around bugs in Hadoop
      // 0.18 where a compressor could be referenced after being returned to
      // the codec pool.
      compressor.reset();
    }
    return compressor;
  }
  return null;
}
 
Example #17
Source File: TestInsertQuery.java    From tajo with Apache License 2.0
@Test
public final void testInsertOverwriteWithCompression() throws Exception {
  String tableName = IdentifierUtil.normalizeIdentifier("testInsertOverwriteWithCompression");
  ResultSet res = executeFile("testInsertOverwriteWithCompression_ddl.sql");
  res.close();

  CatalogService catalog = testingCluster.getMaster().getCatalog();
  assertTrue(catalog.existsTable(getCurrentDatabase(), tableName));

  res = executeQuery();
  res.close();
  TableDesc desc = catalog.getTableDesc(getCurrentDatabase(), tableName);
  if (!testingCluster.isHiveCatalogStoreRunning()) {
    assertEquals(2, desc.getStats().getNumRows().intValue());
  }

  FileSystem fs = FileSystem.get(testingCluster.getConfiguration());
  assertTrue(fs.exists(new Path(desc.getUri())));
  CompressionCodecFactory factory = new CompressionCodecFactory(testingCluster.getConfiguration());

  for (FileStatus file : fs.listStatus(new Path(desc.getUri()))) {
    CompressionCodec codec = factory.getCodec(file.getPath());
    assertTrue(codec instanceof DeflateCodec);
  }
  executeString("DROP TABLE " + tableName + " PURGE");
}
 
Example #18
Source File: Compression.java    From hadoop with Apache License 2.0
@Override
public synchronized boolean isSupported() {
  if (!checked) {
    checked = true;
    String extClazz =
        (conf.get(CONF_LZO_CLASS) == null ? System
            .getProperty(CONF_LZO_CLASS) : null);
    String clazz = (extClazz != null) ? extClazz : defaultClazz;
    try {
      LOG.info("Trying to load Lzo codec class: " + clazz);
      codec =
          (CompressionCodec) ReflectionUtils.newInstance(Class
              .forName(clazz), conf);
    } catch (ClassNotFoundException e) {
      // that is okay
    }
  }
  return codec != null;
}
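
The same reflective probe generalizes beyond LZO. The sketch below is an illustration of a generic availability check; note that codecs backed by native libraries can still fail at use time (for example with an UnsatisfiedLinkError) even when the class itself loads.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class CodecProbe {
  // True if the named codec class can be loaded and instantiated.
  static boolean isCodecAvailable(String className, Configuration conf) {
    try {
      ReflectionUtils.newInstance(
          Class.forName(className).asSubclass(CompressionCodec.class), conf);
      return true;
    } catch (ClassNotFoundException | RuntimeException e) {
      return false;
    }
  }
}
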
 
Example #19
Source File: TestSequenceFile.java    From RDFS with Apache License 2.0
private static void writeTest(FileSystem fs, int count, int seed, Path file, 
                              CompressionType compressionType, CompressionCodec codec)
  throws IOException {
  fs.delete(file, true);
  LOG.info("creating " + count + " records with " + compressionType +
           " compression");
  SequenceFile.Writer writer = 
    SequenceFile.createWriter(fs, conf, file, 
                              RandomDatum.class, RandomDatum.class, compressionType, codec);
  RandomDatum.Generator generator = new RandomDatum.Generator(seed);
  for (int i = 0; i < count; i++) {
    generator.next();
    RandomDatum key = generator.getKey();
    RandomDatum value = generator.getValue();

    writer.append(key, value);
  }
  writer.close();
}
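
Reading the file back needs no codec argument, because a SequenceFile records its compression type and codec in its header. The sketch below is an illustration of the matching read path, assuming a Hadoop 2.x SequenceFile.Reader and the same RandomDatum test type used by writeTest() above.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;

public class ReadBack {
  static void readTest(Configuration conf, Path file) throws IOException {
    // The reader detects compression type and codec from the file header.
    SequenceFile.Reader reader =
        new SequenceFile.Reader(conf, SequenceFile.Reader.file(file));
    try {
      RandomDatum key = new RandomDatum();   // test datum type from the example above
      RandomDatum value = new RandomDatum();
      while (reader.next(key, value)) {
        // process one key/value pair
      }
    } finally {
      reader.close();
    }
  }
}
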
 
Example #20
Source File: SequenceFileStoreFunc.java    From hiped2 with Apache License 2.0
@Override
public void setStoreLocation(String location, Job job)
    throws IOException {
  job.setOutputKeyClass(keyClass);
  job.setOutputValueClass(valueClass);
  if (compressionType != null && compressionCodecClass != null) {
    Class<? extends CompressionCodec> codecClass =
        FileOutputFormat.getOutputCompressorClass(job,
            DefaultCodec.class);
    SequenceFileOutputFormat.
        setOutputCompressorClass(job, codecClass);
    SequenceFileOutputFormat.setOutputCompressionType(job,
        SequenceFile.CompressionType.valueOf(compressionType));
  }
  FileOutputFormat.setOutputPath(job, new Path(location));
}
 
Example #21
Source File: SequenceFile.java    From hadoop-gpu with Apache License 2.0
/**
 * Clones the attributes (like compression) of the input file and creates a
 * corresponding Writer
 * @param inputFile the path of the input file whose attributes should be 
 * cloned
 * @param outputFile the path of the output file 
 * @param prog the Progressable to report status during the file write
 * @return Writer
 * @throws IOException
 */
public Writer cloneFileAttributes(Path inputFile, Path outputFile, 
                                  Progressable prog) 
throws IOException {
  FileSystem srcFileSys = inputFile.getFileSystem(conf);
  Reader reader = new Reader(srcFileSys, inputFile, 4096, conf, true);
  boolean compress = reader.isCompressed();
  boolean blockCompress = reader.isBlockCompressed();
  CompressionCodec codec = reader.getCompressionCodec();
  reader.close();

  Writer writer = createWriter(outputFile.getFileSystem(conf), conf, 
                               outputFile, keyClass, valClass, compress, 
                               blockCompress, codec, prog,
                               new Metadata());
  return writer;
}
 
Example #22
Source File: IFile.java    From tez with Apache License 2.0
public Writer(Configuration conf, FSDataOutputStream outputStream,
    Class keyClass, Class valueClass,
    CompressionCodec codec, TezCounter writesCounter, TezCounter serializedBytesCounter,
    boolean rle) throws IOException {
  this.rawOut = outputStream;
  this.writtenRecordsCounter = writesCounter;
  this.serializedUncompressedBytes = serializedBytesCounter;
  this.start = this.rawOut.getPos();
  this.rle = rle;

  setupOutputStream(codec);

  writeHeader(outputStream);

  if (keyClass != null) {
    this.closeSerializers = true;
    SerializationFactory serializationFactory =
      new SerializationFactory(conf);
    this.keySerializer = serializationFactory.getSerializer(keyClass);
    this.keySerializer.open(buffer);
    this.valueSerializer = serializationFactory.getSerializer(valueClass);
    this.valueSerializer.open(buffer);
  } else {
    this.closeSerializers = false;
  }
}
 
Example #23
Source File: WholeTextInputFormat.java    From spliceengine with GNU Affero General Public License v3.0
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (currentPath>=split.getNumPaths()) {
        return false;
    }

    Path path = split.getPath(currentPath);
    currentPath++;

    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(path);
    key = path.toString();
    FSDataInputStream fileIn = fs.open(path);

    value = codec!=null?codec.createInputStream(fileIn):fileIn;
    return true;
}
 
Example #24
Source File: HDFSBadSeqWriter.java    From mt-flume with Apache License 2.0
@Override
public void open(String filePath, CompressionCodec codeC,
    CompressionType compType) throws IOException {
  super.open(filePath, codeC, compType);
  if(closed) {
    opened = true;
  }
}
 
Example #25
Source File: TestCombineTextInputFormat.java    From hadoop with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
@Test(timeout=10000)
public void testGzip() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, conf);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, workDir);
  CombineTextInputFormat format = new CombineTextInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  assertEquals("compressed splits == 1", 1, splits.size());
  List<Text> results = readSplit(format, splits.get(0), job);
  assertEquals("splits[0] length", 8, results.size());

  final String[] firstList =
    {"the quick", "brown", "fox jumped", "over", " the lazy", " dog"};
  final String[] secondList = {"this is a test", "of gzip"};
  String first = results.get(0).toString();
  if (first.equals(firstList[0])) {
    testResults(results, firstList, secondList);
  } else if (first.equals(secondList[0])) {
    testResults(results, secondList, firstList);
  } else {
    fail("unexpected first token!");
  }
}
 
Example #26
Source File: CompressWriterFactoryTest.java    From flink with Apache License 2.0
private void validateResults(File folder, List<String> expected, CompressionCodec codec) throws Exception {
	File[] buckets = folder.listFiles();
	assertNotNull(buckets);
	assertEquals(1, buckets.length);

	final File[] partFiles = buckets[0].listFiles();
	assertNotNull(partFiles);
	assertEquals(1, partFiles.length);

	for (File partFile : partFiles) {
		assertTrue(partFile.length() > 0);
		final List<String> fileContent = readFile(partFile, codec);
		assertEquals(expected, fileContent);
	}
}
 
Example #27
Source File: WARCOutputFormat.java    From flink-crawler with Apache License 2.0
public WARCWriter(TaskAttemptContext context) throws IOException {
    Configuration conf = context.getConfiguration();
    CompressionCodec codec = getCompressOutput(context) ? WARCFileWriter.getGzipCodec(conf)
            : null;
    Path workFile = getDefaultWorkFile(context, "");
    _writer = new WARCFileWriter(conf, codec, workFile);
}
 
Example #28
Source File: WarcFileRecordReader.java    From wikireverse with MIT License
public WarcFileRecordReader(Configuration conf, InputSplit split) throws IOException {
  if (split instanceof FileSplit) {
    this.filePathList=new Path[1];
    this.filePathList[0]=((FileSplit)split).getPath();
  } else if (split instanceof MultiFileSplit) {
    this.filePathList=((MultiFileSplit)split).getPaths();
  } else {
    throw new IOException("InputSplit is not a file split or a multi-file split - aborting");
  }

  // Use FileSystem.get to open Common Crawl URIs using the S3 protocol.
  URI uri = filePathList[0].toUri();
  this.fs = FileSystem.get(uri, conf);
  
  // get the total file sizes
  for (int i=0; i < filePathList.length; i++) {
    totalFileSize += fs.getFileStatus(filePathList[i]).getLen();
  }

  Class<? extends CompressionCodec> codecClass=null;

  try {
    codecClass=conf.getClassByName("org.apache.hadoop.io.compress.GzipCodec").asSubclass(CompressionCodec.class);
    compressionCodec=(CompressionCodec)ReflectionUtils.newInstance(codecClass, conf);
  } catch (ClassNotFoundException cnfEx) {
    compressionCodec=null;
    LOG.info("!!! ClassNotFoun Exception thrown setting Gzip codec");
  }

  openNextFile();
}
 
Example #29
Source File: TestDataSegment.java    From RDFS with Apache License 2.0
void testWithCodec(Class<? extends CompressionCodec> codecClass) throws Exception {
  CompressionCodec codec =
    codecClass == null ? null : ReflectionUtils.newInstance(codecClass, conf);
  testNormalWriteAndRead(1, 1024, 16, codec);
  testNormalWriteAndRead(10, 1024, 16, codec);
  testNormalWriteAndRead(1, 1024, 256, codec);
  testNormalWriteAndRead(10, 1024, 256, codec);
}
 
Example #30
Source File: HiveColumnCardinalityUpdateJob.java    From Kylin with Apache License 2.0
private static List<String> readLines(Path location, Configuration conf) throws Exception {
    FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null)
        return new ArrayList<String>();
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {

        // ignoring files like _SUCCESS
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }

        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream = null;

        // check if we have a compression codec we need to use
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }

        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        String raw = writer.toString();
        for (String str : raw.split("\n")) {
            results.add(str);
        }
    }
    return results;
}