Java Code Examples for org.apache.hadoop.io.compress.CompressionCodec#createInputStream()

The following examples show how to use org.apache.hadoop.io.compress.CompressionCodec#createInputStream(). They are taken from open-source projects; the source file and project are noted above each example.
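Before the project-specific examples, here is a minimal sketch of the common pattern: resolve a codec for a path with CompressionCodecFactory and, if one is found, wrap the raw stream via createInputStream(). The class name CompressedFileOpener is a hypothetical illustration, not taken from any of the projects below.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

// Hypothetical helper illustrating the basic createInputStream() pattern.
public class CompressedFileOpener {

  public static InputStream open(Configuration conf, Path path) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    InputStream raw = fs.open(path);
    // A codec is chosen from the file extension (e.g. .gz, .bz2); null means the file is uncompressed.
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    return (codec == null) ? raw : codec.createInputStream(raw);
  }
}
 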
Example 1
Source File: HadoopFileReader.java    From hadoopoffice with Apache License 2.0
public InputStream openFile(Path path) throws IOException {
  CompressionCodec codec = compressionCodecs.getCodec(path);
  FSDataInputStream fileIn = fs.open(path);
  // check if compressed
  if (codec == null) { // uncompressed
    LOG.debug("Reading from an uncompressed file \"" + path + "\"");
    return fileIn;
  } else { // compressed
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    this.openDecompressors.add(decompressor); // to be returned later using close
    if (codec instanceof SplittableCompressionCodec) {
      LOG.debug("Reading from a compressed file \"" + path + "\" with splittable compression codec");
      long end = fs.getFileStatus(path).getLen();
      return ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, 0, end,
          SplittableCompressionCodec.READ_MODE.CONTINUOUS);
    } else {
      LOG.debug("Reading from a compressed file \"" + path + "\" with non-splittable compression codec");
      return codec.createInputStream(fileIn, decompressor);
    }
  }
}
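The Decompressor borrowed from CodecPool above is remembered in openDecompressors so it can be handed back later. The corresponding close() method is not part of this example; a hedged sketch of what such cleanup might look like:

// Hypothetical cleanup counterpart (not shown in HadoopFileReader above):
// every Decompressor borrowed from CodecPool is returned when the reader is closed.
public void close() throws IOException {
  for (Decompressor decompressor : openDecompressors) {
    if (decompressor != null) {
      CodecPool.returnDecompressor(decompressor);
    }
  }
  openDecompressors.clear();
}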
 
Example 2
Source File: IFile.java    From tez with Apache License 2.0
private static InputStream getDecompressedInputStreamWithBufferSize(CompressionCodec codec,
    IFileInputStream checksumIn, Decompressor decompressor, int compressedLength)
    throws IOException {
  String bufferSizeProp = TezRuntimeUtils.getBufferSizeProperty(codec);

  if (bufferSizeProp != null) {
    Configurable configurableCodec = (Configurable) codec;
    Configuration conf = configurableCodec.getConf();

    int bufSize = Math.min(compressedLength, DEFAULT_BUFFER_SIZE);
    LOG.trace("buffer size was set according to min(compressedLength, {}): {}={}",
        DEFAULT_BUFFER_SIZE, bufferSizeProp, bufSize);
    conf.setInt(bufferSizeProp, bufSize);
  }

  return codec.createInputStream(checksumIn, decompressor);
}
 
Example 3
Source File: MapReduceBitcoinBlockIntegrationTest.java    From hadoopcryptoledger with Apache License 2.0
private InputStream openFile(Path path) throws IOException {
  CompressionCodec codec = new CompressionCodecFactory(miniCluster.getConfig()).getCodec(path);
  FSDataInputStream fileIn = dfsCluster.getFileSystem().open(path);
  // check if compressed
  if (codec == null) { // uncompressed
    return fileIn;
  } else { // compressed
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    this.openDecompressors.add(decompressor); // to be returned later using close
    if (codec instanceof SplittableCompressionCodec) {
      long end = dfsCluster.getFileSystem().getFileStatus(path).getLen();
      final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn,
          decompressor, 0, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
      return cIn;
    } else {
      return codec.createInputStream(fileIn, decompressor);
    }
  }
}
 
Example 4
Source File: CellBlockBuilder.java    From hbase with Apache License 2.0
private ByteBuffer decompress(CompressionCodec compressor, InputStream cellBlockStream,
    int osInitialSize) throws IOException {
  // GZIPCodec fails w/ NPE if no configuration.
  if (compressor instanceof Configurable) {
    ((Configurable) compressor).setConf(this.conf);
  }
  Decompressor poolDecompressor = CodecPool.getDecompressor(compressor);
  CompressionInputStream cis = compressor.createInputStream(cellBlockStream, poolDecompressor);
  ByteBufferOutputStream bbos;
  try {
    // TODO: This is ugly. The buffer will be resized on us if we guess wrong.
    // TODO: Reuse buffers.
    bbos = new ByteBufferOutputStream(osInitialSize);
    IOUtils.copy(cis, bbos);
    bbos.close();
    return bbos.getByteBuffer();
  } finally {
    CodecPool.returnDecompressor(poolDecompressor);
  }
}
 
Example 5
Source File: IFile.java    From hadoop-gpu with Apache License 2.0
/**
 * Construct an IFile Reader.
 * 
 * @param conf Configuration File 
 * @param in   The input stream
 * @param length Length of the data in the stream, including the checksum
 *               bytes.
 * @param codec codec
 * @param readsCounter Counter for records read from disk
 * @throws IOException
 */
public Reader(Configuration conf, FSDataInputStream in, long length, 
              CompressionCodec codec,
              Counters.Counter readsCounter) throws IOException {
  readRecordsCounter = readsCounter;
  checksumIn = new IFileInputStream(in,length);
  if (codec != null) {
    decompressor = CodecPool.getDecompressor(codec);
    this.in = codec.createInputStream(checksumIn, decompressor);
  } else {
    this.in = checksumIn;
  }
  this.fileLength = length;
  
  if (conf != null) {
    bufferSize = conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
  }
}
 
Example 6
Source File: QseqInputFormat.java    From Hadoop-BAM with MIT License
public QseqRecordReader(Configuration conf, FileSplit split) throws IOException
{
	setConf(conf);
	file = split.getPath();
	start = split.getStart();
	end = start + split.getLength();

	FileSystem fs = file.getFileSystem(conf);
	FSDataInputStream fileIn = fs.open(file);

	CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
	CompressionCodec        codec        = codecFactory.getCodec(file);

	if (codec == null) // no codec.  Uncompressed file.
	{
		positionAtFirstRecord(fileIn);
		inputStream = fileIn;
	}
	else
	{ // compressed file
		if (start != 0)
			throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

		inputStream = codec.createInputStream(fileIn);
		end = Long.MAX_VALUE; // read until the end of the file
	}

	lineReader = new LineReader(inputStream);
}
 
Example 7
Source File: WARCFileReader.java    From warc-hadoop with MIT License
/**
 * Opens a file for reading. If the filename ends in `.gz`, it is automatically decompressed
 * on the fly.
 * @param conf The Hadoop configuration.
 * @param filePath The Hadoop path to the file that should be read.
 * @throws IOException
 */
public WARCFileReader(Configuration conf, Path filePath) throws IOException {
    FileSystem fs = filePath.getFileSystem(conf);
    this.fileSize = fs.getFileStatus(filePath).getLen();
    logger.info("Reading from " + filePath);

    CompressionCodec codec = filePath.getName().endsWith(".gz") ?
                             WARCFileWriter.getGzipCodec(conf) : null;
    byteStream = new CountingInputStream(new BufferedInputStream(fs.open(filePath)));
    dataStream = new DataInputStream(codec == null ? byteStream : codec.createInputStream(byteStream));
}
 
Example 8
Source File: HiveColumnCardinalityUpdateJob.java    From Kylin with Apache License 2.0
private static List<String> readLines(Path location, Configuration conf) throws Exception {
    FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null)
        return new ArrayList<String>();
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {

        // ignoring files like _SUCCESS
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }

        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream = null;

        // check if we have a compression codec we need to use
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }

        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        String raw = writer.toString();
        for (String str : raw.split("\n")) {
            results.add(str);
        }
    }
    return results;
}
 
Example 9
Source File: DelimitedTextFileReaderWriterFactory.java    From secor with Apache License 2.0
public DelimitedTextFileReader(LogFilePath path, CompressionCodec codec) throws IOException {
    Path fsPath = new Path(path.getLogFilePath());
    FileSystem fs = FileUtil.getFileSystem(path.getLogFilePath());
    InputStream inputStream = fs.open(fsPath);
    this.mReader = (codec == null) ? new BufferedInputStream(inputStream)
            : new BufferedInputStream(
            codec.createInputStream(inputStream,
                                    mDecompressor = CodecPool.getDecompressor(codec)));
    this.mOffset = path.getOffset();
}
 
Example 10
Source File: HadoopFsHelper.java    From incubator-gobblin with Apache License 2.0
/**
 * Returns an {@link InputStream} to the specified file.
 * <p>
 * Note: It is the caller's responsibility to close the returned {@link InputStream}.
 * </p>
 *
 * @param path The path to the file to open.
 * @return An {@link InputStream} for the specified file.
 * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the specified file.
 */
@Override
public InputStream getFileStream(String path) throws FileBasedHelperException {
  try {
    Path p = new Path(path);
    InputStream in = this.getFileSystem().open(p);
    // Account for compressed files (e.g. gzip).
    // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
    CompressionCodecFactory factory = new CompressionCodecFactory(this.getFileSystem().getConf());
    CompressionCodec codec = factory.getCodec(p);
    return (codec == null) ? in : codec.createInputStream(in);
  } catch (IOException e) {
    throw new FileBasedHelperException("Cannot open file " + path + " due to " + e.getMessage(), e);
  }
}
 
Example 11
Source File: LineRecordReader.java    From RDFS with Apache License 2.0
public LineRecordReader(Configuration job, 
                        FileSplit split) throws IOException {
  this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
                                  Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();
  compressionCodecs = new CompressionCodecFactory(job);
  final CompressionCodec codec = compressionCodecs.getCodec(file);

  // open the file and seek to the start of the split
  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(split.getPath());
  boolean skipFirstLine = false;
  if (codec != null) {
    in = new LineReader(codec.createInputStream(fileIn), job);
    end = Long.MAX_VALUE;
  } else {
    if (start != 0) {
      skipFirstLine = true;
      --start;
      fileIn.seek(start);
    }
    in = new LineReader(fileIn, job);
  }
  if (skipFirstLine) {  // skip first line and re-establish "start".
    start += in.readLine(new Text(), 0,
                         (int)Math.min((long)Integer.MAX_VALUE, end - start));
  }
  this.pos = start;
}
 
Example 12
Source File: FSImageUtil.java    From big-c with Apache License 2.0
public static InputStream wrapInputStreamForCompression(
    Configuration conf, String codec, InputStream in) throws IOException {
  if (codec.isEmpty())
    return in;

  FSImageCompression compression = FSImageCompression.createCompression(
      conf, codec);
  CompressionCodec imageCodec = compression.getImageCodec();
  return imageCodec.createInputStream(in);
}
 
Example 13
Source File: CompressionFactoryITCase.java    From flink with Apache License 2.0
private List<String> readFile(File file, CompressionCodec codec) throws Exception {
	try (
			FileInputStream inputStream = new FileInputStream(file);
			InputStreamReader readerStream = new InputStreamReader(codec.createInputStream(inputStream));
			BufferedReader reader = new BufferedReader(readerStream)
	) {
		return reader.lines().collect(Collectors.toList());
	}
}
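A hedged usage sketch for a helper like readFile above; the file name and codec choice are illustrative assumptions, and, as the comment in Example 4 notes, GzipCodec needs a Configuration before use:

// Assumed usage: read a gzip-compressed local file through the helper above.
GzipCodec codec = new GzipCodec();
codec.setConf(new Configuration());
List<String> lines = readFile(new File("/tmp/data.txt.gz"), codec);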
 
Example 14
Source File: FreightStreamer.java    From RDFS with Apache License 2.0
private InputStream decompress(Path p, FileSystem srcFs) throws IOException {
  CompressionCodecFactory factory = new CompressionCodecFactory(getConf());
  CompressionCodec codec = factory.getCodec(p);
  InputStream in = srcFs.open(p);
  if (codec == null) {
    throw new IOException("Cannot find codec for " + p);
  }
  return codec.createInputStream(in);
}
 
Example 15
Source File: Excel97FileRecordReader.java    From components with Apache License 2.0
private InputStream createInputStream(Configuration job, final Path file) throws IOException {
  final FileSystem fs = file.getFileSystem(job);
  InputStream in = fs.open(file);

  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null != codec) {
    decompressor = CodecPool.getDecompressor(codec);
    in = codec.createInputStream(in, decompressor);
  }
  return in;
}
 
Example 16
Source File: InterleaveMulti.java    From ViraPipe with MIT License
private static void decompress(FileSystem fs, String in, String outpath) throws IOException {
  Configuration conf = new Configuration();
  CompressionCodecFactory factory = new CompressionCodecFactory(conf);
  CompressionCodec codec = factory.getCodec(new Path(in));
  // Decompress the input file using the codec inferred from its file name.
  InputStream is = codec.createInputStream(fs.open(new Path(in)));
  OutputStream out = fs.create(new Path(outpath));
  // Write the decompressed bytes to the output path.
  IOUtils.copyBytes(is, out, conf);
  is.close();
  out.close();
}
 
Example 17
Source File: TestInsertQuery.java    From tajo with Apache License 2.0
@Test
public final void testInsertOverwritePathWithNonFromQuery() throws Exception {
  ResultSet res = executeString("insert overwrite into location " +
      "'/tajo-data/testInsertOverwritePathWithNonFromQuery' " +
      "USING text WITH ('text.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') " +
      "select 1::INT4, 2.1::FLOAT4, 'test'");

  res.close();
  FileSystem fs = FileSystem.get(testingCluster.getConfiguration());
  Path path = new Path("/tajo-data/testInsertOverwritePathWithNonFromQuery");
  assertTrue(fs.exists(path));
  assertEquals(1, fs.listStatus(path).length);

  CompressionCodecFactory factory = new CompressionCodecFactory(testingCluster.getConfiguration());
  FileStatus file = fs.listStatus(path)[0];
  CompressionCodec codec = factory.getCodec(file.getPath());
  assertTrue(codec instanceof DeflateCodec);

  try (BufferedReader reader = new BufferedReader(
          new InputStreamReader(codec.createInputStream(fs.open(file.getPath()))))) {
    String line = reader.readLine();
    assertNotNull(line);

    String[] tokens = line.split("\\|");

    assertEquals(3, tokens.length);
    assertEquals("1", tokens[0]);
    assertEquals("2.1", tokens[1]);
    assertEquals("test", tokens[2]);
  }
}
 
Example 18
Source File: LineRecordReader.java    From big-c with Apache License 2.0
public void initialize(InputSplit genericSplit,
                       TaskAttemptContext context) throws IOException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration job = context.getConfiguration();
  this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);
  
  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null!=codec) {
    isCompressedInput = true;	
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn =
        ((SplittableCompressionCodec)codec).createInputStream(
          fileIn, decompressor, start, end,
          SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = new CompressedSplitLineReader(cIn, job,
          this.recordDelimiterBytes);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn;
    } else {
      in = new SplitLineReader(codec.createInputStream(fileIn,
          decompressor), job, this.recordDelimiterBytes);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
    filePosition = fileIn;
  }
  // If this is not the first split, we always throw away first record
  // because we always (except the last split) read one extra line in
  // next() method.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}
 
Example 19
Source File: LineRecordReader.java    From hadoop with Apache License 2.0
public void initialize(InputSplit genericSplit,
                       TaskAttemptContext context) throws IOException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration job = context.getConfiguration();
  this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);
  
  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null!=codec) {
    isCompressedInput = true;	
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn =
        ((SplittableCompressionCodec)codec).createInputStream(
          fileIn, decompressor, start, end,
          SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = new CompressedSplitLineReader(cIn, job,
          this.recordDelimiterBytes);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn;
    } else {
      in = new SplitLineReader(codec.createInputStream(fileIn,
          decompressor), job, this.recordDelimiterBytes);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
    filePosition = fileIn;
  }
  // If this is not the first split, we always throw away first record
  // because we always (except the last split) read one extra line in
  // next() method.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}
 
Example 20
Source File: JHLogAnalyzer.java    From big-c with Apache License 2.0
/**
 * Collect information about one job.
 * 
 * @param fs - file system
 * @param filePath - full path of a history log file
 * @param offset - starting offset in the history log file
 * @throws IOException
 */
public void parseLogFile(FileSystem fs,
                                Path filePath,
                                long offset,
                                OutputCollector<Text, Text> output,
                                Reporter reporter
                              ) throws IOException {
  InputStream in = null;
  try {
    // open file & seek
    FSDataInputStream stm = fs.open(filePath);
    stm.seek(offset);
    in = stm;
    LOG.info("Opened " + filePath);
    reporter.setStatus("Opened " + filePath);
    // get a compression filter if specified
    if(compressionClass != null) {
      CompressionCodec codec = (CompressionCodec)
        ReflectionUtils.newInstance(compressionClass, new Configuration());
      in = codec.createInputStream(stm);
      LOG.info("Codec created " + filePath);
      reporter.setStatus("Codec created " + filePath);
    }
    BufferedReader reader = new BufferedReader(new InputStreamReader(in));
    LOG.info("Reader created " + filePath);
    // skip to the next job log start
    long processed = 0L;
    if(jobDelimiterPattern != null) {
      for(String line = reader.readLine();
            line != null; line = reader.readLine()) {
        if((stm.getPos() - processed) > 100000) {
          processed = stm.getPos();
          reporter.setStatus("Processing " + filePath + " at " + processed);
        }
        if(isEndOfJobLog(line))
          break;
      }
    }
    // parse lines and update job history
    JobHistoryLog jh = new JobHistoryLog();
    int jobLineCount = 0;
    for(String line = readLine(reader);
          line != null; line = readLine(reader)) {
      jobLineCount++;
      if((stm.getPos() - processed) > 20000) {
        processed = stm.getPos();
        long numTasks = (jh.tasks == null ? 0 : jh.tasks.size());
        String txt = "Processing " + filePath + " at " + processed
                + " # tasks = " + numTasks;
        reporter.setStatus(txt);
        LOG.info(txt);
      }
      if(isEndOfJobLog(line)) {
        if(jh.JOBID != null) {
          LOG.info("Finished parsing job: " + jh.JOBID
                 + " line count = " + jobLineCount);
          collectJobStats(jh, output, reporter);
          LOG.info("Collected stats for job: " + jh.JOBID);
        }
        jh = new JobHistoryLog();
        jobLineCount = 0;
      } else
        jh.parseLine(line);
    }
    if(jh.JOBID == null) {
      LOG.error("JOBID = NULL in " + filePath + " at " + processed);
      return;
    }
    collectJobStats(jh, output, reporter);
  } catch(Exception ie) {
    // parsing errors can happen if the file has been truncated
    LOG.error("JHLAMapper.parseLogFile", ie);
    reporter.setStatus("JHLAMapper.parseLogFile failed "
                      + StringUtils.stringifyException(ie));
    throw new IOException("Job failed.", ie);
  } finally {
    if(in != null) in.close();
  }
}