Java Code Examples for org.apache.hadoop.io.compress.CompressionCodec#createInputStream()

The following examples show how to use org.apache.hadoop.io.compress.CompressionCodec#createInputStream(). They are taken from open-source projects; the source file and project are noted above each example.
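Before the project-specific examples, here is a minimal sketch of the common pattern: resolve a codec for a path with CompressionCodecFactory and, if one is found, wrap the raw stream via createInputStream(). The class name CompressedFileOpener is a hypothetical illustration, not taken from any of the projects below.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

// Hypothetical helper illustrating the basic createInputStream() pattern.
public class CompressedFileOpener {

  public static InputStream open(Configuration conf, Path path) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    InputStream raw = fs.open(path);
    // A codec is chosen from the file extension (e.g. .gz, .bz2); null means the file is uncompressed.
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    return (codec == null) ? raw : codec.createInputStream(raw);
  }
}
 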
Example 1
Source File: HadoopFileReader.java    From hadoopoffice with Apache License 2.0
public InputStream openFile(Path path) throws IOException {
  CompressionCodec codec = compressionCodecs.getCodec(path);
  FSDataInputStream fileIn = fs.open(path);
  // check if compressed
  if (codec == null) { // uncompressed
    LOG.debug("Reading from an uncompressed file \"" + path + "\"");
    return fileIn;
  } else { // compressed
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    this.openDecompressors.add(decompressor); // to be returned later using close
    if (codec instanceof SplittableCompressionCodec) {
      LOG.debug("Reading from a compressed file \"" + path + "\" with splittable compression codec");
      long end = fs.getFileStatus(path).getLen();
      return ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, 0, end,
          SplittableCompressionCodec.READ_MODE.CONTINUOUS);
    } else {
      LOG.debug("Reading from a compressed file \"" + path + "\" with non-splittable compression codec");
      return codec.createInputStream(fileIn, decompressor);
    }
  }
}
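The Decompressor borrowed from CodecPool above is remembered in openDecompressors so it can be handed back later. The corresponding close() method is not part of this example; a hedged sketch of what such cleanup might look like:

// Hypothetical cleanup counterpart (not shown in HadoopFileReader above):
// every Decompressor borrowed from CodecPool is returned when the reader is closed.
public void close() throws IOException {
  for (Decompressor decompressor : openDecompressors) {
    if (decompressor != null) {
      CodecPool.returnDecompressor(decompressor);
    }
  }
  openDecompressors.clear();
}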
 
Example 2
Source File: IFile.java    From tez with Apache License 2.0
private static InputStream getDecompressedInputStreamWithBufferSize(CompressionCodec codec,
    IFileInputStream checksumIn, Decompressor decompressor, int compressedLength)
    throws IOException {
  String bufferSizeProp = TezRuntimeUtils.getBufferSizeProperty(codec);

  if (bufferSizeProp != null) {
    Configurable configurableCodec = (Configurable) codec;
    Configuration conf = configurableCodec.getConf();

    int bufSize = Math.min(compressedLength, DEFAULT_BUFFER_SIZE);
    LOG.trace("buffer size was set according to min(compressedLength, {}): {}={}",
        DEFAULT_BUFFER_SIZE, bufferSizeProp, bufSize);
    conf.setInt(bufferSizeProp, bufSize);
  }

  return codec.createInputStream(checksumIn, decompressor);
}
 
Example 3
Source File: MapReduceBitcoinBlockIntegrationTest.java    From hadoopcryptoledger with Apache License 2.0
private InputStream openFile(Path path) throws IOException {
  CompressionCodec codec = new CompressionCodecFactory(miniCluster.getConfig()).getCodec(path);
  FSDataInputStream fileIn = dfsCluster.getFileSystem().open(path);
  // check if compressed
  if (codec == null) { // uncompressed
    return fileIn;
  } else { // compressed
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    this.openDecompressors.add(decompressor); // to be returned later using close
    if (codec instanceof SplittableCompressionCodec) {
      long end = dfsCluster.getFileSystem().getFileStatus(path).getLen();
      final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn,
          decompressor, 0, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
      return cIn;
    } else {
      return codec.createInputStream(fileIn, decompressor);
    }
  }
}
 
Example 4
Source File: CellBlockBuilder.java    From hbase with Apache License 2.0
private ByteBuffer decompress(CompressionCodec compressor, InputStream cellBlockStream,
    int osInitialSize) throws IOException {
  // GZIPCodec fails w/ NPE if no configuration.
  if (compressor instanceof Configurable) {
    ((Configurable) compressor).setConf(this.conf);
  }
  Decompressor poolDecompressor = CodecPool.getDecompressor(compressor);
  CompressionInputStream cis = compressor.createInputStream(cellBlockStream, poolDecompressor);
  ByteBufferOutputStream bbos;
  try {
    // TODO: This is ugly. The buffer will be resized on us if we guess wrong.
    // TODO: Reuse buffers.
    bbos = new ByteBufferOutputStream(osInitialSize);
    IOUtils.copy(cis, bbos);
    bbos.close();
    return bbos.getByteBuffer();
  } finally {
    CodecPool.returnDecompressor(poolDecompressor);
  }
}
 
Example 5
Source File: IFile.java    From hadoop-gpu with Apache License 2.0
/**
 * Construct an IFile Reader.
 * 
 * @param conf Configuration File 
 * @param in   The input stream
 * @param length Length of the data in the stream, including the checksum
 *               bytes.
 * @param codec codec
 * @param readsCounter Counter for records read from disk
 * @throws IOException
 */
public Reader(Configuration conf, FSDataInputStream in, long length, 
              CompressionCodec codec,
              Counters.Counter readsCounter) throws IOException {
  readRecordsCounter = readsCounter;
  checksumIn = new IFileInputStream(in,length);
  if (codec != null) {
    decompressor = CodecPool.getDecompressor(codec);
    this.in = codec.createInputStream(checksumIn, decompressor);
  } else {
    this.in = checksumIn;
  }
  this.fileLength = length;
  
  if (conf != null) {
    bufferSize = conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
  }
}
 
Example 6
Source File: QseqInputFormat.java    From Hadoop-BAM with MIT License
public QseqRecordReader(Configuration conf, FileSplit split) throws IOException
{
	setConf(conf);
	file = split.getPath();
	start = split.getStart();
	end = start + split.getLength();

	FileSystem fs = file.getFileSystem(conf);
	FSDataInputStream fileIn = fs.open(file);

	CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
	CompressionCodec        codec        = codecFactory.getCodec(file);

	if (codec == null) // no codec.  Uncompressed file.
	{
		positionAtFirstRecord(fileIn);
		inputStream = fileIn;
	}
	else
	{ // compressed file
		if (start != 0)
			throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

		inputStream = codec.createInputStream(fileIn);
		end = Long.MAX_VALUE; // read until the end of the file
	}

	lineReader = new LineReader(inputStream);
}
 
Example 7
Source File: WARCFileReader.java    From warc-hadoop with MIT License
/**
 * Opens a file for reading. If the filename ends in `.gz`, it is automatically decompressed
 * on the fly.
 * @param conf The Hadoop configuration.
 * @param filePath The Hadoop path to the file that should be read.
 * @throws IOException
 */
public WARCFileReader(Configuration conf, Path filePath) throws IOException {
    FileSystem fs = filePath.getFileSystem(conf);
    this.fileSize = fs.getFileStatus(filePath).getLen();
    logger.info("Reading from " + filePath);

    CompressionCodec codec = filePath.getName().endsWith(".gz") ?
                             WARCFileWriter.getGzipCodec(conf) : null;
    byteStream = new CountingInputStream(new BufferedInputStream(fs.open(filePath)));
    dataStream = new DataInputStream(codec == null ? byteStream : codec.createInputStream(byteStream));
}
 
Example 8
Source File: HiveColumnCardinalityUpdateJob.java    From Kylin with Apache License 2.0
private static List<String> readLines(Path location, Configuration conf) throws Exception {
    FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null)
        return new ArrayList<String>();
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {

        // ignoring files like _SUCCESS
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }

        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream = null;

        // check if we have a compression codec we need to use
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }

        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        String raw = writer.toString();
        for (String str : raw.split("\n")) {
            results.add(str);
        }
    }
    return results;
}
 
Example 9
Source File: DelimitedTextFileReaderWriterFactory.java    From secor with Apache License 2.0
public DelimitedTextFileReader(LogFilePath path, CompressionCodec codec) throws IOException {
    Path fsPath = new Path(path.getLogFilePath());
    FileSystem fs = FileUtil.getFileSystem(path.getLogFilePath());
    InputStream inputStream = fs.open(fsPath);
    this.mReader = (codec == null) ? new BufferedInputStream(inputStream)
            : new BufferedInputStream(
            codec.createInputStream(inputStream,
                                    mDecompressor = CodecPool.getDecompressor(codec)));
    this.mOffset = path.getOffset();
}
 
Example 10
Source File: HadoopFsHelper.java    From incubator-gobblin with Apache License 2.0
/**
 * Returns an {@link InputStream} to the specified file.
 * <p>
 * Note: It is the caller's responsibility to close the returned {@link InputStream}.
 * </p>
 *
 * @param path The path to the file to open.
 * @return An {@link InputStream} for the specified file.
 * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the specified file.
 */
@Override
public InputStream getFileStream(String path) throws FileBasedHelperException {
  try {
    Path p = new Path(path);
    InputStream in = this.getFileSystem().open(p);
    // Account for compressed files (e.g. gzip).
    // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
    CompressionCodecFactory factory = new CompressionCodecFactory(this.getFileSystem().getConf());
    CompressionCodec codec = factory.getCodec(p);
    return (codec == null) ? in : codec.createInputStream(in);
  } catch (IOException e) {
    throw new FileBasedHelperException("Cannot open file " + path + " due to " + e.getMessage(), e);
  }
}
 
Example 11
Source File: LineRecordReader.java    From RDFS with Apache License 2.0
public LineRecordReader(Configuration job, 
                        FileSplit split) throws IOException {
  this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
                                  Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();
  compressionCodecs = new CompressionCodecFactory(job);
  final CompressionCodec codec = compressionCodecs.getCodec(file);

  // open the file and seek to the start of the split
  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(split.getPath());
  boolean skipFirstLine = false;
  if (codec != null) {
    in = new LineReader(codec.createInputStream(fileIn), job);
    end = Long.MAX_VALUE;
  } else {
    if (start != 0) {
      skipFirstLine = true;
      --start;
      fileIn.seek(start);
    }
    in = new LineReader(fileIn, job);
  }
  if (skipFirstLine) {  // skip first line and re-establish "start".
    start += in.readLine(new Text(), 0,
                         (int)Math.min((long)Integer.MAX_VALUE, end - start));
  }
  this.pos = start;
}
 
Example 12
Source File: FSImageUtil.java    From big-c with Apache License 2.0
public static InputStream wrapInputStreamForCompression(
    Configuration conf, String codec, InputStream in) throws IOException {
  if (codec.isEmpty())
    return in;

  FSImageCompression compression = FSImageCompression.createCompression(
      conf, codec);
  CompressionCodec imageCodec = compression.getImageCodec();
  return imageCodec.createInputStream(in);
}
 
Example 13
Source File: CompressionFactoryITCase.java    From flink with Apache License 2.0
private List<String> readFile(File file, CompressionCodec codec) throws Exception {
	try (
			FileInputStream inputStream = new FileInputStream(file);
			InputStreamReader readerStream = new InputStreamReader(codec.createInputStream(inputStream));
			BufferedReader reader = new BufferedReader(readerStream)
	) {
		return reader.lines().collect(Collectors.toList());
	}
}
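A hedged usage sketch for a helper like readFile above; the file name and codec choice are illustrative assumptions, and, as the comment in Example 4 notes, GzipCodec needs a Configuration before use:

// Assumed usage: read a gzip-compressed local file through the helper above.
GzipCodec codec = new GzipCodec();
codec.setConf(new Configuration());
List<String> lines = readFile(new File("/tmp/data.txt.gz"), codec);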
 
Example 14
Source File: FreightStreamer.java    From RDFS with Apache License 2.0
private InputStream decompress(Path p, FileSystem srcFs) throws IOException {
  CompressionCodecFactory factory = new CompressionCodecFactory(getConf());
  CompressionCodec codec = factory.getCodec(p);
  InputStream in = srcFs.open(p);
  if (codec == null) {
    throw new IOException("Cannot find codec for " + p);
  }
  return codec.createInputStream(in);
}
 
Example 15
Source File: Excel97FileRecordReader.java    From components with Apache License 2.0
private InputStream createInputStream(Configuration job, final Path file) throws IOException {
  final FileSystem fs = file.getFileSystem(job);
  InputStream in = fs.open(file);

  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null != codec) {
    decompressor = CodecPool.getDecompressor(codec);
    in = codec.createInputStream(in, decompressor);
  }
  return in;
}
 
Example 16
Source File: InterleaveMulti.java    From ViraPipe with MIT License
private static void decompress(FileSystem fs, String in, String outpath) throws IOException {
  Configuration conf = new Configuration();
  CompressionCodecFactory factory = new CompressionCodecFactory(conf);
  CompressionCodec codec = factory.getCodec(new Path(in));
  // Decompress the input file using the codec inferred from its file name.
  InputStream is = codec.createInputStream(fs.open(new Path(in)));
  OutputStream out = fs.create(new Path(outpath));
  // Write the decompressed bytes to the output path.
  IOUtils.copyBytes(is, out, conf);
  is.close();
  out.close();
}
 
Example 17
Source File: TestInsertQuery.java    From tajo with Apache License 2.0
@Test
public final void testInsertOverwritePathWithNonFromQuery() throws Exception {
  ResultSet res = executeString("insert overwrite into location " +
      "'/tajo-data/testInsertOverwritePathWithNonFromQuery' " +
      "USING text WITH ('text.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') " +
      "select 1::INT4, 2.1::FLOAT4, 'test'");

  res.close();
  FileSystem fs = FileSystem.get(testingCluster.getConfiguration());
  Path path = new Path("/tajo-data/testInsertOverwritePathWithNonFromQuery");
  assertTrue(fs.exists(path));
  assertEquals(1, fs.listStatus(path).length);

  CompressionCodecFactory factory = new CompressionCodecFactory(testingCluster.getConfiguration());
  FileStatus file = fs.listStatus(path)[0];
  CompressionCodec codec = factory.getCodec(file.getPath());
  assertTrue(codec instanceof DeflateCodec);

  try (BufferedReader reader = new BufferedReader(
          new InputStreamReader(codec.createInputStream(fs.open(file.getPath()))))) {
    String line = reader.readLine();
    assertNotNull(line);

    String[] tokens = line.split("\\|");

    assertEquals(3, tokens.length);
    assertEquals("1", tokens[0]);
    assertEquals("2.1", tokens[1]);
    assertEquals("test", tokens[2]);
  }
}
 
Example 18
Source File: LineRecordReader.java    From big-c with Apache License 2.0
public void initialize(InputSplit genericSplit,
                       TaskAttemptContext context) throws IOException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration job = context.getConfiguration();
  this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);
  
  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null!=codec) {
    isCompressedInput = true;	
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn =
        ((SplittableCompressionCodec)codec).createInputStream(
          fileIn, decompressor, start, end,
          SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = new CompressedSplitLineReader(cIn, job,
          this.recordDelimiterBytes);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn;
    } else {
      in = new SplitLineReader(codec.createInputStream(fileIn,
          decompressor), job, this.recordDelimiterBytes);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
    filePosition = fileIn;
  }
  // If this is not the first split, we always throw away first record
  // because we always (except the last split) read one extra line in
  // next() method.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}
 
Example 19
Source File: LineRecordReader.java    From hadoop with Apache License 2.0
public void initialize(InputSplit genericSplit,
                       TaskAttemptContext context) throws IOException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration job = context.getConfiguration();
  this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);
  
  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null!=codec) {
    isCompressedInput = true;	
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn =
        ((SplittableCompressionCodec)codec).createInputStream(
          fileIn, decompressor, start, end,
          SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = new CompressedSplitLineReader(cIn, job,
          this.recordDelimiterBytes);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn;
    } else {
      in = new SplitLineReader(codec.createInputStream(fileIn,
          decompressor), job, this.recordDelimiterBytes);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
    filePosition = fileIn;
  }
  // If this is not the first split, we always throw away first record
  // because we always (except the last split) read one extra line in
  // next() method.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}
 
Example 20
Source File: JHLogAnalyzer.java    From big-c with Apache License 2.0
/**
 * Collect information about one job.
 * 
 * @param fs - file system
 * @param filePath - full path of a history log file
 * @param offset - starting offset in the history log file
 * @throws IOException
 */
public void parseLogFile(FileSystem fs,
                                Path filePath,
                                long offset,
                                OutputCollector<Text, Text> output,
                                Reporter reporter
                              ) throws IOException {
  InputStream in = null;
  try {
    // open file & seek
    FSDataInputStream stm = fs.open(filePath);
    stm.seek(offset);
    in = stm;
    LOG.info("Opened " + filePath);
    reporter.setStatus("Opened " + filePath);
    // get a compression filter if specified
    if(compressionClass != null) {
      CompressionCodec codec = (CompressionCodec)
        ReflectionUtils.newInstance(compressionClass, new Configuration());
      in = codec.createInputStream(stm);
      LOG.info("Codec created " + filePath);
      reporter.setStatus("Codec created " + filePath);
    }
    BufferedReader reader = new BufferedReader(new InputStreamReader(in));
    LOG.info("Reader created " + filePath);
    // skip to the next job log start
    long processed = 0L;
    if(jobDelimiterPattern != null) {
      for(String line = reader.readLine();
            line != null; line = reader.readLine()) {
        if((stm.getPos() - processed) > 100000) {
          processed = stm.getPos();
          reporter.setStatus("Processing " + filePath + " at " + processed);
        }
        if(isEndOfJobLog(line))
          break;
      }
    }
    // parse lines and update job history
    JobHistoryLog jh = new JobHistoryLog();
    int jobLineCount = 0;
    for(String line = readLine(reader);
          line != null; line = readLine(reader)) {
      jobLineCount++;
      if((stm.getPos() - processed) > 20000) {
        processed = stm.getPos();
        long numTasks = (jh.tasks == null ? 0 : jh.tasks.size());
        String txt = "Processing " + filePath + " at " + processed
                + " # tasks = " + numTasks;
        reporter.setStatus(txt);
        LOG.info(txt);
      }
      if(isEndOfJobLog(line)) {
        if(jh.JOBID != null) {
          LOG.info("Finished parsing job: " + jh.JOBID
                 + " line count = " + jobLineCount);
          collectJobStats(jh, output, reporter);
          LOG.info("Collected stats for job: " + jh.JOBID);
        }
        jh = new JobHistoryLog();
        jobLineCount = 0;
      } else
        jh.parseLine(line);
    }
    if(jh.JOBID == null) {
      LOG.error("JOBID = NULL in " + filePath + " at " + processed);
      return;
    }
    collectJobStats(jh, output, reporter);
  } catch(Exception ie) {
    // parsing errors can happen if the file has been truncated
    LOG.error("JHLAMapper.parseLogFile", ie);
    reporter.setStatus("JHLAMapper.parseLogFile failed "
                      + StringUtils.stringifyException(ie));
    throw new IOException("Job failed.", ie);
  } finally {
    if(in != null) in.close();
  }
}