Java Code Examples for org.apache.hadoop.io.compress.CodecPool#getDecompressor()

The following examples show how to use org.apache.hadoop.io.compress.CodecPool#getDecompressor(). They are extracted from open source projects; you can go to the original project or source file by following the links above each example, or check out the related API usage on the sidebar.
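Before looking at the individual examples, it helps to see the pattern most of them follow: resolve a codec for the file, borrow a Decompressor from CodecPool, wrap the raw stream with it, and hand the instance back to the pool when reading is finished. The sketch below illustrates that cycle; the class and method names are illustrative only (they do not appear in any of the projects below), while the Hadoop types and the CodecPool/CompressionCodecFactory calls are real API.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class CodecPoolUsageSketch {

  // Counts the decompressed bytes of a possibly compressed file.
  public static long countBytes(Path path, Configuration conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path); // null if uncompressed
    InputStream in = fs.open(path);
    Decompressor decompressor = null;
    try {
      if (codec != null) {
        // borrow a (possibly recycled) decompressor instance from the pool
        decompressor = CodecPool.getDecompressor(codec);
        in = codec.createInputStream(in, decompressor);
      }
      byte[] buf = new byte[8192];
      long total = 0;
      int n;
      while ((n = in.read(buf)) != -1) {
        total += n;
      }
      return total;
    } finally {
      in.close();
      if (decompressor != null) {
        // hand the instance back so other readers can reuse it
        CodecPool.returnDecompressor(decompressor);
      }
    }
  }
}
 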
Example 1
Source File: MapReduceBitcoinBlockIntegrationTest.java    From hadoopcryptoledger with Apache License 2.0
private InputStream openFile(Path path) throws IOException {
  CompressionCodec codec = new CompressionCodecFactory(miniCluster.getConfig()).getCodec(path);
  FSDataInputStream fileIn = dfsCluster.getFileSystem().open(path);
  // check if compressed
  if (codec == null) { // uncompressed
    return fileIn;
  } else { // compressed
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    this.openDecompressors.add(decompressor); // to be returned later using close
    if (codec instanceof SplittableCompressionCodec) {
      long end = dfsCluster.getFileSystem().getFileStatus(path).getLen();
      final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, 0, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
      return cIn;
    } else {
      return codec.createInputStream(fileIn, decompressor);
    }
  }
}
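Each Decompressor borrowed above is tracked in openDecompressors so it can be given back to the pool when the test tears down. The original close() method is not shown here; a minimal sketch of such a counterpart, assuming the openDecompressors field from the snippet above, could look like this:

private void returnOpenDecompressors() {
  for (Decompressor decompressor : openDecompressors) {
    if (decompressor != null) {
      CodecPool.returnDecompressor(decompressor); // make the instance available for reuse
    }
  }
  openDecompressors.clear();
}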
 
Example 2
Source File: JsonObjectMapperParser.java    From RDFS with Apache License 2.0
/**
 * Constructor.
 *
 * @param path
 *          Path to the JSON data file, possibly compressed.
 * @param clazz
 *          Class of the objects to deserialize from the JSON data.
 * @param conf
 *          Configuration used to resolve the file system and any compression codec.
 * @throws IOException
 */
public JsonObjectMapperParser(Path path, Class<? extends T> clazz,
    Configuration conf) throws IOException {
  mapper = new ObjectMapper();
  mapper.configure(
      DeserializationConfig.Feature.CAN_OVERRIDE_ACCESS_MODIFIERS, true);
  this.clazz = clazz;
  FileSystem fs = path.getFileSystem(conf);
  CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
  InputStream input;
  if (codec == null) {
    input = fs.open(path);
    decompressor = null;
  } else {
    FSDataInputStream fsdis = fs.open(path);
    decompressor = CodecPool.getDecompressor(codec);
    input = codec.createInputStream(fsdis, decompressor);
  }
  jsonParser = mapper.getJsonFactory().createJsonParser(input);
}
 
Example 3
Source File: Compression.java    From hadoop-gpu with Apache License 2.0
public Decompressor getDecompressor() throws IOException {
  CompressionCodec codec = getCodec();
  if (codec != null) {
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    if (decompressor != null) {
      if (decompressor.finished()) {
        // Somebody returned the decompressor to CodecPool but is still using
        // it.
        LOG.warn("Decompressor obtained from CodecPool already finished()");
      } else {
        LOG.debug("Got a decompressor: " + decompressor.hashCode());
      }
      /**
       * The following statement is necessary to work around bugs in 0.18 where a
       * decompressor is referenced after being returned to the codec pool.
       */
      decompressor.reset();
    }
    return decompressor;
  }

  return null;
}
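The method above only borrows from the pool; the natural counterpart releases the instance once the caller is done with it. A minimal sketch of such a release method (assumed here, not part of the snippet above) is:

public void returnDecompressor(Decompressor decompressor) {
  if (decompressor != null) {
    CodecPool.returnDecompressor(decompressor); // return the instance to the shared pool
  }
}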
 
Example 4
Source File: MapReduceBitcoinTransactionIntegrationTest.java    From hadoopcryptoledger with Apache License 2.0
private InputStream openFile(Path path) throws IOException {
  CompressionCodec codec = new CompressionCodecFactory(miniCluster.getConfig()).getCodec(path);
  FSDataInputStream fileIn = dfsCluster.getFileSystem().open(path);
  // check if compressed
  if (codec == null) { // uncompressed
    return fileIn;
  } else { // compressed
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    this.openDecompressors.add(decompressor); // to be returned later using close
    if (codec instanceof SplittableCompressionCodec) {
      long end = dfsCluster.getFileSystem().getFileStatus(path).getLen();
      final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, 0, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
      return cIn;
    } else {
      return codec.createInputStream(fileIn, decompressor);
    }
  }
}
 
Example 5
Source File: PossiblyDecompressedInputStream.java    From big-c with Apache License 2.0
public PossiblyDecompressedInputStream(Path inputPath, Configuration conf)
    throws IOException {
  CompressionCodecFactory codecs = new CompressionCodecFactory(conf);
  CompressionCodec inputCodec = codecs.getCodec(inputPath);

  FileSystem ifs = inputPath.getFileSystem(conf);
  FSDataInputStream fileIn = ifs.open(inputPath);

  if (inputCodec == null) {
    decompressor = null;
    coreInputStream = fileIn;
  } else {
    decompressor = CodecPool.getDecompressor(inputCodec);
    coreInputStream = inputCodec.createInputStream(fileIn, decompressor);
  }
}
 
Example 6
Source File: InMemoryMapOutput.java    From big-c with Apache License 2.0
public InMemoryMapOutput(Configuration conf, TaskAttemptID mapId,
                         MergeManagerImpl<K, V> merger,
                         int size, CompressionCodec codec,
                         boolean primaryMapOutput) {
  super(mapId, (long)size, primaryMapOutput);
  this.conf = conf;
  this.merger = merger;
  this.codec = codec;
  byteStream = new BoundedByteArrayOutputStream(size);
  memory = byteStream.getBuffer();
  if (codec != null) {
    decompressor = CodecPool.getDecompressor(codec);
  } else {
    decompressor = null;
  }
}
 
Example 7
Source File: Compression.java    From hbase with Apache License 2.0
public Decompressor getDecompressor() {
  CompressionCodec codec = getCodec(conf);
  if (codec != null) {
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    if (LOG.isTraceEnabled()) LOG.trace("Retrieved decompressor " + decompressor + " from pool.");
    if (decompressor != null) {
      if (decompressor.finished()) {
        // Somebody returned the decompressor to CodecPool but is still using it.
        LOG.warn("Decompressor obtained from CodecPool is already finished()");
      }
      decompressor.reset();
    }
    return decompressor;
  }

  return null;
}
 
Example 8
Source File: HadoopLogsAnalyzer.java    From RDFS with Apache License 2.0
private LineReader maybeUncompressedPath(Path p)
    throws FileNotFoundException, IOException {
  CompressionCodecFactory codecs = new CompressionCodecFactory(getConf());
  inputCodec = codecs.getCodec(p);
  FileSystem fs = p.getFileSystem(getConf());
  FSDataInputStream fileIn = fs.open(p);

  if (inputCodec == null) {
    return new LineReader(fileIn, getConf());
  } else {
    inputDecompressor = CodecPool.getDecompressor(inputCodec);
    return new LineReader(inputCodec.createInputStream(fileIn,
        inputDecompressor), getConf());
  }
}
 
Example 9
Source File: IFile.java    From tez with Apache License 2.0
/**
 * Construct an IFile Reader.
 *
 * @param in   The input stream
 * @param length Length of the data in the stream, including the checksum
 *               bytes.
 * @param codec The compression codec, or null if the data is uncompressed
 * @param readsCounter Counter for records read from disk
 * @param bytesReadCounter Counter for bytes read from disk
 * @param readAhead Whether read-ahead is enabled
 * @param readAheadLength Number of bytes to read ahead
 * @param bufferSize Buffer size for the input stream
 * @param isCompressed Whether the stream contains compressed data
 * @throws IOException
 */
public Reader(InputStream in, long length,
              CompressionCodec codec,
              TezCounter readsCounter, TezCounter bytesReadCounter,
              boolean readAhead, int readAheadLength,
              int bufferSize, boolean isCompressed) throws IOException {
  if (in != null) {
    checksumIn = new IFileInputStream(in, length, readAhead,
        readAheadLength/* , isCompressed */);
    if (isCompressed && codec != null) {
      decompressor = CodecPool.getDecompressor(codec);
      if (decompressor != null) {
        this.in = codec.createInputStream(checksumIn, decompressor);
      } else {
        LOG.warn("Could not obtain decompressor from CodecPool");
        this.in = checksumIn;
      }
    } else {
      this.in = checksumIn;
    }
    startPos = checksumIn.getPosition();
  } else {
    this.in = null;
  }

  if (in != null) {
    this.dataIn = new DataInputStream(this.in);
  }
  this.readRecordsCounter = readsCounter;
  this.bytesReadCounter = bytesReadCounter;
  this.fileLength = length;
  this.bufferSize = Math.max(0, bufferSize);
}
 
Example 10
Source File: HadoopLogsAnalyzer.java    From hadoop with Apache License 2.0
private LineReader maybeUncompressedPath(Path p)
    throws FileNotFoundException, IOException {
  CompressionCodecFactory codecs = new CompressionCodecFactory(getConf());
  inputCodec = codecs.getCodec(p);
  FileSystem fs = p.getFileSystem(getConf());
  FSDataInputStream fileIn = fs.open(p);

  if (inputCodec == null) {
    return new LineReader(fileIn, getConf());
  } else {
    inputDecompressor = CodecPool.getDecompressor(inputCodec);
    return new LineReader(inputCodec.createInputStream(fileIn,
        inputDecompressor), getConf());
  }
}
 
Example 11
Source File: AbstractBitcoinRecordReader.java    From hadoopcryptoledger with Apache License 2.0
/**
* Initializes reader
* @param split Split to use (assumed to be a file split)
* @param context context of the job
*
*
* @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop
* @throws java.lang.InterruptedException in case of thread interruption
*
*/
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
  FileSplit fSplit = (FileSplit) split;
  // Initialize start and end of split
  start = fSplit.getStart();
  end = start + fSplit.getLength();
  final Path file = fSplit.getPath();
  codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
  final FileSystem fs = file.getFileSystem(context.getConfiguration());
  FSDataInputStream fileIn = fs.open(file);
  // open stream
  if (isCompressedInput()) { // decompress
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, 0, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
      bbr = new BitcoinBlockReader(cIn, this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer, this.readAuxPOW);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn; // take pos from compressed stream
    } else {
      bbr = new BitcoinBlockReader(codec.createInputStream(fileIn, decompressor), this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer, readAuxPOW);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    bbr = new BitcoinBlockReader(fileIn, this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer, readAuxPOW);
    filePosition = fileIn;
  }
  // seek to block start (for the case a block overlaps a split)
  try {
    bbr.seekBlockStart();
  } catch (BitcoinBlockReadException bbre) {
    LOG.error("Error reading Bitcoin blockchain data");
    LOG.error(bbre);
  }
}
 
Example 12
Source File: MainframeVBRecordReader.java    From Cobol-to-Hive with Apache License 2.0
public void initialize(Configuration job, long splitStart,
		long splitLength, Path file) throws IOException {

	start = splitStart;
	end = start + splitLength;
	LOG.info("Start of the split:" + start + "-End of split:" + end);
	LOG.debug("VLR initialize started: start pos:" + start + "endpos:"
			+ end);

	// open the file and seek to the start of the split
	final FileSystem fs = file.getFileSystem(job);
	fileIn = fs.open(file);

	CompressionCodec codec = new CompressionCodecFactory(job)
			.getCodec(file);
	if (null != codec) {
		isCompressedInput = true;
		decompressor = CodecPool.getDecompressor(codec);
		CompressionInputStream cIn = codec.createInputStream(fileIn,
				decompressor);
		filePosition = (Seekable) cIn;
		inputStream = cIn;
		LOG.info("Compressed input; cannot compute number of records in the split");
	} else {
		fileIn.seek(start);
		filePosition = fileIn;
		inputStream = fileIn;
		numBytesRemainingInSplit = splitLength;
		LOG.info("Variable length input; cannot compute number of records in the split");

	}
	this.pos = start;
}
 
Example 13
Source File: TestExport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
private void verifyCompressedFile(Path f, int expectedNumLines)
    throws IOException {
  Configuration conf = new Configuration();
  if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
    conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
  }
  FileSystem fs = FileSystem.get(conf);
  InputStream is = fs.open(f);
  CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
  CompressionCodec codec = ccf.getCodec(f);
  LOG.info("gzip check codec is " + codec);
  Decompressor decompressor = CodecPool.getDecompressor(codec);
  if (null == decompressor) {
    LOG.info("Verifying gzip sanity with null decompressor");
  } else {
    LOG.info("Verifying gzip sanity with decompressor: "
        + decompressor.toString());
  }
  is = codec.createInputStream(is, decompressor);
  BufferedReader r = new BufferedReader(new InputStreamReader(is));
  int numLines = 0;
  while (true) {
    String ln = r.readLine();
    if (ln == null) {
      break;
    }
    numLines++;
  }

  r.close();
  assertEquals("Did not read back correct number of lines",
      expectedNumLines, numLines);
  LOG.info("gzip sanity check returned " + numLines + " lines; ok.");
}
 
Example 14
Source File: IFile.java    From tez with Apache License 2.0
/**
 * Read entire ifile content to memory.
 *
 * @param buffer the buffer to read the ifile content into
 * @param in the input stream to read from
 * @param compressedLength length of the (possibly compressed) data, including the ifile header
 * @param codec the compression codec, or null if the data is uncompressed
 * @param ifileReadAhead whether read-ahead is enabled
 * @param ifileReadAheadLength number of bytes to read ahead
 * @throws IOException
 */
public static void readToMemory(byte[] buffer, InputStream in, int compressedLength,
    CompressionCodec codec, boolean ifileReadAhead, int ifileReadAheadLength)
    throws IOException {
  boolean isCompressed = IFile.Reader.isCompressedFlagEnabled(in);
  IFileInputStream checksumIn = new IFileInputStream(in,
      compressedLength - IFile.HEADER.length, ifileReadAhead,
      ifileReadAheadLength);
  in = checksumIn;
  Decompressor decompressor = null;
  if (isCompressed && codec != null) {
    decompressor = CodecPool.getDecompressor(codec);
    if (decompressor != null) {
      decompressor.reset();
      in = getDecompressedInputStreamWithBufferSize(codec, checksumIn, decompressor, compressedLength);
    } else {
      LOG.warn("Could not obtain decompressor from CodecPool");
      in = checksumIn;
    }
  }
  try {
    IOUtils.readFully(in, buffer, 0, buffer.length - IFile.HEADER.length);
    /*
     * We've gotten the amount of data we were expecting. Verify the
     * decompressor has nothing more to offer. This action also forces the
     * decompressor to read any trailing bytes that weren't critical for
     * decompression, which is necessary to keep the stream in sync.
     */
    if (in.read() >= 0) {
      throw new IOException("Unexpected extra bytes from input stream");
    }
  } catch (IOException ioe) {
    if(in != null) {
      try {
        in.close();
      } catch(IOException e) {
        if(LOG.isDebugEnabled()) {
          LOG.debug("Exception in closing " + in, e);
        }
      }
    }
    throw ioe;
  } finally {
    if (decompressor != null) {
      decompressor.reset();
      CodecPool.returnDecompressor(decompressor);
    }
  }
}
 
Example 15
Source File: FixedLengthRecordReader.java    From big-c with Apache License 2.0
public void initialize(Configuration job, long splitStart, long splitLength,
                       Path file) throws IOException {
  start = splitStart;
  end = start + splitLength;
  long partialRecordLength = start % recordLength;
  long numBytesToSkip = 0;
  if (partialRecordLength != 0) {
    numBytesToSkip = recordLength - partialRecordLength;
  }

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);

  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null != codec) {
    isCompressedInput = true;	
    decompressor = CodecPool.getDecompressor(codec);
    CompressionInputStream cIn
        = codec.createInputStream(fileIn, decompressor);
    filePosition = cIn;
    inputStream = cIn;
    numRecordsRemainingInSplit = Long.MAX_VALUE;
    LOG.info(
        "Compressed input; cannot compute number of records in the split");
  } else {
    fileIn.seek(start);
    filePosition = fileIn;
    inputStream = fileIn;
    long splitSize = end - start - numBytesToSkip;
    numRecordsRemainingInSplit = (splitSize + recordLength - 1)/recordLength;
    if (numRecordsRemainingInSplit < 0) {
      numRecordsRemainingInSplit = 0;
    }
    LOG.info("Expecting " + numRecordsRemainingInSplit
        + " records each with a length of " + recordLength
        + " bytes in the split with an effective size of "
        + splitSize + " bytes");
  }
  if (numBytesToSkip != 0) {
    start += inputStream.skip(numBytesToSkip);
  }
  this.pos = start;
}
 
Example 16
Source File: LineRecordReader.java    From big-c with Apache License 2.0
public void initialize(InputSplit genericSplit,
                       TaskAttemptContext context) throws IOException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration job = context.getConfiguration();
  this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);
  
  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null!=codec) {
    isCompressedInput = true;	
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn =
        ((SplittableCompressionCodec)codec).createInputStream(
          fileIn, decompressor, start, end,
          SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = new CompressedSplitLineReader(cIn, job,
          this.recordDelimiterBytes);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn;
    } else {
      in = new SplitLineReader(codec.createInputStream(fileIn,
          decompressor), job, this.recordDelimiterBytes);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
    filePosition = fileIn;
  }
  // If this is not the first split, we always throw away first record
  // because we always (except the last split) read one extra line in
  // next() method.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}
 
Example 17
Source File: LineRecordReader.java    From hadoop with Apache License 2.0
public void initialize(InputSplit genericSplit,
                       TaskAttemptContext context) throws IOException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration job = context.getConfiguration();
  this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);
  
  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null!=codec) {
    isCompressedInput = true;	
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn =
        ((SplittableCompressionCodec)codec).createInputStream(
          fileIn, decompressor, start, end,
          SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = new CompressedSplitLineReader(cIn, job,
          this.recordDelimiterBytes);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn;
    } else {
      in = new SplitLineReader(codec.createInputStream(fileIn,
          decompressor), job, this.recordDelimiterBytes);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
    filePosition = fileIn;
  }
  // If this is not the first split, we always throw away first record
  // because we always (except the last split) read one extra line in
  // next() method.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}
 
Example 18
Source File: AbstractEthereumRecordReader.java    From hadoopcryptoledger with Apache License 2.0
/**
* Creates an Abstract Record Reader for Ethereum blocks
* @param split Split to use (assumed to be a file split)
* @param job Configuration:
* io.file.buffer.size: Size of the in-memory buffer specified in the given Configuration. If io.file.buffer.size is not specified, the default buffer size will be used. Furthermore, one may specify hadoopcryptoledger.ethereumblockinputformat.maxblocksize, which defines the maximum size an Ethereum block may have (by default 1M). If you want to experiment with performance using DirectByteBuffer instead of HeapByteBuffer you can use "hadoopcryptoledeger.ethereumblockinputformat.usedirectbuffer" (default: false). Note that it might have some unwanted consequences such as circumventing Yarn memory management. The option is experimental and might be removed in future versions.
* @param reporter Reporter
*
* @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop
*
*/
public AbstractEthereumRecordReader(FileSplit split, JobConf job, Reporter reporter) throws IOException {
  LOG.debug("Reading configuration");
  // parse configuration
  this.reporter = reporter;
  this.conf = job;
  this.maxSizeEthereumBlock = conf.getInt(AbstractEthereumRecordReader.CONF_MAXBLOCKSIZE, AbstractEthereumRecordReader.DEFAULT_MAXSIZE_ETHEREUMBLOCK);
  this.bufferSize = conf.getInt(AbstractEthereumRecordReader.CONF_BUFFERSIZE, AbstractEthereumRecordReader.DEFAULT_BUFFERSIZE);
  this.useDirectBuffer = conf.getBoolean(AbstractEthereumRecordReader.CONF_USEDIRECTBUFFER, AbstractEthereumRecordReader.DEFAULT_USEDIRECTBUFFER);
  // Initialize start and end of split
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();
  codec = new CompressionCodecFactory(job).getCodec(file);
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);
  // open stream
  if (isCompressedInput()) { // decompress
    LOG.debug("Decompressing file");
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      LOG.debug("SplittableCompressionCodec");
      final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
      ebr = new EthereumBlockReader(cIn, this.maxSizeEthereumBlock, this.bufferSize, this.useDirectBuffer);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn; // take pos from compressed stream
    } else {
      LOG.debug("Non-splittable compression codec");
      ebr = new EthereumBlockReader(codec.createInputStream(fileIn, decompressor), this.maxSizeEthereumBlock, this.bufferSize, this.useDirectBuffer);
      filePosition = fileIn;
    }
  } else {
    LOG.debug("Processing file without compression");
    fileIn.seek(start);
    ebr = new EthereumBlockReader(fileIn, this.maxSizeEthereumBlock, this.bufferSize, this.useDirectBuffer);
    filePosition = fileIn;
  }
  // initialize reader
  this.reporter.setStatus("Ready to read");
}
 
Example 19
Source File: AbstractBitcoinRecordReader.java    From hadoopcryptoledger with Apache License 2.0
/**
* Creates an Abstract Record Reader for Bitcoin blocks
* @param split Split to use (assumed to be a file split)
* @param job Configuration:
* io.file.buffer.size: Size of the in-memory buffer specified in the given Configuration. If io.file.buffer.size is not specified, the default buffer size (maximum size of a bitcoin block) will be used. The configuration hadoopcryptoledger.bitcoinblockinputformat.filter.magic allows specifying the magic identifier of the block. The magic is a comma-separated list of Hex-values (e.g. F9BEB4D9,FABFB5DA,0B110907,0B110907). The default magic is always F9BEB4D9. One needs to specify at least one magic, otherwise it will be difficult to find blocks in splits. Furthermore, one may specify hadoopcryptoledger.bitcoinblockinputformat.maxblocksize, which defines the maximum size a bitcoin block may have (by default 8M). If you want to experiment with performance using DirectByteBuffer instead of HeapByteBuffer you can use "hadoopcryptoledeger.bitcoinblockinputformat.usedirectbuffer" (default: false). Note that it might have some unwanted consequences such as circumventing Yarn memory management. The option is experimental and might be removed in future versions.
* @param reporter Reporter
*
* @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop
* @throws org.zuinnote.hadoop.bitcoin.format.exception.HadoopCryptoLedgerConfigurationException in case of an invalid HadoopCryptoLedger-specific configuration of the inputformat
* @throws org.zuinnote.hadoop.bitcoin.format.exception.BitcoinBlockReadException in case the Bitcoin data contains invalid blocks (e.g. magic might be different)
*
*/
public AbstractBitcoinRecordReader(FileSplit split, JobConf job, Reporter reporter) throws IOException, HadoopCryptoLedgerConfigurationException, BitcoinBlockReadException {
  LOG.debug("Reading configuration");
  // parse configuration
  this.reporter = reporter;
  this.conf = job;
  this.maxSizeBitcoinBlock = conf.getInt(AbstractBitcoinRecordReader.CONF_MAXBLOCKSIZE, AbstractBitcoinRecordReader.DEFAULT_MAXSIZE_BITCOINBLOCK);
  this.bufferSize = conf.getInt(AbstractBitcoinRecordReader.CONF_BUFFERSIZE, AbstractBitcoinRecordReader.DEFAULT_BUFFERSIZE);
  this.specificMagic = conf.get(AbstractBitcoinRecordReader.CONF_FILTERMAGIC);
  // we need to provide at least one magic
  if ((this.specificMagic == null) || (this.specificMagic.length() == 0)) {
    this.specificMagic = AbstractBitcoinRecordReader.DEFAULT_MAGIC;
  }
  if ((this.specificMagic != null) && (this.specificMagic.length() > 0)) {
    this.specificMagicStringArray = specificMagic.split(",");
    specificMagicByteArray = new byte[specificMagicStringArray.length][4]; // each magic is always 4 bytes
    for (int i = 0; i < specificMagicStringArray.length; i++) {
      byte[] currentMagicNo = BitcoinUtil.convertHexStringToByteArray(specificMagicStringArray[i]);
      if (currentMagicNo.length != 4) {
        throw new HadoopCryptoLedgerConfigurationException("Error: Configuration. Magic number does not have a length of 4 bytes. Index: " + i);
      }
      specificMagicByteArray[i] = currentMagicNo;
    }
  }
  this.useDirectBuffer = conf.getBoolean(AbstractBitcoinRecordReader.CONF_USEDIRECTBUFFER, AbstractBitcoinRecordReader.DEFAULT_USEDIRECTBUFFER);
  this.readAuxPOW = conf.getBoolean(AbstractBitcoinRecordReader.CONF_READAUXPOW, AbstractBitcoinRecordReader.DEFAULT_READAUXPOW);
  // Initialize start and end of split
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();
  codec = new CompressionCodecFactory(job).getCodec(file);
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);
  // open stream
  if (isCompressedInput()) { // decompress
    LOG.debug("Decompressing file");
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      LOG.debug("SplittableCompressionCodec");
      final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
      bbr = new BitcoinBlockReader(cIn, this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer, this.readAuxPOW);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn; // take pos from compressed stream
    } else {
      LOG.debug("Non-splittable compression codec");
      bbr = new BitcoinBlockReader(codec.createInputStream(fileIn, decompressor), this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer, this.readAuxPOW);
      filePosition = fileIn;
    }
  } else {
    LOG.debug("Processing file without compression");
    fileIn.seek(start);
    bbr = new BitcoinBlockReader(fileIn, this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer, this.readAuxPOW);
    filePosition = fileIn;
  }
  // initialize reader
  // seek to block start (for the case a block overlaps a split)
  LOG.debug("Seeking to block start");
  this.reporter.setStatus("Seeking Block start");
  bbr.seekBlockStart();
  this.reporter.setStatus("Ready to read");
}
 
Example 20
Source File: ChunkRecordReader.java    From pxf with Apache License 2.0
/**
 * Constructs a ChunkRecordReader instance.
 *
 * @param job the job configuration
 * @param split contains the file name, begin byte of the split and the
 *            bytes length
 * @throws IOException if an I/O error occurs when accessing the file or
 *             creating input stream to read from it
 */
public ChunkRecordReader(Configuration job, FileSplit split)
        throws IOException, IncompatibleInputStreamException {
    maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    validateLength(maxLineLength);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file, ChunkReader.DEFAULT_BUFFER_SIZE);
    fileLength = getInputStream().getFileLength();
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end,
                    SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new ChunkReader(cIn);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            in = new ChunkReader(codec.createInputStream(fileIn,
                    decompressor));
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new ChunkReader(fileIn);
        filePosition = fileIn;
    }
    /*
     * If this is not the first split, we always throw away first record
     * because we always (except the last split) read one extra line in
     * next() method.
     */
    if (start != 0) {
        start += in.readLine(new ChunkWritable(), maxBytesToConsume(start));
    }
    this.pos = start;
}