Java Code Examples for org.apache.hadoop.io.compress.SplittableCompressionCodec

The following examples show how to use org.apache.hadoop.io.compress.SplittableCompressionCodec. These examples are extracted from open source projects.
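
Most of the examples below share one pattern: look up the codec for a path with CompressionCodecFactory, borrow a Decompressor from the CodecPool, and, if the codec implements SplittableCompressionCodec, create a SplitCompressionInputStream bounded to the byte range to be read. The following minimal sketch condenses that pattern; the method name "open" and the caller-supplied Configuration and Path are placeholders, not taken from any of the projects below.

import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;

InputStream open(Configuration conf, Path path) throws IOException {
  FileSystem fs = path.getFileSystem(conf);
  CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
  FSDataInputStream fileIn = fs.open(path);
  if (codec == null) {
    return fileIn; // no known compression suffix: return the raw stream
  }
  // the caller should later hand the decompressor back via CodecPool.returnDecompressor
  Decompressor decompressor = CodecPool.getDecompressor(codec);
  if (codec instanceof SplittableCompressionCodec) {
    // bounded, split-aware stream; CONTINUOUS reads the range as one continuous stream
    long end = fs.getFileStatus(path).getLen();
    return ((SplittableCompressionCodec) codec).createInputStream(
        fileIn, decompressor, 0, end,
        SplittableCompressionCodec.READ_MODE.CONTINUOUS);
  }
  return codec.createInputStream(fileIn, decompressor);
}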
Example 1
private InputStream openFile(Path path) throws IOException {
  CompressionCodec codec = new CompressionCodecFactory(miniCluster.getConfig()).getCodec(path);
  FSDataInputStream fileIn = dfsCluster.getFileSystem().open(path);
  // check if compressed
  if (codec == null) { // uncompressed
    return fileIn;
  } else { // compressed
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    this.openDecompressors.add(decompressor); // to be returned later using close
    if (codec instanceof SplittableCompressionCodec) {
      long end = dfsCluster.getFileSystem().getFileStatus(path).getLen();
      final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, 0, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
      return cIn;
    } else {
      return codec.createInputStream(fileIn, decompressor);
    }
  }
}
 
Example 2
Source Project: hadoopoffice   Source File: HadoopFileReader.java    License: Apache License 2.0
public InputStream openFile(Path path) throws IOException {
  CompressionCodec codec = compressionCodecs.getCodec(path);
  FSDataInputStream fileIn = fs.open(path);
  // check if compressed
  if (codec == null) { // uncompressed
    LOG.debug("Reading from an uncompressed file \"" + path + "\"");
    return fileIn;
  } else { // compressed
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    this.openDecompressors.add(decompressor); // to be returned later using close
    if (codec instanceof SplittableCompressionCodec) {
      LOG.debug("Reading from a compressed file \"" + path + "\" with splittable compression codec");
      long end = fs.getFileStatus(path).getLen();
      return ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, 0, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
    } else {
      LOG.debug("Reading from a compressed file \"" + path + "\" with non-splittable compression codec");
      return codec.createInputStream(fileIn, decompressor);
    }
  }
}
 
Example 3
private InputStream openFile(Path path) throws IOException {
  CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
  FSDataInputStream fileIn = dfsCluster.getFileSystem().open(path);
  // check if compressed
  if (codec == null) { // uncompressed
    return fileIn;
  } else { // compressed
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    this.openDecompressors.add(decompressor); // to be returned later using close
    if (codec instanceof SplittableCompressionCodec) {
      long end = dfsCluster.getFileSystem().getFileStatus(path).getLen();
      final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, 0, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
      return cIn;
    } else {
      return codec.createInputStream(fileIn, decompressor);
    }
  }
}
 
Example 4
Source Project: Hive-XML-SerDe   Source File: SplittableXmlInputFormat.java    License: Apache License 2.0
private InputStream getInputStream(JobConf jobConf, FileSplit split) throws IOException, ClassNotFoundException {
    FSDataInputStream fsin = null;

    // open the file and seek to the start of the split
    long splitStart = split.getStart();
    long splitEnd = splitStart + split.getLength();
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(jobConf);
    fsin = fs.open(split.getPath());
    fsin.seek(splitStart);

    Configuration conf = new Configuration();
    CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecFactory.getCodec(split.getPath());
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
        return ((SplittableCompressionCodec) codec).createInputStream(fsin,
            decompressor,
            splitStart,
            splitEnd,
            SplittableCompressionCodec.READ_MODE.BYBLOCK);
    } else {
        return codec.createInputStream(fsin, decompressor);
    }
}
 
Example 5
Source Project: hadoop   Source File: KeyValueTextInputFormat.java    License: Apache License 2.0
protected boolean isSplitable(FileSystem fs, Path file) {
  final CompressionCodec codec = compressionCodecs.getCodec(file);
  if (null == codec) {
    return true;
  }
  return codec instanceof SplittableCompressionCodec;
}
 
Example 6
Source Project: hadoop   Source File: CombineFileInputFormat.java    License: Apache License 2.0
protected boolean isSplitable(FileSystem fs, Path file) {
  final CompressionCodec codec =
    new CompressionCodecFactory(fs.getConf()).getCodec(file);
  if (null == codec) {
    return true;
  }
  return codec instanceof SplittableCompressionCodec;
}
 
Example 7
Source Project: hadoop   Source File: CombineFileInputFormat.java    License: Apache License 2.0
@Override
protected boolean isSplitable(JobContext context, Path file) {
  final CompressionCodec codec =
    new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
  if (null == codec) {
    return true;
  }
  return codec instanceof SplittableCompressionCodec;
}
 
Example 8
Source Project: hadoop   Source File: KeyValueTextInputFormat.java    License: Apache License 2.0
@Override
protected boolean isSplitable(JobContext context, Path file) {
  final CompressionCodec codec =
    new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
  if (null == codec) {
    return true;
  }
  return codec instanceof SplittableCompressionCodec;
}
 
Example 9
Source Project: hadoop   Source File: TextInputFormat.java    License: Apache License 2.0
@Override
protected boolean isSplitable(JobContext context, Path file) {
  final CompressionCodec codec =
    new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
  if (null == codec) {
    return true;
  }
  return codec instanceof SplittableCompressionCodec;
}
 
Example 10
Source Project: big-c   Source File: KeyValueTextInputFormat.java    License: Apache License 2.0
protected boolean isSplitable(FileSystem fs, Path file) {
  final CompressionCodec codec = compressionCodecs.getCodec(file);
  if (null == codec) {
    return true;
  }
  return codec instanceof SplittableCompressionCodec;
}
 
Example 11
Source Project: big-c   Source File: CombineFileInputFormat.java    License: Apache License 2.0
protected boolean isSplitable(FileSystem fs, Path file) {
  final CompressionCodec codec =
    new CompressionCodecFactory(fs.getConf()).getCodec(file);
  if (null == codec) {
    return true;
  }
  return codec instanceof SplittableCompressionCodec;
}
 
Example 12
Source Project: big-c   Source File: CombineFileInputFormat.java    License: Apache License 2.0
@Override
protected boolean isSplitable(JobContext context, Path file) {
  final CompressionCodec codec =
    new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
  if (null == codec) {
    return true;
  }
  return codec instanceof SplittableCompressionCodec;
}
 
Example 13
Source Project: big-c   Source File: KeyValueTextInputFormat.java    License: Apache License 2.0
@Override
protected boolean isSplitable(JobContext context, Path file) {
  final CompressionCodec codec =
    new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
  if (null == codec) {
    return true;
  }
  return codec instanceof SplittableCompressionCodec;
}
 
Example 14
Source Project: big-c   Source File: TextInputFormat.java    License: Apache License 2.0
@Override
protected boolean isSplitable(JobContext context, Path file) {
  final CompressionCodec codec =
    new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
  if (null == codec) {
    return true;
  }
  return codec instanceof SplittableCompressionCodec;
}
 
Example 15
/**
 * Initializes readers
 *
 * @param split Split to be used (assumed to be a file split)
 * @param context context of the job
 * @throws java.io.IOException in case of errors reading from the file stream provided by Hadoop
 * @throws java.lang.InterruptedException in case of thread interruption
 *
 */
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
  FileSplit fSplit = (FileSplit) split;
  // Initialize start and end of split
  start = fSplit.getStart();
  end = start + fSplit.getLength();
  final Path file = fSplit.getPath();
  codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
  final FileSystem fs = file.getFileSystem(context.getConfiguration());
  FSDataInputStream fileIn = fs.open(file);
  // open stream
  if (isCompressedInput()) { // decompress
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
      ebr = new EthereumBlockReader(cIn, this.maxSizeEthereumBlock, this.bufferSize, this.useDirectBuffer);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn; // take pos from compressed stream
    } else {
      ebr = new EthereumBlockReader(codec.createInputStream(fileIn, decompressor), this.maxSizeEthereumBlock, this.bufferSize, this.useDirectBuffer);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    ebr = new EthereumBlockReader(fileIn, this.maxSizeEthereumBlock, this.bufferSize, this.useDirectBuffer);
    filePosition = fileIn;
  }
}
 
Example 16
/**
 * This method is experimental and derived from TextInputFormat. It is neither necessary nor recommended to compress the blockchain files. Instead, it is recommended to extract the relevant data from the blockchain files once and store it in a format suitable for analytics (including compression), such as ORC or Parquet.
 */
@Override
protected boolean isSplitable(FileSystem fs, Path file) {
  if (!(this.isSplitable)) {
    return false;
  }
  final CompressionCodec codec = compressionCodecs.getCodec(file);
  if (null == codec) {
    return true;
  }
  return codec instanceof SplittableCompressionCodec;
}
 
Example 17
/**
 * Initializes reader
 * @param split Split to use (assumed to be a file split)
 * @param context context of the job
 *
 * @throws java.io.IOException in case of errors reading from the file stream provided by Hadoop
 * @throws java.lang.InterruptedException in case of thread interruption
 *
 */
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
  FileSplit fSplit = (FileSplit) split;
  // Initialize start and end of split
  start = fSplit.getStart();
  end = start + fSplit.getLength();
  final Path file = fSplit.getPath();
  codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
  final FileSystem fs = file.getFileSystem(context.getConfiguration());
  FSDataInputStream fileIn = fs.open(file);
  // open stream
  if (isCompressedInput()) { // decompress
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
      bbr = new BitcoinBlockReader(cIn, this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer, this.readAuxPOW);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn; // take pos from compressed stream
    } else {
      bbr = new BitcoinBlockReader(codec.createInputStream(fileIn, decompressor), this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer, readAuxPOW);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    bbr = new BitcoinBlockReader(fileIn, this.maxSizeBitcoinBlock, this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer, readAuxPOW);
    filePosition = fileIn;
  }
  // seek to block start (for the case a block overlaps a split)
  try {
    bbr.seekBlockStart();
  } catch (BitcoinBlockReadException bbre) {
    LOG.error("Error reading Bitcoin blockchain data");
    LOG.error(bbre);
  }
}
 
Example 18
Source Project: components   Source File: CSVFileInputFormat.java    License: Apache License 2.0
@Override
protected boolean isSplitable(JobContext context, Path filename) {
  String text_enclosure = context.getConfiguration().get(TALEND_TEXT_ENCLOSURE);
  String talend_escape = context.getConfiguration().get(TALEND_ESCAPE);

  if ((text_enclosure != null && !text_enclosure.isEmpty()) || (talend_escape != null && !talend_escape.isEmpty())) {
    return false;
  }

  final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);
  if (null == codec) {
    return true;
  }
  return codec instanceof SplittableCompressionCodec;
}
 
Example 19
Source Project: tajo   Source File: ExampleHttpJsonLineReader.java    License: Apache License 2.0
public ExampleHttpJsonLineReader(Configuration conf,
                                 AbstractFileFragment fragment,
                                 int bufferSize) {
  this.conf = conf;
  this.fragment = (ExampleHttpFileFragment) fragment;
  this.bufferSize = bufferSize;

  CompressionCodecFactory factory = new CompressionCodecFactory(conf);
  codec = factory.getCodec(fragment.getPath());
  if (this.codec instanceof SplittableCompressionCodec) {
    // bzip2 does not support multi-thread model
    throw new TajoRuntimeException(new UnsupportedException(codec.getDefaultExtension()));
  }
}
 
Example 20
Source Project: tajo   Source File: DelimitedLineReader.java    License: Apache License 2.0
public DelimitedLineReader(Configuration conf, final AbstractFileFragment fragment, int bufferSize)
    throws IOException {
  this.fragment = fragment;
  this.conf = conf;
  this.factory = new CompressionCodecFactory(conf);
  this.codec = factory.getCodec(fragment.getPath());
  this.bufferSize = bufferSize;
  if (this.codec instanceof SplittableCompressionCodec) {
    // bzip2 does not support multi-thread model
    throw new TajoRuntimeException(
        new NotImplementedException(this.getClass() + " does not support " + this.codec.getDefaultExtension()));
  }
}
 
Example 21
Source Project: Hadoop-BAM   Source File: VCFInputFormat.java    License: MIT License
@Override
protected boolean isSplitable(JobContext context, Path filename) {
	Configuration conf = context.getConfiguration();
	final CompressionCodec codec =
			new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);
	if (codec == null) {
		return true;
	}
	if (codec instanceof BGZFCodec || codec instanceof BGZFEnhancedGzipCodec) {
		boolean splittable;
		try {
			try (FSDataInputStream in = filename.getFileSystem(conf).open(filename)) {
				splittable = BlockCompressedInputStream.isValidFile(new BufferedInputStream(in));
			}
		} catch (IOException e) {
			// can't determine if BGZF or GZIP, conservatively assume latter
			splittable = false;
		}
		if (!splittable) {
			logger.warn("{} is not splittable, consider using block-compressed gzip (BGZF)", filename);
		}
		return splittable;
	} else if (codec instanceof GzipCodec) {
		logger.warn("Using GzipCodec, which is not splittable, consider using block compressed gzip (BGZF) and BGZFCodec/BGZFEnhancedGzipCodec.");
	}
	return codec instanceof SplittableCompressionCodec;
}
 
Example 22
Source Project: pxf   Source File: ChunkRecordReader.java    License: Apache License 2.0
/**
 * Constructs a ChunkRecordReader instance.
 *
 * @param job the job configuration
 * @param split contains the file name, begin byte of the split and the
 *            bytes length
 * @throws IOException if an I/O error occurs when accessing the file or
 *             creating input stream to read from it
 */
public ChunkRecordReader(Configuration job, FileSplit split)
        throws IOException, IncompatibleInputStreamException {
    maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    validateLength(maxLineLength);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

// open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file, ChunkReader.DEFAULT_BUFFER_SIZE);
    fileLength = getInputStream().getFileLength();
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end,
                    SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new ChunkReader(cIn);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            in = new ChunkReader(codec.createInputStream(fileIn,
                    decompressor));
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new ChunkReader(fileIn);
        filePosition = fileIn;
    }
    /*
     * If this is not the first split, we always throw away first record
     * because we always (except the last split) read one extra line in
     * next() method.
     */
    if (start != 0) {
        start += in.readLine(new ChunkWritable(), maxBytesToConsume(start));
    }
    this.pos = start;
}
 
Example 23
Source Project: hadoop   Source File: LineRecordReader.java    License: Apache License 2.0
public void initialize(InputSplit genericSplit,
                       TaskAttemptContext context) throws IOException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration job = context.getConfiguration();
  this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);
  
  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null!=codec) {
    isCompressedInput = true;	
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn =
        ((SplittableCompressionCodec)codec).createInputStream(
          fileIn, decompressor, start, end,
          SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = new CompressedSplitLineReader(cIn, job,
          this.recordDelimiterBytes);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn;
    } else {
      in = new SplitLineReader(codec.createInputStream(fileIn,
          decompressor), job, this.recordDelimiterBytes);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
    filePosition = fileIn;
  }
  // If this is not the first split, we always throw away first record
  // because we always (except the last split) read one extra line in
  // next() method.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}
 
Example 24
Source Project: big-c   Source File: LineRecordReader.java    License: Apache License 2.0
public void initialize(InputSplit genericSplit,
                       TaskAttemptContext context) throws IOException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration job = context.getConfiguration();
  this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);
  
  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null!=codec) {
    isCompressedInput = true;	
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn =
        ((SplittableCompressionCodec)codec).createInputStream(
          fileIn, decompressor, start, end,
          SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = new CompressedSplitLineReader(cIn, job,
          this.recordDelimiterBytes);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn;
    } else {
      in = new SplitLineReader(codec.createInputStream(fileIn,
          decompressor), job, this.recordDelimiterBytes);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
    filePosition = fileIn;
  }
  // If this is not the first split, we always throw away first record
  // because we always (except the last split) read one extra line in
  // next() method.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}
 
Example 25
/**
 * Creates an Abstract Record Reader for Ethereum blocks
 * @param split Split to use (assumed to be a file split)
 * @param job Configuration. io.file.buffer.size: size of the in-memory buffer specified in the given Configuration; if io.file.buffer.size is not specified, the default buffer size is used. Furthermore, one may specify hadoopcryptoledger.ethereumblockinputformat.maxblocksize, which defines the maximum size an Ethereum block may have (by default 1M). If you want to experiment with performance using DirectByteBuffer instead of HeapByteBuffer, you can use "hadoopcryptoledeger.ethereumblockinputformat.usedirectbuffer" (default: false). Note that it might have some unwanted consequences, such as circumventing YARN memory management. The option is experimental and might be removed in future versions.
 * @param reporter Reporter
 *
 * @throws java.io.IOException in case of errors reading from the file stream provided by Hadoop
 *
 */
public AbstractEthereumRecordReader(FileSplit split, JobConf job, Reporter reporter) throws IOException {
  LOG.debug("Reading configuration");
  // parse configuration
  this.reporter = reporter;
  this.conf = job;
  this.maxSizeEthereumBlock = conf.getInt(AbstractEthereumRecordReader.CONF_MAXBLOCKSIZE, AbstractEthereumRecordReader.DEFAULT_MAXSIZE_ETHEREUMBLOCK);
  this.bufferSize = conf.getInt(AbstractEthereumRecordReader.CONF_BUFFERSIZE, AbstractEthereumRecordReader.DEFAULT_BUFFERSIZE);
  this.useDirectBuffer = conf.getBoolean(AbstractEthereumRecordReader.CONF_USEDIRECTBUFFER, AbstractEthereumRecordReader.DEFAULT_USEDIRECTBUFFER);
  // Initialize start and end of split
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();
  codec = new CompressionCodecFactory(job).getCodec(file);
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);
  // open stream
  if (isCompressedInput()) { // decompress
    LOG.debug("Decompressing file");
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      LOG.debug("SplittableCompressionCodec");
      final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
      ebr = new EthereumBlockReader(cIn, this.maxSizeEthereumBlock, this.bufferSize, this.useDirectBuffer);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn; // take pos from compressed stream
    } else {
      LOG.debug("Non-splittable compression codec");
      ebr = new EthereumBlockReader(codec.createInputStream(fileIn, decompressor), this.maxSizeEthereumBlock, this.bufferSize, this.useDirectBuffer);
      filePosition = fileIn;
    }
  } else {
    LOG.debug("Processing file without compression");
    fileIn.seek(start);
    ebr = new EthereumBlockReader(fileIn, this.maxSizeEthereumBlock, this.bufferSize, this.useDirectBuffer);
    filePosition = fileIn;
  }
  // initialize reader
  this.reporter.setStatus("Ready to read");
}
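
For orientation, here is a hedged sketch of how the configuration keys quoted in the Javadoc above could be set on a JobConf before the record reader is constructed; the buffer size shown is an illustrative value, while 1M is the documented default maximum block size.

import org.apache.hadoop.mapred.JobConf;

JobConf job = new JobConf();
// in-memory buffer size (illustrative value, not a project default)
job.setInt("io.file.buffer.size", 64 * 1024);
// maximum size an Ethereum block may have (1M is the documented default)
job.setInt("hadoopcryptoledger.ethereumblockinputformat.maxblocksize", 1024 * 1024);
// experimental: DirectByteBuffer instead of HeapByteBuffer (key spelled exactly as quoted above)
job.setBoolean("hadoopcryptoledeger.ethereumblockinputformat.usedirectbuffer", false);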