Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileSplit#getLength()

The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileSplit#getLength(). Each example is drawn from an open-source project; the source file, project, and license are noted above each snippet.
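Before the project examples, here is a minimal, self-contained sketch of the pattern most of them share: cast the InputSplit to FileSplit, derive the byte range [start, start + getLength()) that this reader owns, open the file, and seek to the start. The class name ByteRangeRecordReader and the line-oriented key/value choice are illustrative, not taken from any project below, and the sketch deliberately omits the split-boundary record handling that Examples 5 and 11 demonstrate.

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

public class ByteRangeRecordReader extends RecordReader<LongWritable, Text> {

    private long start;  // first byte of this split
    private long end;    // first byte past this split
    private long pos;    // current byte position in the file
    private FSDataInputStream fileIn;
    private LineReader in;
    private final LongWritable key = new LongWritable();
    private final Text value = new Text();

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
        FileSplit split = (FileSplit) genericSplit;
        start = split.getStart();            // byte offset of the split within the file
        end = start + split.getLength();     // getLength() is the number of bytes in the split
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        fileIn = fs.open(file);
        fileIn.seek(start);
        in = new LineReader(fileIn, context.getConfiguration());
        pos = start;
    }

    @Override
    public boolean nextKeyValue() throws IOException {
        if (pos >= end) {
            return false;                    // this reader's byte range is exhausted
        }
        key.set(pos);
        int newSize = in.readLine(value);
        if (newSize == 0) {
            return false;                    // end of file
        }
        pos += newSize;
        return true;
    }

    @Override
    public LongWritable getCurrentKey() { return key; }

    @Override
    public Text getCurrentValue() { return value; }

    @Override
    public float getProgress() {
        return end == start ? 0.0f : Math.min(1.0f, (pos - start) / (float) (end - start));
    }

    @Override
    public void close() throws IOException {
        if (in != null) {
            in.close();
        }
    }
}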
Example 1
Source File: RFileRecordReader.java    From datawave with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    
    // Note that the RFileInputFormat returns false for "isSplittable", so this should ALWAYS be 0
    start = fileSplit.getStart();
    if (start != 0) {
        throw new IOException("Cannot start reading an RFile in the middle: start=" + start);
    }
    end = fileSplit.getLength() - start;
    pos = start;
    
    FileOperations ops = RFileOperations.getInstance();
    String file = fileSplit.getPath().toString();
    FileSystem fs = fileSplit.getPath().getFileSystem(context.getConfiguration());
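    // build an RFile reader over the entire file; seekToBeginning() positions it at the first key/value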
    fileIterator = ops.newReaderBuilder().forFile(file, fs, context.getConfiguration())
                    .withTableConfiguration(AccumuloConfiguration.getDefaultConfiguration()).seekToBeginning().build();
}
 
Example 2
Source File: MsgpackRecordReader.java    From laser with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
		throws IOException, InterruptedException {
	FileSplit fileSplit = (FileSplit) split;
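	// note: this input format repurposes the split path as the msgpack server host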
	String host = fileSplit.getPath().toString();
	Configuration conf = context.getConfiguration();
	String collection = conf.get("com.b5m.laser.collection");
	int port = conf.getInt("com.b5m.laser.msgpack.port", 0);
	method = conf.get("com.b5m.laser.msgpack.input.method");
	client = new MsgpackClient(host, port, collection);
	start = fileSplit.getStart();
	splitLenth = fileSplit.getLength();
	readLength = 0;
	vClass = conf.getClass("com.b5m.laser.msgpack.input.value.class", null);
	try {
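		// hand the split's start offset to the server so reading begins at the right record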
		Object[] req = new Object[1];
		req[0] = start;
		client.writeIgnoreRetValue(req, method + "|start");
	} catch (Exception e) {
		throw new IOException(e);
	}
}
 
Example 3
Source File: InterRecordReader.java    From spork with Apache License 2.0
public void initialize(InputSplit genericSplit,
                       TaskAttemptContext context) throws IOException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration job = context.getConfiguration();
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(split.getPath());
  if (start != 0) {
      fileIn.seek(start);
  }
  in = new BufferedPositionedInputStream(fileIn, start);
  inData = new DataInputStream(in);
}
 
Example 4
Source File: SSTableSplitRecordReader.java    From hadoop-sstable with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext taskAttemptContext) throws IOException {

    this.context = taskAttemptContext;
    FileSplit fileSplit = (FileSplit) genericSplit;
    this.sstableFile = fileSplit.getPath();
    // The SSTableSplitInputFormat is not splittable, so the split length is the whole file.
    this.totalFileSize = fileSplit.getLength();


    Configuration conf = context.getConfiguration();
    FileSystem fs = sstableFile.getFileSystem(conf);
    this.rawInputStream = fs.open(sstableFile);

    this.splitSize = conf.getLong(HadoopSSTableConstants.HADOOP_SSTABLE_SPLIT_MB,
            HadoopSSTableConstants.DEFAULT_SPLIT_MB) * 1024 * 1024;

    this.index = new IndexOffsetScanner(sstableFile, fs);
}
 
Example 5
Source File: IndexedStorage.java    From spork with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
throws IOException, InterruptedException {

    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MRConfiguration.LINERECORDREADER_MAXLENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
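    // A split rarely begins exactly on a line boundary: if start != 0, back up one
    // byte and discard the partial first line, which the previous split's reader
    // already emitted in full.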
    boolean skipFirstLine = false;
    if (start != 0) {
        skipFirstLine = true;
        --start;
        fileIn.seek(start);
    }
    in = new IndexedStorageLineReader(fileIn, job);
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int)Math.min((long)Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
 
Example 6
Source File: JsonDataValidationMapper.java    From jumbune with GNU Lesser General Public License v3.0
protected void setup(Mapper.Context context){		
	String jsonString = context.getConfiguration().get(JsonDataVaildationConstants.JSON_ARGUMENT);
	String regexString = context.getConfiguration().get(JsonDataVaildationConstants.REGEX_ARGUMENT);
	String nullString = context.getConfiguration().get(JsonDataVaildationConstants.NULL_ARGUMENT);
	tupleCounter = 0L;
	cleanTupleCounter = 0L;
	recordsEmittByMap = 0L;
	//Populating JsonKey and Data type
	schema = getDatatypeExpression(jsonString);
	// Adding JsonKey given by user
	keylist = getKeyList(jsonString);

	if (regexString != null) {
		//Populating JsonKey and Regex
		regex = getExpression(regexString);
	}
	if (nullString != null) {
		//Populating JsonKey and NULLCONDITION
		nullMap = getExpression(nullString);
	}

	FileSplit fileSplit = (FileSplit)context.getInputSplit();
	splitStartOffset = fileSplit.getStart();
	//calculating the end offset (inclusive) of the current split
	splitEndOffset = splitStartOffset + fileSplit.getLength() - 1;
	filename = fileSplit.getPath().toUri().getPath();
	filename = filename.replaceAll(JsonDataVaildationConstants.FORWARD_SLASH, JsonDataVaildationConstants.JSON_DOT).substring(1, filename.length());
}
 
Example 7
Source File: FastaInputFormat.java    From Hadoop-BAM with MIT License
public FastaRecordReader(Configuration conf, FileSplit split) throws IOException
{
	setConf(conf);
	file = split.getPath();
	start = split.getStart();
	end = start + split.getLength();
	current_split_pos = 1;

	FileSystem fs = file.getFileSystem(conf);
	FSDataInputStream fileIn = fs.open(file);

	CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
	CompressionCodec        codec        = codecFactory.getCodec(file);

	if (codec == null) // no codec.  Uncompressed file.
	{
		positionAtFirstRecord(fileIn);
		inputStream = fileIn;
	}
	else
	{ // compressed file
		if (start != 0)
			throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

		inputStream = codec.createInputStream(fileIn);
		end = Long.MAX_VALUE; // read until the end of the file
	}

	lineReader = new LineReader(inputStream);
}
 
Example 8
Source File: TFileRecordReader.java    From spork with Apache License 2.0
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
                throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(split.getPath());
    reader = new Reader(fileIn, fs.getFileStatus(file).getLen(), job);
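    // limit the scanner to the byte range owned by this split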
    scanner = reader.createScannerByByteRange(start, split.getLength());
}
 
Example 9
Source File: EnclosedBaseJsonRecordReader.java    From spatial-framework-for-hadoop with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext taskContext)
			throws IOException, InterruptedException {
	FileSplit fileSplit = (FileSplit) split;
	splitLen = fileSplit.getLength();  // using MRv2
	commonInit(fileSplit.getPath(), taskContext.getConfiguration());
}
 
Example 10
Source File: XmlInputFormat.java    From hiped2 with Apache License 2.0
public XmlRecordReader(FileSplit split, Configuration conf)
    throws IOException {
  startTag = conf.get(START_TAG_KEY).getBytes("UTF-8");
  endTag = conf.get(END_TAG_KEY).getBytes("UTF-8");

  // open the file and seek to the start of the split
  start = split.getStart();
  end = start + split.getLength();
  Path file = split.getPath();
  FileSystem fs = file.getFileSystem(conf);
  fsin = fs.open(split.getPath());
  fsin.seek(start);
}
 
Example 11
Source File: MultiLineInputFormat.java    From dkpro-c4corpus with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException
{
    numberOfLinesToProcess = getNumLinesPerSplit(context);
    FileSplit split = (FileSplit) genericSplit;
    final Path file = split.getPath();
    Configuration conf = context.getConfiguration();
    this.maxLineLength = conf
            .getInt("mapreduce.input.linerecordreader.line.maxlength", Integer.MAX_VALUE);
    FileSystem fs = file.getFileSystem(conf);
    start = split.getStart();
    end = start + split.getLength();
    boolean skipFirstLine = false;
    FSDataInputStream filein = fs.open(split.getPath());

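    // as in Example 5: back up one byte and discard the partial first line when the
    // split does not start at the beginning of the file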
    if (start != 0) {
        skipFirstLine = true;
        --start;
        filein.seek(start);
    }
    in = new LineReader(filein, conf);
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
 
Example 12
Source File: ResourceRecordReader.java    From webarchive-commons with Apache License 2.0
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
		throws IOException, InterruptedException {
	if (inputSplit instanceof FileSplit) {
		FileSplit fs = (FileSplit) inputSplit;
		Path fsPath = fs.getPath();
		FileSystem fSys = fsPath.getFileSystem(context.getConfiguration());
		FSDataInputStream fsdis = fSys.open(fsPath);
		String path = fsPath.getName();
		name = fsPath.getName();
		stream = new HDFSStream(fsdis);
		startOffset = fs.getStart();
		length = fs.getLength();
		long endOffset = startOffset + length;
		stream.setOffset(startOffset);
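		// expose the split's byte range as a series of gzip members, then map each member to a resource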
		series = new GZIPMemberSeries(stream, name, startOffset);
		GZIPResourceContainer prod =
				new GZIPResourceContainer(series, endOffset);
		ResourceProducer envelope;
		if (path.endsWith(".warc.gz") || path.endsWith(".wat.gz")) {
			envelope = new TransformingResourceProducer(prod, wf);
		} else if (path.endsWith(".arc.gz")) {
			envelope = new TransformingResourceProducer(prod, af);
		} else {
			throw new IOException("arguments must be arc.gz or warc.gz");
		}
		ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
		producer = new ExtractingResourceProducer(envelope, mapper);
	} else {
		throw new IOException("Need FileSplit input...");
	}
}
 
Example 13
Source File: JsonFileRecordReader.java    From jumbune with GNU Lesser General Public License v3.0
public JsonFileRecordReader(InputSplit split, TaskAttemptContext context) throws IOException{
	FileSplit fileSplit = (FileSplit) split;
	start = fileSplit.getStart();
	end = start + fileSplit.getLength();
	Path file = fileSplit.getPath();
	FileSystem fileSystem = file.getFileSystem(context.getConfiguration());
	fSDataInputStream = fileSystem.open(fileSplit.getPath());
	fSDataInputStream.seek(start);
}
 
Example 14
Source File: TFileRecordReader.java    From tez with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  FileSplit fileSplit = (FileSplit) split;
  LOG.info("Initializing TFileRecordReader : " + fileSplit.getPath().toString());
  start = fileSplit.getStart();
  end = start + fileSplit.getLength();

  FileSystem fs = fileSplit.getPath().getFileSystem(context.getConfiguration());
  splitPath = fileSplit.getPath();
  fin = fs.open(splitPath);
  reader = new TFile.Reader(fin, fs.getFileStatus(splitPath).getLen(),
      context.getConfiguration());
  scanner = reader.createScannerByByteRange(start, fileSplit.getLength());
}
 
Example 15
Source File: AvroArrayReader.java    From spork with Apache License 2.0
@Override
public void initialize(final InputSplit isplit, final TaskAttemptContext tc)
    throws IOException, InterruptedException {

  FileSplit fsplit = (FileSplit) isplit;
  start  = fsplit.getStart();
  end    = fsplit.getStart() + fsplit.getLength();
  DatumReader<GenericData.Array<Object>> datumReader
    = new GenericDatumReader<GenericData.Array<Object>>(schema);
  reader = DataFileReader.openReader(
      new FsInput(fsplit.getPath(), tc.getConfiguration()),
      datumReader);
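  // move to the first Avro sync marker at or after the split start so reading begins on a block boundary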
  reader.sync(start);
}
 
Example 16
Source File: QseqInputFormat.java    From Hadoop-BAM with MIT License
public QseqRecordReader(Configuration conf, FileSplit split) throws IOException
{
	setConf(conf);
	file = split.getPath();
	start = split.getStart();
	end = start + split.getLength();

	FileSystem fs = file.getFileSystem(conf);
	FSDataInputStream fileIn = fs.open(file);

	CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
	CompressionCodec        codec        = codecFactory.getCodec(file);

	if (codec == null) // no codec.  Uncompressed file.
	{
		positionAtFirstRecord(fileIn);
		inputStream = fileIn;
	}
	else
	{ // compressed file
		if (start != 0)
			throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

		inputStream = codec.createInputStream(fileIn);
		end = Long.MAX_VALUE; // read until the end of the file
	}

	lineReader = new LineReader(inputStream);
}
 
Example 17
Source File: FastqInputFormat.java    From Hadoop-BAM with MIT License
public FastqRecordReader(Configuration conf, FileSplit split) throws IOException
{
	setConf(conf);
	file = split.getPath();
	start = split.getStart();
	end = start + split.getLength();

	FileSystem fs = file.getFileSystem(conf);
	FSDataInputStream fileIn = fs.open(file);

	CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
	CompressionCodec        codec        = codecFactory.getCodec(file);

	if (codec == null) // no codec.  Uncompressed file.
	{
		positionAtFirstRecord(fileIn);
		inputStream = fileIn;
	}
	else
	{ // compressed file
		if (start != 0)
			throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

		inputStream = codec.createInputStream(fileIn);
		end = Long.MAX_VALUE; // read until the end of the file
	}

	lineReader = new LineReader(inputStream);
}
 
Example 18
Source File: JsonRecordReader.java    From datawave with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    
    super.initialize(split, context);
    
    if (!(split instanceof FileSplit)) {
        throw new IOException("Cannot handle split type " + split.getClass().getName());
    }
    
    FileSplit fsplit = (FileSplit) split;
    Path file = fsplit.getPath();
    rawFileName = file.getName();
    fileURI = file.toUri();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    InputStream is = fs.open(file);
    start = fsplit.getStart();
    end = start + fsplit.getLength();
    pos = start;
    
    String normURI = fileURI.getScheme() + "://" + fileURI.getPath();
    
    setupReader(is);
    
    if (logger.isInfoEnabled()) {
        logger.info("Reading Json records from " + normURI + " via " + is.getClass().getName());
    }
    
    jsonHelper = (JsonDataTypeHelper) createHelper(context.getConfiguration());
    this.parseHeaderOnly = !jsonHelper.processExtraFields();
    jsonFlattener = jsonHelper.newFlattener();
    
    if (logger.isInfoEnabled()) {
        logger.info("Json flattener mode: " + jsonFlattener.getFlattenMode().name());
    }
}
 
Example 19
Source File: DataValidationMapper.java    From jumbune with GNU Lesser General Public License v3.0
@SuppressWarnings("rawtypes")
protected void setup(Mapper.Context context) throws IOException, InterruptedException {
	lineNumber = 0;
	recordsEmittByMap = 0L;
	noOfToupleProcessd = 0L;
	cleanTupleCounter = 0L;
	lineVWCounterNOF = 0;
	lineVWCounterNC = 0;
	lineVWCounterDT = 0;
	lineVWCounterRX = 0;
	dataValidatoinDiscripancies = new DataDiscrepanciesArrayWritable();
	dataViolationWBNOF = new DataViolationWB();
	dataViolationWBNC = new DataViolationWB();
	dataViolationWBDT = new DataViolationWB();
	dataViolationWBRX = new DataViolationWB();
	
	dataViolationWBArr = new DataViolationWB[1];
	
	// populating data validation parameters
	String dvBeanString = context.getConfiguration().get(DataValidationConstants.DATA_VALIDATION_BEAN_STRING);
	String validateMatrix = context.getConfiguration().get(DataValidationConstants.VALIDATE_MATRIX);
	Gson gson = new Gson();
	Type type = new TypeToken<DataValidationBean>() {
	}.getType();
	DataValidationBean dataValidationBean = gson.fromJson(dvBeanString, type);

	fieldSeparator = dataValidationBean.getFieldSeparator();
	fieldSeparator = fieldSeparator.replaceAll(Constants.SPACE_SEPARATOR, Constants.SPACE);
	fieldValidationList = dataValidationBean.getFieldValidationList();
	expectedNumOfFields = dataValidationBean.getNumOfFields();
	validateArray = gson.fromJson(validateMatrix, boolean[][].class);
	keyPattern = new LRUCache<String, Pattern>(expectedNumOfFields) {

		private static final long serialVersionUID = 8594885637377460020L;

		@Override
		protected boolean removeEldestEntry(java.util.Map.Entry<String, Pattern> eldest) {
			return size() > super.getCapacity();
		}
	};
	

	FileSplit split = ((FileSplit) context.getInputSplit());		
	splitStartOffset = split.getStart();
	//calculating the end offset (inclusive) of the current split
	splitEndOffset = splitStartOffset + split.getLength() - 1;
	fileName = split.getPath().toUri().getPath();
	fileName = fileName.replaceAll("/", ".").substring(1, fileName.length());
}
 
Example 20
Source File: LongLineEventRecordReader.java    From datawave with Apache License 2.0
/**
 * @param genericSplit
 *            the split to read, expected to be a FileSplit
 * @param context
 *            the task attempt context supplying the job configuration
 * @throws IOException
 *             if the underlying file cannot be opened or read
 */
public void initializeLineReader(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    
    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LfLineReader(codec.createInputStream(fileIn), job);
        in.setNewLineIncluded(newLineIncluded);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        
        // Hadoop CodecFactory only checks the file suffix, let's double check for gzip since some data producers
        // may not append .gz to their files.
        InputStream iStream = GzipDetectionUtil.decompressTream(fileIn);
        Class<?> streamClass = iStream.getClass();
        if (GZIPInputStream.class == streamClass) {
            end = Long.MAX_VALUE;
        }
        
        in = new LfLineReader(iStream, job);
        in.setNewLineIncluded(newLineIncluded);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}