Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileSplit#getLength()
The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileSplit#getLength(). Each example is taken from an open-source project; the source file and its license are noted above each snippet.
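Nearly every snippet below follows the same idiom: cast the generic InputSplit to a FileSplit, then combine getStart() and getLength() to compute the half-open byte range [start, end) that the reader owns. The following is a minimal sketch of that pattern, not taken from any of the projects below; the class name MinimalSplitReader and its fields are placeholders for illustration:

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Minimal sketch of the recurring pattern; "MinimalSplitReader" is a
// hypothetical class, not part of Hadoop or of the projects below.
public abstract class MinimalSplitReader extends RecordReader<LongWritable, Text> {

    private long start; // first byte of this split within the file
    private long end;   // first byte past the end of this split
    private FSDataInputStream in;

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) genericSplit;
        start = split.getStart();        // byte offset of the split within the file
        end = start + split.getLength(); // getLength() is the split size in bytes
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        in = fs.open(file);
        in.seek(start);                  // position the stream at the split boundary
    }

    @Override
    public float getProgress() throws IOException {
        // Report the fraction of the split's byte range consumed so far.
        if (end == start) {
            return 0.0f;
        }
        return Math.min(1.0f, (in.getPos() - start) / (float) (end - start));
    }

    @Override
    public void close() throws IOException {
        if (in != null) {
            in.close();
        }
    }
}

Note that for non-splittable formats (for example the RFile and SSTable readers below), the single split covers the whole file, so getLength() is simply the file size; readers of compressed files often widen end to Long.MAX_VALUE instead, since the compressed stream cannot be seeked to a split boundary.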
Example 1
Source File: RFileRecordReader.java From datawave with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    // Note that the RFileInputFormat returns false for "isSplittable", so this should ALWAYS be 0
    start = fileSplit.getStart();
    if (start != 0) {
        throw new IOException("Cannot start reading an RFile in the middle: start=" + start);
    }
    end = fileSplit.getLength() - start;
    pos = start;

    FileOperations ops = RFileOperations.getInstance();
    String file = fileSplit.getPath().toString();
    FileSystem fs = fileSplit.getPath().getFileSystem(context.getConfiguration());
    fileIterator = ops.newReaderBuilder().forFile(file, fs, context.getConfiguration())
                    .withTableConfiguration(AccumuloConfiguration.getDefaultConfiguration())
                    .seekToBeginning().build();
}
Example 2
Source File: MsgpackRecordReader.java From laser with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    String host = fileSplit.getPath().toString();
    Configuration conf = context.getConfiguration();
    String collection = conf.get("com.b5m.laser.collection");
    int port = conf.getInt("com.b5m.laser.msgpack.port", 0);
    method = conf.get("com.b5m.laser.msgpack.input.method");
    client = new MsgpackClient(host, port, collection);
    start = fileSplit.getStart();
    splitLenth = fileSplit.getLength();
    readLength = 0;
    vClass = conf.getClass("com.b5m.laser.msgpack.input.value.class", null);
    try {
        Object[] req = new Object[1];
        req[0] = start;
        client.writeIgnoreRetValue(req, method + "|start");
    } catch (Exception e) {
        throw new IOException(e.getLocalizedMessage());
    }
}
Example 3
Source File: InterRecordReader.java From spork with Apache License 2.0
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (start != 0) {
        fileIn.seek(start);
    }
    in = new BufferedPositionedInputStream(fileIn, start);
    inData = new DataInputStream(in);
}
Example 4
Source File: SSTableSplitRecordReader.java From hadoop-sstable with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext taskAttemptContext) throws IOException {
    this.context = taskAttemptContext;
    FileSplit fileSplit = (FileSplit) genericSplit;
    this.sstableFile = fileSplit.getPath();
    // The SSTableSplitInputFormat is not splittable, so the split length is the whole file.
    this.totalFileSize = fileSplit.getLength();

    Configuration conf = context.getConfiguration();
    FileSystem fs = sstableFile.getFileSystem(conf);
    this.rawInputStream = fs.open(sstableFile);

    this.splitSize = conf.getLong(HadoopSSTableConstants.HADOOP_SSTABLE_SPLIT_MB,
            HadoopSSTableConstants.DEFAULT_SPLIT_MB) * 1024 * 1024;

    this.index = new IndexOffsetScanner(sstableFile, fs);
}
Example 5
Source File: IndexedStorage.java From spork with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MRConfiguration.LINERECORDREADER_MAXLENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (start != 0) {
        skipFirstLine = true;
        --start;
        fileIn.seek(start);
    }
    in = new IndexedStorageLineReader(fileIn, job);
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
Example 6
Source File: JsonDataValidationMapper.java From jumbune with GNU Lesser General Public License v3.0
protected void setup(Mapper.Context context) {
    String jsonString = context.getConfiguration().get(JsonDataVaildationConstants.JSON_ARGUMENT);
    String regexString = context.getConfiguration().get(JsonDataVaildationConstants.REGEX_ARGUMENT);
    String nullString = context.getConfiguration().get(JsonDataVaildationConstants.NULL_ARGUMENT);
    tupleCounter = 0L;
    cleanTupleCounter = 0L;
    recordsEmittByMap = 0L;
    // Populating JsonKey and Data type
    schema = getDatatypeExpression(jsonString);
    // Adding JsonKey given by user
    keylist = getKeyList(jsonString);
    if (!(regexString == null)) {
        // Populating JsonKey and Regex
        regex = getExpression(regexString);
    }
    if (!(nullString == null)) {
        // Populating JsonKey and NULLCONDITION
        nullMap = getExpression(nullString);
    }
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    splitStartOffset = fileSplit.getStart();
    // calculating end offset of current split
    splitEndOffset = splitStartOffset + fileSplit.getLength() - 1;
    filename = fileSplit.getPath().toUri().getPath();
    filename = filename.replaceAll(JsonDataVaildationConstants.FORWARD_SLASH,
            JsonDataVaildationConstants.JSON_DOT).substring(1, filename.length());
}
Example 7
Source File: FastaInputFormat.java From Hadoop-BAM with MIT License
public FastaRecordReader(Configuration conf, FileSplit split) throws IOException {
    setConf(conf);

    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();
    current_split_pos = 1;

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    if (codec == null) { // no codec. Uncompressed file.
        positionAtFirstRecord(fileIn);
        inputStream = fileIn;
    } else { // compressed file
        if (start != 0)
            throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
    }

    lineReader = new LineReader(inputStream);
}
Example 8
Source File: TFileRecordReader.java From spork with Apache License 2.0
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(split.getPath());
    reader = new Reader(fileIn, fs.getFileStatus(file).getLen(), job);
    scanner = reader.createScannerByByteRange(start, split.getLength());
}
Example 9
Source File: EnclosedBaseJsonRecordReader.java From spatial-framework-for-hadoop with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext taskContext) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    splitLen = fileSplit.getLength(); // using MRv2
    commonInit(fileSplit.getPath(), taskContext.getConfiguration());
}
Example 10
Source File: XmlInputFormat.java From hiped2 with Apache License 2.0
public XmlRecordReader(FileSplit split, Configuration conf) throws IOException {
    startTag = conf.get(START_TAG_KEY).getBytes("UTF-8");
    endTag = conf.get(END_TAG_KEY).getBytes("UTF-8");

    // open the file and seek to the start of the split
    start = split.getStart();
    end = start + split.getLength();
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);
    fsin = fs.open(split.getPath());
    fsin.seek(start);
}
Example 11
Source File: MultiLineInputFormat.java From dkpro-c4corpus with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    numberOfLinesToProcess = getNumLinesPerSplit(context);
    FileSplit split = (FileSplit) genericSplit;
    final Path file = split.getPath();
    Configuration conf = context.getConfiguration();
    this.maxLineLength = conf.getInt("mapreduce.input.linerecordreader.line.maxlength", Integer.MAX_VALUE);
    FileSystem fs = file.getFileSystem(conf);
    start = split.getStart();
    end = start + split.getLength();
    boolean skipFirstLine = false;
    FSDataInputStream filein = fs.open(split.getPath());

    if (start != 0) {
        skipFirstLine = true;
        --start;
        filein.seek(start);
    }
    in = new LineReader(filein, conf);
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
Example 12
Source File: ResourceRecordReader.java From webarchive-commons with Apache License 2.0
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    if (inputSplit instanceof FileSplit) {
        FileSplit fs = (FileSplit) inputSplit;
        Path fsPath = fs.getPath();
        FileSystem fSys = fsPath.getFileSystem(context.getConfiguration());
        FSDataInputStream fsdis = fSys.open(fsPath);
        String path = fsPath.getName();
        name = fsPath.getName();
        stream = new HDFSStream(fsdis);
        startOffset = fs.getStart();
        length = fs.getLength();
        long endOffset = startOffset + length;
        stream.setOffset(startOffset);
        series = new GZIPMemberSeries(stream, name, startOffset);
        GZIPResourceContainer prod = new GZIPResourceContainer(series, endOffset);
        ResourceProducer envelope;
        if (path.endsWith(".warc.gz") || path.endsWith(".wat.gz")) {
            envelope = new TransformingResourceProducer(prod, wf);
        } else if (path.endsWith(".arc.gz")) {
            envelope = new TransformingResourceProducer(prod, af);
        } else {
            throw new IOException("arguments must be arc.gz or warc.gz");
        }
        ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
        producer = new ExtractingResourceProducer(envelope, mapper);
    } else {
        throw new IOException("Need FileSplit input...");
    }
}
Example 13
Source File: JsonFileRecordReader.java From jumbune with GNU Lesser General Public License v3.0
public JsonFileRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();
    Path file = fileSplit.getPath();
    FileSystem fileSystem = file.getFileSystem(context.getConfiguration());
    fSDataInputStream = fileSystem.open(fileSplit.getPath());
    fSDataInputStream.seek(start);
}
Example 14
Source File: TFileRecordReader.java From tez with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    LOG.info("Initializing TFileRecordReader : " + fileSplit.getPath().toString());
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();

    FileSystem fs = fileSplit.getPath().getFileSystem(context.getConfiguration());
    splitPath = fileSplit.getPath();
    fin = fs.open(splitPath);
    reader = new TFile.Reader(fin, fs.getFileStatus(splitPath).getLen(), context.getConfiguration());
    scanner = reader.createScannerByByteRange(start, fileSplit.getLength());
}
Example 15
Source File: AvroArrayReader.java From spork with Apache License 2.0
@Override
public void initialize(final InputSplit isplit, final TaskAttemptContext tc) throws IOException, InterruptedException {
    FileSplit fsplit = (FileSplit) isplit;
    start = fsplit.getStart();
    end = fsplit.getStart() + fsplit.getLength();
    DatumReader<GenericData.Array<Object>> datumReader =
            new GenericDatumReader<GenericData.Array<Object>>(schema);
    reader = DataFileReader.openReader(new FsInput(fsplit.getPath(), tc.getConfiguration()), datumReader);
    reader.sync(start);
}
Example 16
Source File: QseqInputFormat.java From Hadoop-BAM with MIT License
public QseqRecordReader(Configuration conf, FileSplit split) throws IOException {
    setConf(conf);
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    if (codec == null) { // no codec. Uncompressed file.
        positionAtFirstRecord(fileIn);
        inputStream = fileIn;
    } else { // compressed file
        if (start != 0)
            throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
    }

    lineReader = new LineReader(inputStream);
}
Example 17
Source File: FastqInputFormat.java From Hadoop-BAM with MIT License
public FastqRecordReader(Configuration conf, FileSplit split) throws IOException {
    setConf(conf);
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    if (codec == null) { // no codec. Uncompressed file.
        positionAtFirstRecord(fileIn);
        inputStream = fileIn;
    } else { // compressed file
        if (start != 0)
            throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
    }

    lineReader = new LineReader(inputStream);
}
Example 18
Source File: JsonRecordReader.java From datawave with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    super.initialize(split, context);
    if (!(split instanceof FileSplit)) {
        throw new IOException("Cannot handle split type " + split.getClass().getName());
    }
    FileSplit fsplit = (FileSplit) split;
    Path file = fsplit.getPath();
    rawFileName = file.getName();
    fileURI = file.toUri();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    InputStream is = fs.open(file);
    start = fsplit.getStart();
    end = start + fsplit.getLength();
    pos = start;
    String normURI = fileURI.getScheme() + "://" + fileURI.getPath();
    setupReader(is);
    if (logger.isInfoEnabled()) {
        logger.info("Reading Json records from " + normURI + " via " + is.getClass().getName());
    }
    jsonHelper = (JsonDataTypeHelper) createHelper(context.getConfiguration());
    this.parseHeaderOnly = !jsonHelper.processExtraFields();
    jsonFlattener = jsonHelper.newFlattener();
    if (logger.isInfoEnabled()) {
        logger.info("Json flattener mode: " + jsonFlattener.getFlattenMode().name());
    }
}
Example 19
Source File: DataValidationMapper.java From jumbune with GNU Lesser General Public License v3.0
@SuppressWarnings("rawtypes") protected void setup(Mapper.Context context) throws IOException, InterruptedException { lineNumber = 0; recordsEmittByMap = 0l ; noOfToupleProcessd = 0l; cleanTupleCounter=0l; lineVWCounterNOF = 0; lineVWCounterNC = 0; lineVWCounterDT = 0; lineVWCounterRX = 0; dataValidatoinDiscripancies = new DataDiscrepanciesArrayWritable(); dataViolationWBNOF = new DataViolationWB(); dataViolationWBNC = new DataViolationWB(); dataViolationWBDT = new DataViolationWB(); dataViolationWBRX = new DataViolationWB(); dataViolationWBArr = new DataViolationWB[1]; // populating data validation parameters String dvBeanString = context.getConfiguration().get(DataValidationConstants.DATA_VALIDATION_BEAN_STRING); String validateMatrix = context.getConfiguration().get(DataValidationConstants.VALIDATE_MATRIX); Gson gson = new Gson(); Type type = new TypeToken<DataValidationBean>() { }.getType(); DataValidationBean dataValidationBean = gson.fromJson(dvBeanString, type); fieldSeparator = dataValidationBean.getFieldSeparator(); fieldSeparator = fieldSeparator.replaceAll(Constants.SPACE_SEPARATOR, Constants.SPACE); fieldValidationList = dataValidationBean.getFieldValidationList(); expectedNumOfFields = dataValidationBean.getNumOfFields(); validateArray = gson.fromJson(validateMatrix, boolean[][].class); keyPattern = new LRUCache<String, Pattern>(expectedNumOfFields) { /** * */ private static final long serialVersionUID = 8594885637377460020L; @Override protected boolean removeEldestEntry(java.util.Map.Entry<String, Pattern> eldest) { if (size() > super.getCapacity()) { return true; } return false; } }; FileSplit split = ((FileSplit) context.getInputSplit()); splitStartOffset = split.getStart(); //calculating end offset of current split splitEndOffset = splitStartOffset + split.getLength() - 1; fileName = split.getPath().toUri().getPath(); fileName = fileName.replaceAll("/", ".").substring(1, fileName.length()); }
Example 20
Source File: LongLineEventRecordReader.java From datawave with Apache License 2.0
/**
 * @param genericSplit
 * @param context
 * @throws IOException
 */
public void initializeLineReader(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LfLineReader(codec.createInputStream(fileIn), job);
        in.setNewLineIncluded(newLineIncluded);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        // Hadoop CodecFactory only checks the file suffix, let's double check for gzip since some data producers
        // may not append .gz to their files.
        InputStream iStream = GzipDetectionUtil.decompressTream(fileIn);
        Class streamClass = iStream.getClass();
        if (GZIPInputStream.class == streamClass) {
            end = Long.MAX_VALUE;
        }
        in = new LfLineReader(iStream, job);
        in.setNewLineIncluded(newLineIncluded);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}