Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileSplit#getStart()

The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileSplit#getStart(). You can go to the original project or source file by following the links above each example.
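Most of the examples share one canonical pattern: compute the split's byte range from getStart() and getLength(), open the file, and seek to the start of the split. As a point of reference, here is a minimal sketch of that pattern; the class and field names are hypothetical, while the FileSplit, Path, and FileSystem calls are the real Hadoop APIs used throughout the examples below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Minimal sketch of the canonical getStart() pattern; hypothetical class,
// real Hadoop calls.
public class SplitRangeSketch {
    private long start;   // first byte of this split
    private long end;     // first byte past this split
    private FSDataInputStream in;

    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException {
        FileSplit split = (FileSplit) genericSplit;
        start = split.getStart();
        end = start + split.getLength();
        Path file = split.getPath();
        Configuration conf = context.getConfiguration();
        FileSystem fs = file.getFileSystem(conf);
        in = fs.open(file);
        in.seek(start);   // position the stream at the start of the split
    }
}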
Example 1
Source File: GryoRecordReader.java    From tinkerpop with Apache License 2.0
@Override
public void initialize(final InputSplit genericSplit, final TaskAttemptContext context) throws IOException {
    final FileSplit split = (FileSplit) genericSplit;
    final Configuration configuration = context.getConfiguration();
    if (configuration.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null)
        this.graphFilter = VertexProgramHelper.deserialize(ConfUtil.makeApacheConfiguration(configuration), Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    this.gryoReader = GryoReader.build().mapper(
            GryoMapper.build().addRegistries(IoRegistryHelper.createRegistries(ConfUtil.makeApacheConfiguration(configuration))).create()).create();
    long start = split.getStart();
    final Path file = split.getPath();
    if (null != new CompressionCodecFactory(configuration).getCodec(file)) {
        throw new IllegalStateException("Compression is not supported for the (binary) Gryo format");
    }
    // open the file and seek to the start of the split
    this.inputStream = file.getFileSystem(configuration).open(split.getPath());
    this.splitLength = split.getLength();
    if (this.splitLength > 0) this.splitLength -= (seekToHeader(this.inputStream, start) - start);
}
 
Example 2
Source File: XmlCollectionWithTagInputFormat.java    From vxquery with Apache License 2.0
public XmlRecordReader(FileSplit split, Configuration conf) throws IOException {
    endTag = ENDING_TAG.getBytes(Charsets.UTF_8);
    startTag = STARTING_TAG.getBytes(Charsets.UTF_8);

    // open the file and seek to the start of the split
    start = split.getStart();
    // set the end of the file
    end = start + split.getLength();
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);
    FileStatus fStatus = fs.getFileStatus(file);
    blocks = fs.getFileBlockLocations(fStatus, 0, fStatus.getLen());
    // open the file and seek to the start of the split
    fsin = fs.open(split.getPath());
    fsin.seek(start);
}
 
Example 3
Source File: XMLLoader.java    From spork with Apache License 2.0
/**
 * Delegate the initialization method to the wrapped stream after extending
 * the length of the split, so that a record crossing the original split
 * boundary can still be read in full.
 */
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  key = new LongWritable();
  value = new Text();
  if (split instanceof FileSplit) {
    FileSplit fsplit = (FileSplit) split;
    originalEnd = fsplit.getStart() + fsplit.getLength();
    Path path = fsplit.getPath();
    long fileEnd = path.getFileSystem(context.getConfiguration()).getFileStatus(path).getLen();
    // Extend the split to up to ten times its original length, capped at the end of the file.
    FileSplit extendedSplit = new FileSplit(path, fsplit.getStart(),
        Math.min(fsplit.getLength() * 10, fileEnd - fsplit.getStart()), fsplit.getLocations());
    this.wrapped.initialize(extendedSplit, context);
  } else {
    throw new RuntimeException("Cannot override a split of type '" +
        split.getClass() + "'");
  }
}
 
Example 4
Source File: TestUniformSizeInputFormat.java    From big-c with Apache License 2.0
private void checkSplits(Path listFile, List<InputSplit> splits) throws IOException {
  long lastEnd = 0;

  // Verify that each split's start matches the previous split's end,
  // so that no byte range is missed
  for (InputSplit split : splits) {
    FileSplit fileSplit = (FileSplit) split;
    long start = fileSplit.getStart();
    Assert.assertEquals(lastEnd, start);
    lastEnd = start + fileSplit.getLength();
  }

  //Verify there is nothing more to read from the input file
  SequenceFile.Reader reader
          = new SequenceFile.Reader(cluster.getFileSystem().getConf(),
                  SequenceFile.Reader.file(listFile));

  try {
    reader.seek(lastEnd);
    CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
    Text srcRelPath = new Text();
    Assert.assertFalse(reader.next(srcRelPath, srcFileStatus));
  } finally {
    IOUtils.closeStream(reader);
  }
}
 
Example 5
Source File: MultiLineInputFormat.java    From dkpro-c4corpus with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException
{
    numberOfLinesToProcess = getNumLinesPerSplit(context);
    FileSplit split = (FileSplit) genericSplit;
    final Path file = split.getPath();
    Configuration conf = context.getConfiguration();
    this.maxLineLength = conf
            .getInt("mapreduce.input.linerecordreader.line.maxlength", Integer.MAX_VALUE);
    FileSystem fs = file.getFileSystem(conf);
    start = split.getStart();
    end = start + split.getLength();
    boolean skipFirstLine = false;
    FSDataInputStream filein = fs.open(split.getPath());

    if (start != 0) {
        skipFirstLine = true;
        --start;
        filein.seek(start);
    }
    in = new LineReader(filein, conf);
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
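A note on the boundary handling above: when start != 0, the reader backs up one byte and discards everything up to the next newline, because that partial line belongs to the previous split. The matching nextKeyValue() typically keeps reading while the current position has not passed end. Below is a hedged sketch of that counterpart, not the project's actual implementation; it assumes key and value are the usual LongWritable/Text pair alongside the in, pos, end, and maxLineLength fields initialized above.

public boolean nextKeyValue() throws IOException {
    if (pos > end) {
        return false;                        // past the split: the next split owns this data
    }
    key.set(pos);
    // readLine returns the number of bytes consumed, including the line terminator
    int newSize = in.readLine(value, maxLineLength,
            (int) Math.min((long) Integer.MAX_VALUE, end - pos));
    if (newSize == 0) {
        return false;                        // end of file
    }
    pos += newSize;
    return true;
}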
 
Example 6
Source File: XMLFileRecordReader.java    From jumbune with GNU Lesser General Public License v3.0
public XMLFileRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();
    Path file = fileSplit.getPath();
    FileSystem fileSystem = file.getFileSystem(context.getConfiguration());
    fSDataInputStream = fileSystem.open(fileSplit.getPath());
    fSDataInputStream.seek(start);
}
 
Example 7
Source File: JsonFileRecordReader.java    From jumbune with GNU Lesser General Public License v3.0
public JsonFileRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();
    Path file = fileSplit.getPath();
    FileSystem fileSystem = file.getFileSystem(context.getConfiguration());
    fSDataInputStream = fileSystem.open(fileSplit.getPath());
    fSDataInputStream.seek(start);
}
 
Example 8
Source File: JsonDataValidationMapper.java    From jumbune with GNU Lesser General Public License v3.0
protected void setup(Mapper.Context context) {
	String jsonString = context.getConfiguration().get(JsonDataVaildationConstants.JSON_ARGUMENT);
	String regexString = context.getConfiguration().get(JsonDataVaildationConstants.REGEX_ARGUMENT);
	String nullString = context.getConfiguration().get(JsonDataVaildationConstants.NULL_ARGUMENT);
	tupleCounter = 0L;
	cleanTupleCounter = 0L;
	recordsEmittByMap = 0L;
	//Populating JsonKey and Data type
	schema = getDatatypeExpression(jsonString);
	// Adding JsonKey given by user
	keylist = getKeyList(jsonString);

	if (regexString != null) {
		//Populating JsonKey and Regex
		regex = getExpression(regexString);
	}
	if (nullString != null) {
		//Populating JsonKey and NULLCONDITION
		nullMap = getExpression(nullString);
	}

	FileSplit fileSplit = (FileSplit)context.getInputSplit();
	splitStartOffset = fileSplit.getStart();
	//calculating end offset of current split
	splitEndOffset = splitStartOffset + fileSplit.getLength() - 1;
	filename = fileSplit.getPath().toUri().getPath();
	filename = filename.replaceAll(JsonDataVaildationConstants.FORWARD_SLASH, JsonDataVaildationConstants.JSON_DOT).substring(1, filename.length());
}
 
Example 9
Source File: PhoenixTextInputFormat.java    From phoenix with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
  rr.initialize(genericSplit, context);
  final Configuration conf = context.getConfiguration();
  final FileSplit split = (FileSplit) genericSplit;
  if (conf.getBoolean(SKIP_HEADER_KEY, false) && split.getStart() == 0) {
    LOGGER.trace("Consuming first key-value from {}", genericSplit);
    nextKeyValue();
  } else {
    LOGGER.trace("Not configured to skip header or not the first input split: {}", split);
  }
}
 
Example 10
Source File: JSONFileRecordReader.java    From ojai with Apache License 2.0
@Override
public void initialize(InputSplit arg0, TaskAttemptContext taskContext)
    throws IOException, InterruptedException {

  documentStream = null;
  it = null;
  documentCount = 0;
  key = new LongWritable();
  document = null;
  currentPos = 0;

  /* get the split */
  FileSplit split = (FileSplit) arg0;

  /* get configuration object */
  Configuration job = taskContext.getConfiguration();

  /* initialize file input stream */
  final Path path = split.getPath();
  FileSystem fs = path.getFileSystem(job);
  inputStream = fs.open(path);

  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(path);

  if (codec != null) {
    decompressor = CodecPool.getDecompressor(codec);
    inputStream = codec.createInputStream(inputStream, decompressor);
  }

  start = split.getStart();
  end = start + split.getLength();

  /* Initialize a stream reader so that it can read multiple documents from the file */

  documentStream = (JsonDocumentStream)Json.newDocumentStream(inputStream);
  it = documentStream.iterator();

}
 
Example 11
Source File: AvroArrayReader.java    From spork with Apache License 2.0
@Override
public void initialize(final InputSplit isplit, final TaskAttemptContext tc)
    throws IOException, InterruptedException {

  FileSplit fsplit = (FileSplit) isplit;
  start  = fsplit.getStart();
  end    = fsplit.getStart() + fsplit.getLength();
  DatumReader<GenericData.Array<Object>> datumReader
    = new GenericDatumReader<GenericData.Array<Object>>(schema);
  reader = DataFileReader.openReader(
      new FsInput(fsplit.getPath(), tc.getConfiguration()),
      datumReader);
  reader.sync(start);
}
 
Example 12
Source File: FastaInputFormat.java    From Hadoop-BAM with MIT License
public FastaRecordReader(Configuration conf, FileSplit split) throws IOException
{
	setConf(conf);
	file = split.getPath();
	start = split.getStart();
	end = start + split.getLength();
	current_split_pos = 1;

	FileSystem fs = file.getFileSystem(conf);
	FSDataInputStream fileIn = fs.open(file);

	CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
	CompressionCodec        codec        = codecFactory.getCodec(file);

	if (codec == null) // no codec.  Uncompressed file.
	{
		positionAtFirstRecord(fileIn);
		inputStream = fileIn;
	}
	else
	{ // compressed file
		if (start != 0)
			throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

		inputStream = codec.createInputStream(fileIn);
		end = Long.MAX_VALUE; // read until the end of the file
	}

	lineReader = new LineReader(inputStream);
}
 
Example 13
Source File: TFileRecordReader.java    From tez with Apache License 2.0
@Override public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  FileSplit fileSplit = (FileSplit) split;
  LOG.info("Initializing TFileRecordReader : " + fileSplit.getPath().toString());
  start = fileSplit.getStart();
  end = start + fileSplit.getLength();

  FileSystem fs = fileSplit.getPath().getFileSystem(context.getConfiguration());
  splitPath = fileSplit.getPath();
  fin = fs.open(splitPath);
  reader = new TFile.Reader(fin, fs.getFileStatus(splitPath).getLen(),
      context.getConfiguration());
  scanner = reader.createScannerByByteRange(start, fileSplit.getLength());
}
 
Example 14
Source File: AvroRecordReader.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration conf = context.getConfiguration();
  SeekableInput in = new FsInput(split.getPath(), conf);
  DatumReader<T> datumReader = new GenericDatumReader<T>();
  this.reader = DataFileReader.openReader(in, datumReader);
  reader.sync(split.getStart());                    // sync to start
  this.start = reader.tell();
  this.end = split.getStart() + split.getLength();
}
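The pattern here differs from the byte-oriented readers above: Avro files can only be split on sync markers, so reader.sync(split.getStart()) advances to the first sync point at or after the split start, and reader.tell() reports the resulting position. The matching read loop then consumes records until the reader passes the first sync point at or beyond end, via DataFileReader#pastSync. A minimal hedged sketch of that loop (hasNext, next, and pastSync are real Avro APIs; the method shape and the datum field are illustrative):

public boolean nextKeyValue() throws IOException {
    if (!reader.hasNext() || reader.pastSync(end)) {
        return false;             // no more records belong to this split
    }
    datum = reader.next(datum);   // reuse the previous object to reduce allocation
    return true;
}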
 
Example 15
Source File: UnenclosedBaseJsonRecordReader.java    From spatial-framework-for-hadoop with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext taskContext)
        throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    start = fileSplit.getStart();
    end = fileSplit.getLength() + start;
    Path filePath = fileSplit.getPath();
    commonInit(filePath, taskContext.getConfiguration());
}
 
Example 16
Source File: DelimitedVectorInputFormat.java    From mrgeo with Apache License 2.0
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException
{
  boolean useNLineFormat = context.getConfiguration().getBoolean(USE_NLINE_FORMAT, false);
  if (useNLineFormat)
  {
    List<InputSplit> splits = new NLineInputFormat().getSplits(context);
    // This is a workaround for what appears to be a bug in how NLineInputFormat
    // computes its splits. When there are multiple splits in a file, it seems
    // the start position in the last split is off by one. Note that this corrective
    // code needs to check the last split for each different file that appears
    // in the list of splits.
    for (int index = 2; index < splits.size(); index++)
    {
      FileSplit previousSplit = (FileSplit) splits.get(index - 1);
      FileSplit currSplit = (FileSplit) splits.get(index);
      // If this index is the last split, or we've moved on to splits from a different
      // file, then we need to adjust the last split for that file.
      int lastFileIndex = -1;
      if (index == splits.size() - 1)
      {
        lastFileIndex = index;
      }
      else if (!currSplit.getPath().equals(previousSplit.getPath()))
      {
        lastFileIndex = index - 1;
      }
      if (lastFileIndex >= 2)
      {
        FileSplit lastFileSplit = (FileSplit) splits.get(lastFileIndex);
        FileSplit priorSplit = (FileSplit) splits.get(lastFileIndex - 1);
        if (lastFileSplit.getPath().equals(priorSplit.getPath()))
        {
          if (priorSplit.getPath().equals(lastFileSplit.getPath()) &&
              priorSplit.getStart() + priorSplit.getLength() < lastFileSplit.getStart())
          {
            // Adjust the start of previous split
            FileSplit replacement = new FileSplit(lastFileSplit.getPath(),
                priorSplit.getStart() + priorSplit.getLength(),
                lastFileSplit.getLength() + 1,
                lastFileSplit.getLocations());
            log.info("Replacing split: " + lastFileSplit);
            log.info("  With split: " + replacement);
            splits.set(lastFileIndex, replacement);
          }
        }
      }
    }
    return splits;
  }
  else
  {
    return new TextInputFormat().getSplits(context);
  }
}
 
Example 17
Source File: DataValidationMapper.java    From jumbune with GNU Lesser General Public License v3.0
@SuppressWarnings("rawtypes")
protected void setup(Mapper.Context context) throws IOException, InterruptedException {
	lineNumber = 0;		
	recordsEmittByMap = 0L;

	noOfToupleProcessd = 0L;
	cleanTupleCounter = 0L;
	lineVWCounterNOF = 0;
	lineVWCounterNC = 0;
	lineVWCounterDT = 0;
	lineVWCounterRX = 0;
	dataValidatoinDiscripancies = new DataDiscrepanciesArrayWritable();
	dataViolationWBNOF = new DataViolationWB();
	dataViolationWBNC = new DataViolationWB();
	dataViolationWBDT = new DataViolationWB();
	dataViolationWBRX = new DataViolationWB();
	
	dataViolationWBArr = new DataViolationWB[1];
	
	// populating data validation parameters
	String dvBeanString = context.getConfiguration().get(DataValidationConstants.DATA_VALIDATION_BEAN_STRING);
	String validateMatrix = context.getConfiguration().get(DataValidationConstants.VALIDATE_MATRIX);
	Gson gson = new Gson();
	Type type = new TypeToken<DataValidationBean>() {
	}.getType();
	DataValidationBean dataValidationBean = gson.fromJson(dvBeanString, type);

	fieldSeparator = dataValidationBean.getFieldSeparator();
	fieldSeparator = fieldSeparator.replaceAll(Constants.SPACE_SEPARATOR, Constants.SPACE);
	fieldValidationList = dataValidationBean.getFieldValidationList();
	expectedNumOfFields = dataValidationBean.getNumOfFields();
	validateArray = gson.fromJson(validateMatrix, boolean[][].class);
	keyPattern = new LRUCache<String, Pattern>(expectedNumOfFields) {

		private static final long serialVersionUID = 8594885637377460020L;

		@Override
		protected boolean removeEldestEntry(java.util.Map.Entry<String, Pattern> eldest) {
			return size() > super.getCapacity();
		}
	};
	

	FileSplit split = ((FileSplit) context.getInputSplit());		
	splitStartOffset = split.getStart();
	//calculating end offset of current split
	splitEndOffset = splitStartOffset + split.getLength() - 1;
	fileName = split.getPath().toUri().getPath();
	fileName = fileName.replaceAll("/", ".").substring(1, fileName.length());
}
 
Example 18
Source File: DelimitedTextInputFormat.java    From marklogic-contentpump with Apache License 2.0
public List<InputSplit> getSplits(JobContext job) throws IOException {
    boolean delimSplit = isSplitInput(job.getConfiguration());
    //if delimSplit is true, size of each split is determined by 
    //Math.max(minSize, Math.min(maxSize, blockSize)) in FileInputFormat
    List<InputSplit> splits = super.getSplits(job);
    if (!delimSplit) {
        return splits;
    }

    if (splits.size() >= SPLIT_COUNT_LIMIT) {
        //if #splits > 1 million, there is enough parallelism
        //therefore no point to split
        LOG.warn("Exceeding SPLIT_COUNT_LIMIT, input_split is off:"
            + SPLIT_COUNT_LIMIT);
        DefaultStringifier.store(job.getConfiguration(), false, ConfigConstants.CONF_SPLIT_INPUT);
        return splits;
    }
    // add header info into splits
    List<InputSplit> populatedSplits = new ArrayList<InputSplit>();
    LOG.info(splits.size() + " DelimitedSplits generated");
    Configuration conf = job.getConfiguration();
    char delimiter = 0;
    ArrayList<Text> hlist = new ArrayList<Text>();
    for (InputSplit file: splits) {
        FileSplit fsplit = ((FileSplit)file);
        Path path = fsplit.getPath();
        FileSystem fs = path.getFileSystem(conf);
        
        if (fsplit.getStart() == 0) {
            // parse this input split to get the header
            FSDataInputStream fileIn = fs.open(path);

            String delimStr = conf.get(ConfigConstants.CONF_DELIMITER,
                ConfigConstants.DEFAULT_DELIMITER);
            if (delimStr.length() == 1) {
                delimiter = delimStr.charAt(0);
            } else {
                LOG.error("Incorrect delimitor: " + delimiter
                    + ". Expects single character.");
            }
            String encoding = conf.get(
                MarkLogicConstants.OUTPUT_CONTENT_ENCODING,
                MarkLogicConstants.DEFAULT_OUTPUT_CONTENT_ENCODING);
            InputStreamReader instream = new InputStreamReader(fileIn, encoding);
            CSVParser parser = new CSVParser(instream, CSVParserFormatter
                    .getFormat(delimiter, DelimitedTextReader.encapsulator,
                            true, true));
            Iterator<CSVRecord> it = parser.iterator();
            
            String[] header = null;
            if (it.hasNext()) {
                CSVRecord record = it.next();
                Iterator<String> recordIterator = record.iterator();
                int recordSize = record.size();
                header = new String[recordSize];
                for (int i = 0; i < recordSize; i++) {
                    if (recordIterator.hasNext()) {
                        header[i] = recordIterator.next();
                    } else {
                        throw new IOException("Record size doesn't match the real size");
                    }
                }
                
                EncodingUtil.handleBOMUTF8(header, 0);
                
                hlist.clear();
                for (String s : header) {
                    hlist.add(new Text(s));
                }
            }
            instream.close();
        }
        
        DelimitedSplit ds = new DelimitedSplit(new TextArrayWritable(
            hlist.toArray(new Text[hlist.size()])), path,
            fsplit.getStart(), fsplit.getLength(),
            fsplit.getLocations());
        populatedSplits.add(ds);
    }
    
    return populatedSplits;
}
 
Example 19
Source File: BAMInputFormat.java    From Hadoop-BAM with MIT License
private int addProbabilisticSplits(
        List<InputSplit> splits, int i, List<InputSplit> newSplits,
        Configuration cfg)
        throws IOException {
    final Path path = ((FileSplit) splits.get(i)).getPath();
    try (final SeekableStream sin =
            WrapSeekable.openPath(path.getFileSystem(cfg), path)) {

        final BAMSplitGuesser guesser = new BAMSplitGuesser(sin, cfg);

        FileVirtualSplit previousSplit = null;

        for (; i < splits.size(); ++i) {
            FileSplit fspl = (FileSplit) splits.get(i);
            if (!fspl.getPath().equals(path))
                break;

            long beg =       fspl.getStart();
            long end = beg + fspl.getLength();

            long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);

            // As the guesser goes to the next BGZF block before looking for BAM
            // records, the ending BGZF blocks have to always be traversed fully.
            // Hence force the length to be 0xffff, the maximum possible.
            long alignedEnd = end << 16 | 0xffff;

            if (alignedBeg == end) {
                // No records detected in this split: merge it to the previous one.
                // This could legitimately happen e.g. if we have a split that is
                // so small that it only contains the middle part of a BGZF block.
                //
                // Of course, if it's the first split, then this is simply not a
                // valid BAM file.
                //
                // FIXME: In theory, any number of splits could only contain parts
                // of the BAM header before we start to see splits that contain BAM
                // records. For now, we require that the split size is at least as
                // big as the header and don't handle that case.
                if (previousSplit == null)
                    throw new IOException("'" + path + "': " +
                        "no reads in first split: bad BAM file or tiny split size?");

                previousSplit.setEndVirtualOffset(alignedEnd);
            } else {
                previousSplit = new FileVirtualSplit(
                        path, alignedBeg, alignedEnd, fspl.getLocations());
                if (logger.isDebugEnabled()) {
                    final long byteOffset   = alignedBeg >>> 16;
                    final long recordOffset = alignedBeg & 0xffff;
                    logger.debug(
                        "Split {}: byte offset: {} record offset: {}, virtual offset: {}",
                        i, byteOffset, recordOffset, alignedBeg);
                }
                newSplits.add(previousSplit);
            }
        }
    }
    return i;
}
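The bit arithmetic in this example encodes BGZF virtual file offsets: the upper 48 bits hold the compressed byte offset of a BGZF block, and the lower 16 bits hold the offset within that block's uncompressed data, which is why end << 16 | 0xffff addresses the last possible record position in the block starting at end. The helpers below are hypothetical, added only to illustrate the packing that the example manipulates inline:

// Hypothetical helpers; not part of Hadoop-BAM.
static long toVirtualOffset(long compressedBlockStart, int offsetInBlock) {
    return (compressedBlockStart << 16) | (offsetInBlock & 0xffff);
}

static long compressedBlockStart(long virtualOffset) {
    return virtualOffset >>> 16;            // matches 'alignedBeg >>> 16' in the example
}

static int offsetInBlock(long virtualOffset) {
    return (int) (virtualOffset & 0xffff);  // matches 'alignedBeg & 0xffff' in the example
}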
 
Example 20
Source File: ParquetInputSplit.java    From parquet-mr with Apache License 2.0
/**
 * Builds a {@code ParquetInputSplit} from a mapreduce {@link FileSplit}.
 *
 * @param split a mapreduce FileSplit
 * @return a ParquetInputSplit
 * @throws IOException if there is an error while creating the Parquet split
 */
static ParquetInputSplit from(FileSplit split) throws IOException {
  return new ParquetInputSplit(split.getPath(),
      split.getStart(), split.getStart() + split.getLength(),
      split.getLength(), split.getLocations(), null);
}