Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileSplit#getLocations()

The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileSplit#getLocations(). Each example is taken from an open-source project; the source file, project, and license are noted above it.
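
FileSplit#getLocations() returns the hostnames that hold a local replica of the split's byte range; schedulers use these as hints for data-local task placement. Before the project examples, here is a minimal sketch of the basic pattern (the inputFormat and jobContext variables are assumptions, standing in for any mapreduce InputFormat and its JobContext):

// Minimal sketch: list the hosts that store each split locally.
// Assumes the splits come from an InputFormat such as FileInputFormat.
for (InputSplit split : inputFormat.getSplits(jobContext)) {
  FileSplit fileSplit = (FileSplit) split;
  // getLocations() returns hostnames with a local replica of this byte
  // range; the array may be empty, and the hosts are hints, not guarantees.
  for (String hostname : fileSplit.getLocations()) {
    System.out.println(fileSplit.getPath() + " -> " + hostname);
  }
}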
Example 1
Source File: TeraScheduler.java    From hadoop with Apache License 2.0 (identical code also appears in the pravega-samples, big-c, and incubator-tez projects)
public TeraScheduler(FileSplit[] realSplits,
                     Configuration conf) throws IOException {
  this.realSplits = realSplits;
  this.slotsPerHost = conf.getInt(TTConfig.TT_MAP_SLOTS, 4);
  Map<String, Host> hostTable = new HashMap<String, Host>();
  splits = new Split[realSplits.length];
  for(FileSplit realSplit: realSplits) {
    Split split = new Split(realSplit.getPath().toString());
    splits[remainingSplits++] = split;
    for(String hostname: realSplit.getLocations()) {
      Host host = hostTable.get(hostname);
      if (host == null) {
        host = new Host(hostname);
        hostTable.put(hostname, host);
        hosts.add(host);
      }
      host.splits.add(split);
      split.locations.add(host);
    }
  }
}
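
The constructor builds a bidirectional index from the reported locations: each Host records the splits with a local replica, and each Split records its candidate hosts. The rest of the scheduler (not shown) can then walk this graph to assign splits to data-local hosts.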
 
Example 2
Source File: CRAMInputFormat.java    From Hadoop-BAM with MIT License
public List<InputSplit> getSplits(List<InputSplit> splits, Configuration conf)
    throws IOException {
  // update splits to align with CRAM container boundaries
  List<InputSplit> newSplits = new ArrayList<InputSplit>();
  Map<Path, List<Long>> fileToOffsets = new HashMap<Path, List<Long>>();
  for (InputSplit split : splits) {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();
    List<Long> containerOffsets = fileToOffsets.get(path);
    if (containerOffsets == null) {
      containerOffsets = getContainerOffsets(conf, path);
      fileToOffsets.put(path, containerOffsets);
    }
    long newStart = nextContainerOffset(containerOffsets, fileSplit.getStart());
    long newEnd = nextContainerOffset(containerOffsets, fileSplit.getStart() +
        fileSplit.getLength());
    long newLength = newEnd - newStart;
    if (newLength == 0) { // split is wholly within a container
      continue;
    }
    FileSplit newSplit = new FileSplit(fileSplit.getPath(), newStart, newLength,
        fileSplit.getLocations());
    newSplits.add(newSplit);
  }
  return newSplits;
}
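
The example depends on a nextContainerOffset helper that is not shown. A plausible minimal sketch, assuming the offsets list is sorted ascending and includes the end-of-file position as its last entry (the behavior here is inferred from the call sites, not copied from Hadoop-BAM):

// Hypothetical sketch: round a position up to the next container boundary.
// With a sorted list this could equally be a binary search.
private static long nextContainerOffset(List<Long> offsets, long position) {
  for (long offset : offsets) {
    if (offset >= position) {
      return offset;
    }
  }
  throw new IllegalStateException(
      "position " + position + " is past the last container offset");
}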
 
Example 3
Source File: XMLLoader.java    From spork with Apache License 2.0
/**
 * Delegates the initialization to the wrapped record reader after
 * extending the split's length, so that reading does not stop at the
 * nominal split boundary.
 */
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  key = new LongWritable();
  value = new Text();
  if (split instanceof FileSplit) {
    FileSplit fsplit = (FileSplit) split;
    originalEnd = fsplit.getStart() + fsplit.getLength();
    Path path = fsplit.getPath();
    long fileEnd = path.getFileSystem(context.getConfiguration())
        .getFileStatus(path).getLen();
    // Extend the split to ten times its length, capped at end of file,
    // keeping the original locations.
    FileSplit extendedSplit = new FileSplit(path, fsplit.getStart(),
        Math.min(fsplit.getLength() * 10, fileEnd - fsplit.getStart()),
        fsplit.getLocations());
    this.wrapped.initialize(extendedSplit, context);
  } else {
    throw new RuntimeException("Cannot override a split of type '"
        + split.getClass() + "'");
  }
}
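
The originalEnd saved above is presumably what keeps the over-extended splits from producing duplicate records: each reader only owns records that start before its original split end. A hypothetical sketch of that check (the wrapped reader and its LongWritable offset key are assumptions, in the style of LineRecordReader):

// Hypothetical sketch: accept records from the extended split only while
// they start inside the original split range.
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
  if (!wrapped.nextKeyValue()) {
    return false;
  }
  // The key is the record's starting byte offset. A record starting at or
  // past the original split end belongs to the next split's reader.
  return wrapped.getCurrentKey().get() < originalEnd;
}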
 
Example 4
Source File: CombineDocumentSplit.java    From marklogic-contentpump with Apache License 2.0
public void addSplit(FileSplit split)
        throws IOException, InterruptedException {
    splits.add(split);
    length += split.getLength();
    for (String loc : split.getLocations()) {
        if (!locations.contains(loc)) {
            locations.add(loc);
        }
    }
}
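
Since locations here is evidently a List, each contains check scans every host collected so far. A sketch of the same merge with a LinkedHashSet (an assumed variant, not the project's code), which keeps insertion order while deduplicating in constant time:

// Assumed variant using java.util.LinkedHashSet for the locations field;
// Collections.addAll silently skips hostnames already present.
private final Set<String> locations = new LinkedHashSet<String>();
private final List<FileSplit> splits = new ArrayList<FileSplit>();
private long length;

public void addSplit(FileSplit split)
        throws IOException, InterruptedException {
    splits.add(split);
    length += split.getLength();
    Collections.addAll(locations, split.getLocations());
}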
 
Example 5
Source File: DelimitedTextInputFormat.java    From marklogic-contentpump with Apache License 2.0
public List<InputSplit> getSplits(JobContext job) throws IOException {
    boolean delimSplit = isSplitInput(job.getConfiguration());
    //if delimSplit is true, size of each split is determined by 
    //Math.max(minSize, Math.min(maxSize, blockSize)) in FileInputFormat
    List<InputSplit> splits = super.getSplits(job);
    if (!delimSplit) {
        return splits;
    }

    if (splits.size() >= SPLIT_COUNT_LIMIT) {
        // if #splits >= SPLIT_COUNT_LIMIT (1 million), there is already
        // enough parallelism, so there is no point in splitting further
        LOG.warn("Exceeding SPLIT_COUNT_LIMIT, input_split is off: "
            + SPLIT_COUNT_LIMIT);
        DefaultStringifier.store(job.getConfiguration(), false,
            ConfigConstants.CONF_SPLIT_INPUT);
        return splits;
    }
    // add header info into splits
    List<InputSplit> populatedSplits = new ArrayList<InputSplit>();
    LOG.info(splits.size() + " DelimitedSplits generated");
    Configuration conf = job.getConfiguration();
    char delimiter = 0;
    ArrayList<Text> hlist = new ArrayList<Text>();
    for (InputSplit file: splits) {
        FileSplit fsplit = ((FileSplit)file);
        Path path = fsplit.getPath();
        FileSystem fs = path.getFileSystem(conf);
        
        if (fsplit.getStart() == 0) {
            // parse the inSplit, get the header
            FSDataInputStream fileIn = fs.open(path);

            String delimStr = conf.get(ConfigConstants.CONF_DELIMITER,
                ConfigConstants.DEFAULT_DELIMITER);
            if (delimStr.length() == 1) {
                delimiter = delimStr.charAt(0);
            } else {
                LOG.error("Incorrect delimiter: " + delimStr
                    + ". Expects a single character.");
            }
            String encoding = conf.get(
                MarkLogicConstants.OUTPUT_CONTENT_ENCODING,
                MarkLogicConstants.DEFAULT_OUTPUT_CONTENT_ENCODING);
            InputStreamReader instream = new InputStreamReader(fileIn, encoding);
            CSVParser parser = new CSVParser(instream, CSVParserFormatter
                .getFormat(delimiter, DelimitedTextReader.encapsulator,
                    true, true));
            Iterator<CSVRecord> it = parser.iterator();
            
            String[] header = null;
            if (it.hasNext()) {
                CSVRecord record = it.next();
                Iterator<String> recordIterator = record.iterator();
                int recordSize = record.size();
                header = new String[recordSize];
                for (int i = 0; i < recordSize; i++) {
                    if (recordIterator.hasNext()) {
                        header[i] = recordIterator.next();
                    } else {
                        throw new IOException(
                            "Record size doesn't match the real size");
                    }
                }
                }
                
                EncodingUtil.handleBOMUTF8(header, 0);
                
                hlist.clear();
                for (String s : header) {
                    hlist.add(new Text(s));
                }
            }
            instream.close();
        }
        
        DelimitedSplit ds = new DelimitedSplit(new TextArrayWritable(
            hlist.toArray(new Text[hlist.size()])), path,
            fsplit.getStart(), fsplit.getLength(),
            fsplit.getLocations());
        populatedSplits.add(ds);
    }
    
    return populatedSplits;
}
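
Note the ordering this relies on: the header list hlist is parsed only for the split that starts at offset 0 of a file and is then attached to every following split, so it depends on super.getSplits(job) returning each file's splits grouped in file order for the right header to travel with each DelimitedSplit.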
 
Example 6
Source File: BAMInputFormat.java    From Hadoop-BAM with MIT License
private int addProbabilisticSplits(
    List<InputSplit> splits, int i, List<InputSplit> newSplits,
    Configuration cfg)
    throws IOException {
  final Path path = ((FileSplit) splits.get(i)).getPath();
  try (final SeekableStream sin =
      WrapSeekable.openPath(path.getFileSystem(cfg), path)) {

    final BAMSplitGuesser guesser = new BAMSplitGuesser(sin, cfg);

    FileVirtualSplit previousSplit = null;

    for (; i < splits.size(); ++i) {
      FileSplit fspl = (FileSplit) splits.get(i);
      if (!fspl.getPath().equals(path))
        break;

      long beg = fspl.getStart();
      long end = beg + fspl.getLength();

      long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);

      // As the guesser moves to the next BGZF block before looking for
      // BAM records, the ending BGZF blocks always have to be traversed
      // fully. Hence force the in-block offset to 0xffff, the maximum
      // possible.
      long alignedEnd = end << 16 | 0xffff;

      if (alignedBeg == end) {
        // No records detected in this split: merge it into the previous
        // one. This could legitimately happen e.g. if we have a split
        // that is so small that it only contains the middle part of a
        // BGZF block.
        //
        // Of course, if it's the first split, then this is simply not a
        // valid BAM file.
        //
        // FIXME: In theory, any number of splits could contain only
        // parts of the BAM header before we start to see splits that
        // contain BAM records. For now, we require that the split size
        // is at least as big as the header and don't handle that case.
        if (previousSplit == null)
          throw new IOException("'" + path + "': " +
              "no reads in first split: bad BAM file or tiny split size?");

        previousSplit.setEndVirtualOffset(alignedEnd);
      } else {
        previousSplit = new FileVirtualSplit(
            path, alignedBeg, alignedEnd, fspl.getLocations());
        if (logger.isDebugEnabled()) {
          final long byteOffset = alignedBeg >>> 16;
          final long recordOffset = alignedBeg & 0xffff;
          logger.debug(
              "Split {}: byte offset: {}, record offset: {}, virtual offset: {}",
              i, byteOffset, recordOffset, alignedBeg);
        }
        newSplits.add(previousSplit);
      }
    }
  }
  return i;
}
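
The shifts and masks above implement BGZF virtual file offsets as defined by the SAM/BAM specification: the high 48 bits hold the compressed-file offset of a BGZF block, and the low 16 bits hold a position inside the uncompressed block, so 0xffff marks the end of a block. A small self-contained illustration:

// Encode: block at compressed-file offset 1,048,576, record at byte 42
// inside the uncompressed block.
long virtualOffset = (1_048_576L << 16) | 42L;

// Decode, mirroring the debug logging in the example.
long byteOffset   = virtualOffset >>> 16;   // 1_048_576
long recordOffset = virtualOffset & 0xffff; // 42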
 
Example 7
Source File: DelimitedVectorInputFormat.java    From mrgeo with Apache License 2.0
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException
{
  boolean useNLineFormat = context.getConfiguration().getBoolean(USE_NLINE_FORMAT, false);
  if (useNLineFormat)
  {
    List<InputSplit> splits = new NLineInputFormat().getSplits(context);
    // This is a workaround for what appears to be a bug in how NLineInputFormat
    // computes its splits. When there are multiple splits in a file, it seems
    // the start position in the last split is off by one. Note that this corrective
    // code needs to check the last split for each different file that appears
    // in the list of splits.
    for (int index = 2; index < splits.size(); index++)
    {
      FileSplit previousSplit = (FileSplit) splits.get(index - 1);
      FileSplit currSplit = (FileSplit) splits.get(index);
      // If this index is the last split, or we've moved on to splits from a different
      // file, then we need to adjust the last split for that file.
      int lastFileIndex = -1;
      if (index == splits.size() - 1)
      {
        lastFileIndex = index;
      }
      else if (!currSplit.getPath().equals(previousSplit.getPath()))
      {
        lastFileIndex = index - 1;
      }
      if (lastFileIndex >= 2)
      {
        FileSplit lastFileSplit = (FileSplit) splits.get(lastFileIndex);
        FileSplit priorSplit = (FileSplit) splits.get(lastFileIndex - 1);
        if (lastFileSplit.getPath().equals(priorSplit.getPath()) &&
            priorSplit.getStart() + priorSplit.getLength() < lastFileSplit.getStart())
        {
          // Adjust the last split for this file so it starts where the
          // prior split ends, extending its length to cover the gap.
          FileSplit replacement = new FileSplit(lastFileSplit.getPath(),
              priorSplit.getStart() + priorSplit.getLength(),
              lastFileSplit.getLength() + 1,
              lastFileSplit.getLocations());
          log.info("Replacing split: " + lastFileSplit);
          log.info("  With split: " + replacement);
          splits.set(lastFileIndex, replacement);
        }
      }
    }
    return splits;
  }
  else
  {
    return new TextInputFormat().getSplits(context);
  }
}
 
Example 8
Source File: ParquetInputSplit.java    From parquet-mr with Apache License 2.0
/**
 * Builds a {@code ParquetInputSplit} from a mapreduce {@link FileSplit}.
 *
 * @param split a mapreduce FileSplit
 * @return a ParquetInputSplit
 * @throws IOException if there is an error while creating the Parquet split
 */
static ParquetInputSplit from(FileSplit split) throws IOException {
  return new ParquetInputSplit(split.getPath(),
      split.getStart(), split.getStart() + split.getLength(),
      split.getLength(), split.getLocations(), null);
}