org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex Java Examples

The following examples show how to use org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex, drawn from the open-source projects named above each example.
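
A TaskSplitIndex pairs the location of a job's split file with the byte offset at which one task's serialized InputSplit begins; tasks use it to seek into that file and deserialize their own split. Below is a minimal sketch of constructing and reading one, using the constructor and accessors that appear in the examples that follow. The path and offset here are made-up placeholders: in a real job they come from the split meta-info written at submission time, not from literals.

import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex;

public class TaskSplitIndexSketch {
  public static void main(String[] args) {
    // Hypothetical location and offset: in a real job these come from the
    // task's TaskSplitMetaInfo (see Example #3), not from literals.
    TaskSplitIndex splitIndex = new TaskSplitIndex(
        "hdfs://nn:8020/staging/job_0001/job.split", 1024L);

    // The two accessors used throughout the examples below: where the
    // split file lives, and where this task's split starts within it.
    System.out.println("location = " + splitIndex.getSplitLocation());
    System.out.println("offset   = " + splitIndex.getStartOffset());
  }
}
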
Example #1
Source File: MRInputUtils.java    From incubator-tez with Apache License 2.0
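Reads a new-API (mapreduce) InputSplit back from the local split file: open the file named by the TaskSplitIndex, seek to the recorded offset, read the split class name, resolve the class, and deserialize the split, crediting the bytes consumed to an optional counter.
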
@SuppressWarnings("unchecked")
public static org.apache.hadoop.mapreduce.InputSplit getNewSplitDetailsFromDisk(
    TaskSplitIndex splitMetaInfo, JobConf jobConf, TezCounter splitBytesCounter)
    throws IOException {
  Path file = new Path(splitMetaInfo.getSplitLocation());
  long offset = splitMetaInfo.getStartOffset();

  // Split information read from local filesystem.
  FileSystem fs = FileSystem.getLocal(jobConf);
  file = fs.makeQualified(file);
  LOG.info("Reading input split file from : " + file);
  FSDataInputStream inFile = fs.open(file);
  inFile.seek(offset);
  String className = Text.readString(inFile);
  Class<org.apache.hadoop.mapreduce.InputSplit> cls;
  try {
    cls = (Class<org.apache.hadoop.mapreduce.InputSplit>) jobConf.getClassByName(className);
  } catch (ClassNotFoundException ce) {
    IOException wrap = new IOException("Split class " + className + " not found");
    wrap.initCause(ce);
    throw wrap;
  }
  SerializationFactory factory = new SerializationFactory(jobConf);
  Deserializer<org.apache.hadoop.mapreduce.InputSplit> deserializer = (Deserializer<org.apache.hadoop.mapreduce.InputSplit>) factory
      .getDeserializer(cls);
  deserializer.open(inFile);
  org.apache.hadoop.mapreduce.InputSplit split = deserializer.deserialize(null);
  long pos = inFile.getPos();
  if (splitBytesCounter != null) {
    splitBytesCounter.increment(pos - offset);
  }
  inFile.close();
  return split;
}
 
Example #2
Source File: MRInputUtils.java    From incubator-tez with Apache License 2.0
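The old-API (mapred) counterpart of Example #1: the flow is identical, only the InputSplit type being deserialized differs.
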
@SuppressWarnings("unchecked")
public static InputSplit getOldSplitDetailsFromDisk(TaskSplitIndex splitMetaInfo,
    JobConf jobConf, TezCounter splitBytesCounter) throws IOException {
  Path file = new Path(splitMetaInfo.getSplitLocation());
  FileSystem fs = FileSystem.getLocal(jobConf);
  file = fs.makeQualified(file);
  LOG.info("Reading input split file from : " + file);
  long offset = splitMetaInfo.getStartOffset();

  FSDataInputStream inFile = fs.open(file);
  inFile.seek(offset);
  String className = Text.readString(inFile);
  Class<org.apache.hadoop.mapred.InputSplit> cls;
  try {
    cls = (Class<org.apache.hadoop.mapred.InputSplit>) jobConf.getClassByName(className);
  } catch (ClassNotFoundException ce) {
    IOException wrap = new IOException("Split class " + className + " not found");
    wrap.initCause(ce);
    throw wrap;
  }
  SerializationFactory factory = new SerializationFactory(jobConf);
  Deserializer<org.apache.hadoop.mapred.InputSplit> deserializer = (Deserializer<org.apache.hadoop.mapred.InputSplit>) factory
      .getDeserializer(cls);
  deserializer.open(inFile);
  org.apache.hadoop.mapred.InputSplit split = deserializer.deserialize(null);
  long pos = inFile.getPos();
  if (splitBytesCounter != null) {
    splitBytesCounter.increment(pos - offset);
  }
  inFile.close();
  return split;
}
 
Example #3
Source File: MRInput.java    From incubator-tez with Apache License 2.0
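MRInput's initialization path: when split information does not arrive via events, it looks up the task's TaskSplitMetaInfo, wraps the location and offset in a TaskSplitIndex, and hands it to one of the two disk-reading helpers above, picking the new- or old-API variant.
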
@Private
void initializeInternal() throws IOException {
  // Primarily for visibility
  rrLock.lock();
  try {
    
    if (splitInfoViaEvents) {
      if (useNewApi) {
        mrReader = new MRReaderMapReduce(jobConf, getContext().getCounters(), inputRecordCounter,
            getContext().getApplicationId().getClusterTimestamp(), getContext()
                .getTaskVertexIndex(), getContext().getApplicationId().getId(), getContext()
                .getTaskIndex(), getContext().getTaskAttemptNumber());
      } else {
        mrReader = new MRReaderMapred(jobConf, getContext().getCounters(), inputRecordCounter);
      }
    } else {
      TaskSplitMetaInfo[] allMetaInfo = MRInputUtils.readSplits(jobConf);
      TaskSplitMetaInfo thisTaskMetaInfo = allMetaInfo[getContext().getTaskIndex()];
      TaskSplitIndex splitMetaInfo = new TaskSplitIndex(thisTaskMetaInfo.getSplitLocation(),
          thisTaskMetaInfo.getStartOffset());
      if (useNewApi) {
        org.apache.hadoop.mapreduce.InputSplit newInputSplit = MRInputUtils
            .getNewSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                .findCounter(TaskCounter.SPLIT_RAW_BYTES));
        mrReader = new MRReaderMapReduce(jobConf, newInputSplit, getContext().getCounters(),
            inputRecordCounter, getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber());
      } else {
        org.apache.hadoop.mapred.InputSplit oldInputSplit = MRInputUtils
            .getOldSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                .findCounter(TaskCounter.SPLIT_RAW_BYTES));
        mrReader = new MRReaderMapred(jobConf, oldInputSplit, getContext().getCounters(), inputRecordCounter);
      }
    }
  } finally {
    rrLock.unlock();
  }
  LOG.info("Initialzed MRInput: " + getContext().getSourceVertexName());
}
 
Example #4
Source File: MRInputUtils.java    From tez with Apache License 2.0
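The same new-API helper as Example #1, as it appears in the graduated tez project.
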
@SuppressWarnings("unchecked")
public static org.apache.hadoop.mapreduce.InputSplit getNewSplitDetailsFromDisk(
    TaskSplitIndex splitMetaInfo, JobConf jobConf, TezCounter splitBytesCounter)
    throws IOException {
  Path file = new Path(splitMetaInfo.getSplitLocation());
  long offset = splitMetaInfo.getStartOffset();

  // Split information read from local filesystem.
  FileSystem fs = FileSystem.getLocal(jobConf);
  file = fs.makeQualified(file);
  LOG.info("Reading input split file from : " + file);
  FSDataInputStream inFile = fs.open(file);
  inFile.seek(offset);
  String className = Text.readString(inFile);
  Class<org.apache.hadoop.mapreduce.InputSplit> cls;
  try {
    cls = (Class<org.apache.hadoop.mapreduce.InputSplit>) jobConf.getClassByName(className);
  } catch (ClassNotFoundException ce) {
    IOException wrap = new IOException("Split class " + className + " not found");
    wrap.initCause(ce);
    throw wrap;
  }
  SerializationFactory factory = new SerializationFactory(jobConf);
  Deserializer<org.apache.hadoop.mapreduce.InputSplit> deserializer = (Deserializer<org.apache.hadoop.mapreduce.InputSplit>) factory
      .getDeserializer(cls);
  deserializer.open(inFile);
  org.apache.hadoop.mapreduce.InputSplit split = deserializer.deserialize(null);
  long pos = inFile.getPos();
  if (splitBytesCounter != null) {
    splitBytesCounter.increment(pos - offset);
  }
  inFile.close();
  return split;
}
 
Example #5
Source File: MRInputUtils.java    From tez with Apache License 2.0
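The old-API helper of Example #2, as it appears in the graduated tez project.
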
@SuppressWarnings("unchecked")
public static InputSplit getOldSplitDetailsFromDisk(TaskSplitIndex splitMetaInfo,
    JobConf jobConf, TezCounter splitBytesCounter) throws IOException {
  Path file = new Path(splitMetaInfo.getSplitLocation());
  FileSystem fs = FileSystem.getLocal(jobConf);
  file = fs.makeQualified(file);
  LOG.info("Reading input split file from : " + file);
  long offset = splitMetaInfo.getStartOffset();

  FSDataInputStream inFile = fs.open(file);
  inFile.seek(offset);
  String className = Text.readString(inFile);
  Class<org.apache.hadoop.mapred.InputSplit> cls;
  try {
    cls = (Class<org.apache.hadoop.mapred.InputSplit>) jobConf.getClassByName(className);
  } catch (ClassNotFoundException ce) {
    IOException wrap = new IOException("Split class " + className + " not found");
    wrap.initCause(ce);
    throw wrap;
  }
  SerializationFactory factory = new SerializationFactory(jobConf);
  Deserializer<org.apache.hadoop.mapred.InputSplit> deserializer = (Deserializer<org.apache.hadoop.mapred.InputSplit>) factory
      .getDeserializer(cls);
  deserializer.open(inFile);
  org.apache.hadoop.mapred.InputSplit split = deserializer.deserialize(null);
  long pos = inFile.getPos();
  if (splitBytesCounter != null) {
    splitBytesCounter.increment(pos - offset);
  }
  inFile.close();
  return split;
}
 
Example #6
Source File: TestMapProgress.java    From hadoop with Apache License 2.0
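A test subclass whose constructor simply forwards the TaskSplitIndex to the MapTask constructor shown in Example #7.
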
public TestMapTask(String jobFile, TaskAttemptID taskId, 
    int partition, TaskSplitIndex splitIndex,
    int numSlotsRequired) {
  super(jobFile, taskId, partition, splitIndex, numSlotsRequired);
}
 
Example #7
Source File: MapTask.java    From hadoop with Apache License 2.0
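MapTask stores the TaskSplitIndex as its split meta-info so the task can later locate and deserialize its InputSplit.
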
public MapTask(String jobFile, TaskAttemptID taskId, 
               int partition, TaskSplitIndex splitIndex,
               int numSlotsRequired) {
  super(jobFile, taskId, partition, numSlotsRequired);
  this.splitMetaInfo = splitIndex;
}
 
Example #8
Source File: MapTask.java    From hadoop with Apache License 2.0
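runOldMapper rebuilds the task's InputSplit from the TaskSplitIndex's location and offset, then drives the old-API mapper through a MapRunnable.
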
@SuppressWarnings("unchecked")
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runOldMapper(final JobConf job,
                  final TaskSplitIndex splitIndex,
                  final TaskUmbilicalProtocol umbilical,
                  TaskReporter reporter
                  ) throws IOException, InterruptedException,
                           ClassNotFoundException {
  InputSplit inputSplit = getSplitDetails(new Path(splitIndex.getSplitLocation()),
         splitIndex.getStartOffset());

  updateJobWithSplit(job, inputSplit);
  reporter.setInputSplit(inputSplit);

  RecordReader<INKEY,INVALUE> in = isSkipping() ? 
      new SkippingRecordReader<INKEY,INVALUE>(umbilical, reporter, job) :
        new TrackedRecordReader<INKEY,INVALUE>(reporter, job);
  job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());


  int numReduceTasks = conf.getNumReduceTasks();
  LOG.info("numReduceTasks: " + numReduceTasks);
  MapOutputCollector<OUTKEY, OUTVALUE> collector = null;
  if (numReduceTasks > 0) {
    collector = createSortingCollector(job, reporter);
  } else { 
    collector = new DirectMapOutputCollector<OUTKEY, OUTVALUE>();
    MapOutputCollector.Context context =
        new MapOutputCollector.Context(this, job, reporter);
    collector.init(context);
  }
  MapRunnable<INKEY,INVALUE,OUTKEY,OUTVALUE> runner =
    ReflectionUtils.newInstance(job.getMapRunnerClass(), job);

  try {
    runner.run(in, new OldOutputCollector(collector, conf), reporter);
    mapPhase.complete();
    // start the sort phase only if there are reducers
    if (numReduceTasks > 0) {
      setPhase(TaskStatus.Phase.SORT);
    }
    statusUpdate(umbilical);
    collector.flush();
    
    in.close();
    in = null;
    
    collector.close();
    collector = null;
  } finally {
    closeQuietly(in);
    closeQuietly(collector);
  }
}
 
Example #9
Source File: MapTask.java    From hadoop with Apache License 2.0
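runNewMapper does the same for the new API: the split is reconstructed from the TaskSplitIndex, wrapped in a tracking RecordReader, and fed to the mapper through a MapContext.
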
@SuppressWarnings("unchecked")
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runNewMapper(final JobConf job,
                  final TaskSplitIndex splitIndex,
                  final TaskUmbilicalProtocol umbilical,
                  TaskReporter reporter
                  ) throws IOException, ClassNotFoundException,
                           InterruptedException {
  // make a task context so we can get the classes
  org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
    new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job, 
                                                                getTaskID(),
                                                                reporter);
  // make a mapper
  org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =
    (org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)
      ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
  // make the input format
  org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =
    (org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)
      ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
  // rebuild the input split
  org.apache.hadoop.mapreduce.InputSplit split =
      getSplitDetails(new Path(splitIndex.getSplitLocation()),
          splitIndex.getStartOffset());
  LOG.info("Processing split: " + split);

  org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
    new NewTrackingRecordReader<INKEY,INVALUE>
      (split, inputFormat, reporter, taskContext);
  
  job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
  org.apache.hadoop.mapreduce.RecordWriter output = null;
  
  // get an output object
  if (job.getNumReduceTasks() == 0) {
    output = 
      new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
  } else {
    output = new NewOutputCollector(taskContext, job, umbilical, reporter);
  }

  org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE> 
  mapContext = 
    new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(), 
        input, output, 
        committer, 
        reporter, split);

  org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context 
      mapperContext = 
        new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(
            mapContext);

  try {
    input.initialize(split, mapperContext);
    mapper.run(mapperContext);
    mapPhase.complete();
    setPhase(TaskStatus.Phase.SORT);
    statusUpdate(umbilical);
    input.close();
    input = null;
    output.close(mapperContext);
    output = null;
  } finally {
    closeQuietly(input);
    closeQuietly(output, mapperContext);
  }
}
 
Example #10
Source File: TestMapProgress.java    From big-c with Apache License 2.0
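Identical to Example #6, from the big-c fork of Hadoop.
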
public TestMapTask(String jobFile, TaskAttemptID taskId, 
    int partition, TaskSplitIndex splitIndex,
    int numSlotsRequired) {
  super(jobFile, taskId, partition, splitIndex, numSlotsRequired);
}
 
Example #11
Source File: MapTask.java    From big-c with Apache License 2.0
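Identical to Example #7, from the big-c fork.
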
public MapTask(String jobFile, TaskAttemptID taskId, 
               int partition, TaskSplitIndex splitIndex,
               int numSlotsRequired) {
  super(jobFile, taskId, partition, numSlotsRequired);
  this.splitMetaInfo = splitIndex;
}
 
Example #12
Source File: MapTask.java    From big-c with Apache License 2.0
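Identical to Example #8, from the big-c fork.
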
@SuppressWarnings("unchecked")
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runOldMapper(final JobConf job,
                  final TaskSplitIndex splitIndex,
                  final TaskUmbilicalProtocol umbilical,
                  TaskReporter reporter
                  ) throws IOException, InterruptedException,
                           ClassNotFoundException {
  InputSplit inputSplit = getSplitDetails(new Path(splitIndex.getSplitLocation()),
         splitIndex.getStartOffset());

  updateJobWithSplit(job, inputSplit);
  reporter.setInputSplit(inputSplit);

  RecordReader<INKEY,INVALUE> in = isSkipping() ? 
      new SkippingRecordReader<INKEY,INVALUE>(umbilical, reporter, job) :
        new TrackedRecordReader<INKEY,INVALUE>(reporter, job);
  job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());


  int numReduceTasks = conf.getNumReduceTasks();
  LOG.info("numReduceTasks: " + numReduceTasks);
  MapOutputCollector<OUTKEY, OUTVALUE> collector = null;
  if (numReduceTasks > 0) {
    collector = createSortingCollector(job, reporter);
  } else { 
    collector = new DirectMapOutputCollector<OUTKEY, OUTVALUE>();
    MapOutputCollector.Context context =
        new MapOutputCollector.Context(this, job, reporter);
    collector.init(context);
  }
  MapRunnable<INKEY,INVALUE,OUTKEY,OUTVALUE> runner =
    ReflectionUtils.newInstance(job.getMapRunnerClass(), job);

  try {
    runner.run(in, new OldOutputCollector(collector, conf), reporter);
    mapPhase.complete();
    // start the sort phase only if there are reducers
    if (numReduceTasks > 0) {
      setPhase(TaskStatus.Phase.SORT);
    }
    statusUpdate(umbilical);
    collector.flush();
    
    in.close();
    in = null;
    
    collector.close();
    collector = null;
  } finally {
    closeQuietly(in);
    closeQuietly(collector);
  }
}
 
Example #13
Source File: MapTask.java    From big-c with Apache License 2.0
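Identical to Example #9, from the big-c fork.
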
@SuppressWarnings("unchecked")
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runNewMapper(final JobConf job,
                  final TaskSplitIndex splitIndex,
                  final TaskUmbilicalProtocol umbilical,
                  TaskReporter reporter
                  ) throws IOException, ClassNotFoundException,
                           InterruptedException {
  // make a task context so we can get the classes
  org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
    new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job, 
                                                                getTaskID(),
                                                                reporter);
  // make a mapper
  org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =
    (org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)
      ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
  // make the input format
  org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =
    (org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)
      ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
  // rebuild the input split
  org.apache.hadoop.mapreduce.InputSplit split =
      getSplitDetails(new Path(splitIndex.getSplitLocation()),
          splitIndex.getStartOffset());
  LOG.info("Processing split: " + split);

  org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
    new NewTrackingRecordReader<INKEY,INVALUE>
      (split, inputFormat, reporter, taskContext);
  
  job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
  org.apache.hadoop.mapreduce.RecordWriter output = null;
  
  // get an output object
  if (job.getNumReduceTasks() == 0) {
    output = 
      new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
  } else {
    output = new NewOutputCollector(taskContext, job, umbilical, reporter);
  }

  org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE> 
  mapContext = 
    new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(), 
        input, output, 
        committer, 
        reporter, split);

  org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context 
      mapperContext = 
        new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(
            mapContext);

  try {
    input.initialize(split, mapperContext);
    mapper.run(mapperContext);
    mapPhase.complete();
    setPhase(TaskStatus.Phase.SORT);
    statusUpdate(umbilical);
    input.close();
    input = null;
    output.close(mapperContext);
    output = null;
  } finally {
    closeQuietly(input);
    closeQuietly(output, mapperContext);
  }
}
 
Example #14
Source File: MRInput.java    From tez with Apache License 2.0
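A later tez revision of Example #3: it additionally measures the split length, recording it in the INPUT_SPLIT_LENGTH_BYTES counter, and passes the input context through to the readers.
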
@Private
void initializeInternal() throws IOException {
  // Primarily for visibility
  rrLock.lock();
  try {
    
    if (splitInfoViaEvents) {
      if (useNewApi) {
        mrReader = new MRReaderMapReduce(jobConf, getContext().getCounters(), inputRecordCounter,
            getContext().getApplicationId().getClusterTimestamp(), getContext()
                .getTaskVertexIndex(), getContext().getApplicationId().getId(), getContext()
                .getTaskIndex(), getContext().getTaskAttemptNumber(), getContext());
      } else {
        mrReader = new MRReaderMapred(jobConf, getContext().getCounters(), inputRecordCounter, 
            getContext());
      }
    } else {
      TaskSplitMetaInfo thisTaskMetaInfo = MRInputUtils.getSplits(jobConf,
          getContext().getTaskIndex());
      TaskSplitIndex splitMetaInfo = new TaskSplitIndex(thisTaskMetaInfo.getSplitLocation(),
          thisTaskMetaInfo.getStartOffset());
      long splitLength = -1;
      if (useNewApi) {
        org.apache.hadoop.mapreduce.InputSplit newInputSplit = MRInputUtils
            .getNewSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                .findCounter(TaskCounter.SPLIT_RAW_BYTES));
        try {
          splitLength = newInputSplit.getLength();
        } catch (InterruptedException e) {
          LOG.warn("Got interrupted while reading split length: ", e);
        }
        mrReader = new MRReaderMapReduce(jobConf, newInputSplit, getContext().getCounters(),
            inputRecordCounter, getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber(), getContext());
      } else {
        org.apache.hadoop.mapred.InputSplit oldInputSplit = MRInputUtils
            .getOldSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                .findCounter(TaskCounter.SPLIT_RAW_BYTES));
        splitLength = oldInputSplit.getLength();
        mrReader =
            new MRReaderMapred(jobConf, oldInputSplit, getContext().getCounters(),
                inputRecordCounter, getContext());
      }
      if (splitLength != -1) {
        getContext().getCounters().findCounter(TaskCounter.INPUT_SPLIT_LENGTH_BYTES)
            .increment(splitLength);
      }
    }
  } finally {
    rrLock.unlock();
  }
  LOG.info("Initialized MRInput: " + getContext().getSourceVertexName());
}