org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex Java Examples
The following examples show how to use
org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex.
Each example is taken from an open-source project; you can go to the original project or source file by following the links above each example.
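Before the examples, here is a minimal sketch of how TaskSplitIndex itself is used, based only on the constructor and getters that appear in the examples below; the split file path and offset values are made up for illustration.

import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex;

// A TaskSplitIndex pairs the location of the serialized split file with the
// byte offset of one task's split inside it. Values below are hypothetical.
TaskSplitIndex splitIndex =
    new TaskSplitIndex("hdfs://nn:8020/staging/job_0001/job.split", 1024L);

String location = splitIndex.getSplitLocation(); // path to the serialized split data
long offset = splitIndex.getStartOffset();       // where this task's split begins

The readers in the examples below seek to getStartOffset() within the file named by getSplitLocation() and deserialize the InputSplit found there.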
Example #1
Source File: MRInputUtils.java From incubator-tez with Apache License 2.0
@SuppressWarnings("unchecked") public static org.apache.hadoop.mapreduce.InputSplit getNewSplitDetailsFromDisk( TaskSplitIndex splitMetaInfo, JobConf jobConf, TezCounter splitBytesCounter) throws IOException { Path file = new Path(splitMetaInfo.getSplitLocation()); long offset = splitMetaInfo.getStartOffset(); // Split information read from local filesystem. FileSystem fs = FileSystem.getLocal(jobConf); file = fs.makeQualified(file); LOG.info("Reading input split file from : " + file); FSDataInputStream inFile = fs.open(file); inFile.seek(offset); String className = Text.readString(inFile); Class<org.apache.hadoop.mapreduce.InputSplit> cls; try { cls = (Class<org.apache.hadoop.mapreduce.InputSplit>) jobConf.getClassByName(className); } catch (ClassNotFoundException ce) { IOException wrap = new IOException("Split class " + className + " not found"); wrap.initCause(ce); throw wrap; } SerializationFactory factory = new SerializationFactory(jobConf); Deserializer<org.apache.hadoop.mapreduce.InputSplit> deserializer = (Deserializer<org.apache.hadoop.mapreduce.InputSplit>) factory .getDeserializer(cls); deserializer.open(inFile); org.apache.hadoop.mapreduce.InputSplit split = deserializer.deserialize(null); long pos = inFile.getPos(); if (splitBytesCounter != null) { splitBytesCounter.increment(pos - offset); } inFile.close(); return split; }
Example #2
Source File: MRInputUtils.java From incubator-tez with Apache License 2.0
@SuppressWarnings("unchecked") public static InputSplit getOldSplitDetailsFromDisk(TaskSplitIndex splitMetaInfo, JobConf jobConf, TezCounter splitBytesCounter) throws IOException { Path file = new Path(splitMetaInfo.getSplitLocation()); FileSystem fs = FileSystem.getLocal(jobConf); file = fs.makeQualified(file); LOG.info("Reading input split file from : " + file); long offset = splitMetaInfo.getStartOffset(); FSDataInputStream inFile = fs.open(file); inFile.seek(offset); String className = Text.readString(inFile); Class<org.apache.hadoop.mapred.InputSplit> cls; try { cls = (Class<org.apache.hadoop.mapred.InputSplit>) jobConf.getClassByName(className); } catch (ClassNotFoundException ce) { IOException wrap = new IOException("Split class " + className + " not found"); wrap.initCause(ce); throw wrap; } SerializationFactory factory = new SerializationFactory(jobConf); Deserializer<org.apache.hadoop.mapred.InputSplit> deserializer = (Deserializer<org.apache.hadoop.mapred.InputSplit>) factory .getDeserializer(cls); deserializer.open(inFile); org.apache.hadoop.mapred.InputSplit split = deserializer.deserialize(null); long pos = inFile.getPos(); if (splitBytesCounter != null) { splitBytesCounter.increment(pos - offset); } inFile.close(); return split; }
Example #3
Source File: MRInput.java From incubator-tez with Apache License 2.0
@Private
void initializeInternal() throws IOException {
  // Primarily for visibility
  rrLock.lock();
  try {
    if (splitInfoViaEvents) {
      if (useNewApi) {
        mrReader = new MRReaderMapReduce(jobConf, getContext().getCounters(),
            inputRecordCounter, getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber());
      } else {
        mrReader = new MRReaderMapred(jobConf, getContext().getCounters(), inputRecordCounter);
      }
    } else {
      TaskSplitMetaInfo[] allMetaInfo = MRInputUtils.readSplits(jobConf);
      TaskSplitMetaInfo thisTaskMetaInfo = allMetaInfo[getContext().getTaskIndex()];
      TaskSplitIndex splitMetaInfo = new TaskSplitIndex(thisTaskMetaInfo.getSplitLocation(),
          thisTaskMetaInfo.getStartOffset());
      if (useNewApi) {
        org.apache.hadoop.mapreduce.InputSplit newInputSplit = MRInputUtils
            .getNewSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                .findCounter(TaskCounter.SPLIT_RAW_BYTES));
        mrReader = new MRReaderMapReduce(jobConf, newInputSplit, getContext().getCounters(),
            inputRecordCounter, getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber());
      } else {
        org.apache.hadoop.mapred.InputSplit oldInputSplit = MRInputUtils
            .getOldSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                .findCounter(TaskCounter.SPLIT_RAW_BYTES));
        mrReader = new MRReaderMapred(jobConf, oldInputSplit, getContext().getCounters(),
            inputRecordCounter);
      }
    }
  } finally {
    rrLock.unlock();
  }
  LOG.info("Initialized MRInput: " + getContext().getSourceVertexName());
}
Example #4
Source File: MRInputUtils.java From tez with Apache License 2.0
@SuppressWarnings("unchecked") public static org.apache.hadoop.mapreduce.InputSplit getNewSplitDetailsFromDisk( TaskSplitIndex splitMetaInfo, JobConf jobConf, TezCounter splitBytesCounter) throws IOException { Path file = new Path(splitMetaInfo.getSplitLocation()); long offset = splitMetaInfo.getStartOffset(); // Split information read from local filesystem. FileSystem fs = FileSystem.getLocal(jobConf); file = fs.makeQualified(file); LOG.info("Reading input split file from : " + file); FSDataInputStream inFile = fs.open(file); inFile.seek(offset); String className = Text.readString(inFile); Class<org.apache.hadoop.mapreduce.InputSplit> cls; try { cls = (Class<org.apache.hadoop.mapreduce.InputSplit>) jobConf.getClassByName(className); } catch (ClassNotFoundException ce) { IOException wrap = new IOException("Split class " + className + " not found"); wrap.initCause(ce); throw wrap; } SerializationFactory factory = new SerializationFactory(jobConf); Deserializer<org.apache.hadoop.mapreduce.InputSplit> deserializer = (Deserializer<org.apache.hadoop.mapreduce.InputSplit>) factory .getDeserializer(cls); deserializer.open(inFile); org.apache.hadoop.mapreduce.InputSplit split = deserializer.deserialize(null); long pos = inFile.getPos(); if (splitBytesCounter != null) { splitBytesCounter.increment(pos - offset); } inFile.close(); return split; }
Example #5
Source File: MRInputUtils.java From tez with Apache License 2.0
@SuppressWarnings("unchecked") public static InputSplit getOldSplitDetailsFromDisk(TaskSplitIndex splitMetaInfo, JobConf jobConf, TezCounter splitBytesCounter) throws IOException { Path file = new Path(splitMetaInfo.getSplitLocation()); FileSystem fs = FileSystem.getLocal(jobConf); file = fs.makeQualified(file); LOG.info("Reading input split file from : " + file); long offset = splitMetaInfo.getStartOffset(); FSDataInputStream inFile = fs.open(file); inFile.seek(offset); String className = Text.readString(inFile); Class<org.apache.hadoop.mapred.InputSplit> cls; try { cls = (Class<org.apache.hadoop.mapred.InputSplit>) jobConf.getClassByName(className); } catch (ClassNotFoundException ce) { IOException wrap = new IOException("Split class " + className + " not found"); wrap.initCause(ce); throw wrap; } SerializationFactory factory = new SerializationFactory(jobConf); Deserializer<org.apache.hadoop.mapred.InputSplit> deserializer = (Deserializer<org.apache.hadoop.mapred.InputSplit>) factory .getDeserializer(cls); deserializer.open(inFile); org.apache.hadoop.mapred.InputSplit split = deserializer.deserialize(null); long pos = inFile.getPos(); if (splitBytesCounter != null) { splitBytesCounter.increment(pos - offset); } inFile.close(); return split; }
Example #6
Source File: TestMapProgress.java From hadoop with Apache License 2.0
public TestMapTask(String jobFile, TaskAttemptID taskId, int partition,
    TaskSplitIndex splitIndex, int numSlotsRequired) {
  super(jobFile, taskId, partition, splitIndex, numSlotsRequired);
}
Example #7
Source File: MapTask.java From hadoop with Apache License 2.0
public MapTask(String jobFile, TaskAttemptID taskId, int partition,
    TaskSplitIndex splitIndex, int numSlotsRequired) {
  super(jobFile, taskId, partition, numSlotsRequired);
  this.splitMetaInfo = splitIndex;
}
Example #8
Source File: MapTask.java From hadoop with Apache License 2.0
@SuppressWarnings("unchecked") private <INKEY,INVALUE,OUTKEY,OUTVALUE> void runOldMapper(final JobConf job, final TaskSplitIndex splitIndex, final TaskUmbilicalProtocol umbilical, TaskReporter reporter ) throws IOException, InterruptedException, ClassNotFoundException { InputSplit inputSplit = getSplitDetails(new Path(splitIndex.getSplitLocation()), splitIndex.getStartOffset()); updateJobWithSplit(job, inputSplit); reporter.setInputSplit(inputSplit); RecordReader<INKEY,INVALUE> in = isSkipping() ? new SkippingRecordReader<INKEY,INVALUE>(umbilical, reporter, job) : new TrackedRecordReader<INKEY,INVALUE>(reporter, job); job.setBoolean(JobContext.SKIP_RECORDS, isSkipping()); int numReduceTasks = conf.getNumReduceTasks(); LOG.info("numReduceTasks: " + numReduceTasks); MapOutputCollector<OUTKEY, OUTVALUE> collector = null; if (numReduceTasks > 0) { collector = createSortingCollector(job, reporter); } else { collector = new DirectMapOutputCollector<OUTKEY, OUTVALUE>(); MapOutputCollector.Context context = new MapOutputCollector.Context(this, job, reporter); collector.init(context); } MapRunnable<INKEY,INVALUE,OUTKEY,OUTVALUE> runner = ReflectionUtils.newInstance(job.getMapRunnerClass(), job); try { runner.run(in, new OldOutputCollector(collector, conf), reporter); mapPhase.complete(); // start the sort phase only if there are reducers if (numReduceTasks > 0) { setPhase(TaskStatus.Phase.SORT); } statusUpdate(umbilical); collector.flush(); in.close(); in = null; collector.close(); collector = null; } finally { closeQuietly(in); closeQuietly(collector); } }
Example #9
Source File: MapTask.java From hadoop with Apache License 2.0
@SuppressWarnings("unchecked") private <INKEY,INVALUE,OUTKEY,OUTVALUE> void runNewMapper(final JobConf job, final TaskSplitIndex splitIndex, final TaskUmbilicalProtocol umbilical, TaskReporter reporter ) throws IOException, ClassNotFoundException, InterruptedException { // make a task context so we can get the classes org.apache.hadoop.mapreduce.TaskAttemptContext taskContext = new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job, getTaskID(), reporter); // make a mapper org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper = (org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>) ReflectionUtils.newInstance(taskContext.getMapperClass(), job); // make the input format org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat = (org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>) ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job); // rebuild the input split org.apache.hadoop.mapreduce.InputSplit split = null; split = getSplitDetails(new Path(splitIndex.getSplitLocation()), splitIndex.getStartOffset()); LOG.info("Processing split: " + split); org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input = new NewTrackingRecordReader<INKEY,INVALUE> (split, inputFormat, reporter, taskContext); job.setBoolean(JobContext.SKIP_RECORDS, isSkipping()); org.apache.hadoop.mapreduce.RecordWriter output = null; // get an output object if (job.getNumReduceTasks() == 0) { output = new NewDirectOutputCollector(taskContext, job, umbilical, reporter); } else { output = new NewOutputCollector(taskContext, job, umbilical, reporter); } org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE> mapContext = new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(), input, output, committer, reporter, split); org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context mapperContext = new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext( mapContext); try { input.initialize(split, mapperContext); mapper.run(mapperContext); mapPhase.complete(); setPhase(TaskStatus.Phase.SORT); statusUpdate(umbilical); input.close(); input = null; output.close(mapperContext); output = null; } finally { closeQuietly(input); closeQuietly(output, mapperContext); } }
Example #10
Source File: TestMapProgress.java From big-c with Apache License 2.0
public TestMapTask(String jobFile, TaskAttemptID taskId, int partition,
    TaskSplitIndex splitIndex, int numSlotsRequired) {
  super(jobFile, taskId, partition, splitIndex, numSlotsRequired);
}
Example #11
Source File: MapTask.java From big-c with Apache License 2.0
public MapTask(String jobFile, TaskAttemptID taskId, int partition,
    TaskSplitIndex splitIndex, int numSlotsRequired) {
  super(jobFile, taskId, partition, numSlotsRequired);
  this.splitMetaInfo = splitIndex;
}
Example #12
Source File: MapTask.java From big-c with Apache License 2.0
@SuppressWarnings("unchecked") private <INKEY,INVALUE,OUTKEY,OUTVALUE> void runOldMapper(final JobConf job, final TaskSplitIndex splitIndex, final TaskUmbilicalProtocol umbilical, TaskReporter reporter ) throws IOException, InterruptedException, ClassNotFoundException { InputSplit inputSplit = getSplitDetails(new Path(splitIndex.getSplitLocation()), splitIndex.getStartOffset()); updateJobWithSplit(job, inputSplit); reporter.setInputSplit(inputSplit); RecordReader<INKEY,INVALUE> in = isSkipping() ? new SkippingRecordReader<INKEY,INVALUE>(umbilical, reporter, job) : new TrackedRecordReader<INKEY,INVALUE>(reporter, job); job.setBoolean(JobContext.SKIP_RECORDS, isSkipping()); int numReduceTasks = conf.getNumReduceTasks(); LOG.info("numReduceTasks: " + numReduceTasks); MapOutputCollector<OUTKEY, OUTVALUE> collector = null; if (numReduceTasks > 0) { collector = createSortingCollector(job, reporter); } else { collector = new DirectMapOutputCollector<OUTKEY, OUTVALUE>(); MapOutputCollector.Context context = new MapOutputCollector.Context(this, job, reporter); collector.init(context); } MapRunnable<INKEY,INVALUE,OUTKEY,OUTVALUE> runner = ReflectionUtils.newInstance(job.getMapRunnerClass(), job); try { runner.run(in, new OldOutputCollector(collector, conf), reporter); mapPhase.complete(); // start the sort phase only if there are reducers if (numReduceTasks > 0) { setPhase(TaskStatus.Phase.SORT); } statusUpdate(umbilical); collector.flush(); in.close(); in = null; collector.close(); collector = null; } finally { closeQuietly(in); closeQuietly(collector); } }
Example #13
Source File: MapTask.java From big-c with Apache License 2.0
@SuppressWarnings("unchecked") private <INKEY,INVALUE,OUTKEY,OUTVALUE> void runNewMapper(final JobConf job, final TaskSplitIndex splitIndex, final TaskUmbilicalProtocol umbilical, TaskReporter reporter ) throws IOException, ClassNotFoundException, InterruptedException { // make a task context so we can get the classes org.apache.hadoop.mapreduce.TaskAttemptContext taskContext = new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job, getTaskID(), reporter); // make a mapper org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper = (org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>) ReflectionUtils.newInstance(taskContext.getMapperClass(), job); // make the input format org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat = (org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>) ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job); // rebuild the input split org.apache.hadoop.mapreduce.InputSplit split = null; split = getSplitDetails(new Path(splitIndex.getSplitLocation()), splitIndex.getStartOffset()); LOG.info("Processing split: " + split); org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input = new NewTrackingRecordReader<INKEY,INVALUE> (split, inputFormat, reporter, taskContext); job.setBoolean(JobContext.SKIP_RECORDS, isSkipping()); org.apache.hadoop.mapreduce.RecordWriter output = null; // get an output object if (job.getNumReduceTasks() == 0) { output = new NewDirectOutputCollector(taskContext, job, umbilical, reporter); } else { output = new NewOutputCollector(taskContext, job, umbilical, reporter); } org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE> mapContext = new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(), input, output, committer, reporter, split); org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context mapperContext = new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext( mapContext); try { input.initialize(split, mapperContext); mapper.run(mapperContext); mapPhase.complete(); setPhase(TaskStatus.Phase.SORT); statusUpdate(umbilical); input.close(); input = null; output.close(mapperContext); output = null; } finally { closeQuietly(input); closeQuietly(output, mapperContext); } }
Example #14
Source File: MRInput.java From tez with Apache License 2.0
@Private
void initializeInternal() throws IOException {
  // Primarily for visibility
  rrLock.lock();
  try {
    if (splitInfoViaEvents) {
      if (useNewApi) {
        mrReader = new MRReaderMapReduce(jobConf, getContext().getCounters(),
            inputRecordCounter, getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber(), getContext());
      } else {
        mrReader = new MRReaderMapred(jobConf, getContext().getCounters(),
            inputRecordCounter, getContext());
      }
    } else {
      TaskSplitMetaInfo thisTaskMetaInfo = MRInputUtils.getSplits(jobConf,
          getContext().getTaskIndex());
      TaskSplitIndex splitMetaInfo = new TaskSplitIndex(thisTaskMetaInfo.getSplitLocation(),
          thisTaskMetaInfo.getStartOffset());
      long splitLength = -1;
      if (useNewApi) {
        org.apache.hadoop.mapreduce.InputSplit newInputSplit = MRInputUtils
            .getNewSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                .findCounter(TaskCounter.SPLIT_RAW_BYTES));
        try {
          splitLength = newInputSplit.getLength();
        } catch (InterruptedException e) {
          LOG.warn("Got interrupted while reading split length: ", e);
        }
        mrReader = new MRReaderMapReduce(jobConf, newInputSplit, getContext().getCounters(),
            inputRecordCounter, getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber(), getContext());
      } else {
        org.apache.hadoop.mapred.InputSplit oldInputSplit = MRInputUtils
            .getOldSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                .findCounter(TaskCounter.SPLIT_RAW_BYTES));
        splitLength = oldInputSplit.getLength();
        mrReader = new MRReaderMapred(jobConf, oldInputSplit, getContext().getCounters(),
            inputRecordCounter, getContext());
      }
      if (splitLength != -1) {
        getContext().getCounters().findCounter(TaskCounter.INPUT_SPLIT_LENGTH_BYTES)
            .increment(splitLength);
      }
    }
  } finally {
    rrLock.unlock();
  }
  LOG.info("Initialized MRInput: " + getContext().getSourceVertexName());
}