org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex Java Examples
The following examples show how to use
org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex.
Each example is taken from an open-source project; you can go to the original project or source file by following the links above each example.
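Before the examples, here is a minimal sketch of how TaskSplitIndex itself is used, based only on the constructor and getters that appear in the examples below; the split file path and offset values are made up for illustration.

import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex;

// A TaskSplitIndex pairs the location of the serialized split file with the
// byte offset of one task's split inside it. Values below are hypothetical.
TaskSplitIndex splitIndex =
    new TaskSplitIndex("hdfs://nn:8020/staging/job_0001/job.split", 1024L);

String location = splitIndex.getSplitLocation(); // path to the serialized split data
long offset = splitIndex.getStartOffset();       // where this task's split begins

The readers in the examples below seek to getStartOffset() within the file named by getSplitLocation() and deserialize the InputSplit found there.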
Example #1
Source File: MRInputUtils.java From incubator-tez with Apache License 2.0
@SuppressWarnings("unchecked") public static org.apache.hadoop.mapreduce.InputSplit getNewSplitDetailsFromDisk( TaskSplitIndex splitMetaInfo, JobConf jobConf, TezCounter splitBytesCounter) throws IOException { Path file = new Path(splitMetaInfo.getSplitLocation()); long offset = splitMetaInfo.getStartOffset(); // Split information read from local filesystem. FileSystem fs = FileSystem.getLocal(jobConf); file = fs.makeQualified(file); LOG.info("Reading input split file from : " + file); FSDataInputStream inFile = fs.open(file); inFile.seek(offset); String className = Text.readString(inFile); Class<org.apache.hadoop.mapreduce.InputSplit> cls; try { cls = (Class<org.apache.hadoop.mapreduce.InputSplit>) jobConf.getClassByName(className); } catch (ClassNotFoundException ce) { IOException wrap = new IOException("Split class " + className + " not found"); wrap.initCause(ce); throw wrap; } SerializationFactory factory = new SerializationFactory(jobConf); Deserializer<org.apache.hadoop.mapreduce.InputSplit> deserializer = (Deserializer<org.apache.hadoop.mapreduce.InputSplit>) factory .getDeserializer(cls); deserializer.open(inFile); org.apache.hadoop.mapreduce.InputSplit split = deserializer.deserialize(null); long pos = inFile.getPos(); if (splitBytesCounter != null) { splitBytesCounter.increment(pos - offset); } inFile.close(); return split; }
Example #2
Source File: MRInputUtils.java From incubator-tez with Apache License 2.0
@SuppressWarnings("unchecked") public static InputSplit getOldSplitDetailsFromDisk(TaskSplitIndex splitMetaInfo, JobConf jobConf, TezCounter splitBytesCounter) throws IOException { Path file = new Path(splitMetaInfo.getSplitLocation()); FileSystem fs = FileSystem.getLocal(jobConf); file = fs.makeQualified(file); LOG.info("Reading input split file from : " + file); long offset = splitMetaInfo.getStartOffset(); FSDataInputStream inFile = fs.open(file); inFile.seek(offset); String className = Text.readString(inFile); Class<org.apache.hadoop.mapred.InputSplit> cls; try { cls = (Class<org.apache.hadoop.mapred.InputSplit>) jobConf.getClassByName(className); } catch (ClassNotFoundException ce) { IOException wrap = new IOException("Split class " + className + " not found"); wrap.initCause(ce); throw wrap; } SerializationFactory factory = new SerializationFactory(jobConf); Deserializer<org.apache.hadoop.mapred.InputSplit> deserializer = (Deserializer<org.apache.hadoop.mapred.InputSplit>) factory .getDeserializer(cls); deserializer.open(inFile); org.apache.hadoop.mapred.InputSplit split = deserializer.deserialize(null); long pos = inFile.getPos(); if (splitBytesCounter != null) { splitBytesCounter.increment(pos - offset); } inFile.close(); return split; }
Example #3
Source File: MRInput.java From incubator-tez with Apache License 2.0
@Private
void initializeInternal() throws IOException {
  // Primarily for visibility
  rrLock.lock();
  try {
    if (splitInfoViaEvents) {
      if (useNewApi) {
        mrReader = new MRReaderMapReduce(jobConf, getContext().getCounters(),
            inputRecordCounter, getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber());
      } else {
        mrReader = new MRReaderMapred(jobConf, getContext().getCounters(), inputRecordCounter);
      }
    } else {
      TaskSplitMetaInfo[] allMetaInfo = MRInputUtils.readSplits(jobConf);
      TaskSplitMetaInfo thisTaskMetaInfo = allMetaInfo[getContext().getTaskIndex()];
      TaskSplitIndex splitMetaInfo = new TaskSplitIndex(thisTaskMetaInfo.getSplitLocation(),
          thisTaskMetaInfo.getStartOffset());
      if (useNewApi) {
        org.apache.hadoop.mapreduce.InputSplit newInputSplit = MRInputUtils
            .getNewSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                .findCounter(TaskCounter.SPLIT_RAW_BYTES));
        mrReader = new MRReaderMapReduce(jobConf, newInputSplit, getContext().getCounters(),
            inputRecordCounter, getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber());
      } else {
        org.apache.hadoop.mapred.InputSplit oldInputSplit = MRInputUtils
            .getOldSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                .findCounter(TaskCounter.SPLIT_RAW_BYTES));
        mrReader = new MRReaderMapred(jobConf, oldInputSplit, getContext().getCounters(),
            inputRecordCounter);
      }
    }
  } finally {
    rrLock.unlock();
  }
  LOG.info("Initialized MRInput: " + getContext().getSourceVertexName());
}
Example #4
Source File: MRInputUtils.java From tez with Apache License 2.0
@SuppressWarnings("unchecked") public static org.apache.hadoop.mapreduce.InputSplit getNewSplitDetailsFromDisk( TaskSplitIndex splitMetaInfo, JobConf jobConf, TezCounter splitBytesCounter) throws IOException { Path file = new Path(splitMetaInfo.getSplitLocation()); long offset = splitMetaInfo.getStartOffset(); // Split information read from local filesystem. FileSystem fs = FileSystem.getLocal(jobConf); file = fs.makeQualified(file); LOG.info("Reading input split file from : " + file); FSDataInputStream inFile = fs.open(file); inFile.seek(offset); String className = Text.readString(inFile); Class<org.apache.hadoop.mapreduce.InputSplit> cls; try { cls = (Class<org.apache.hadoop.mapreduce.InputSplit>) jobConf.getClassByName(className); } catch (ClassNotFoundException ce) { IOException wrap = new IOException("Split class " + className + " not found"); wrap.initCause(ce); throw wrap; } SerializationFactory factory = new SerializationFactory(jobConf); Deserializer<org.apache.hadoop.mapreduce.InputSplit> deserializer = (Deserializer<org.apache.hadoop.mapreduce.InputSplit>) factory .getDeserializer(cls); deserializer.open(inFile); org.apache.hadoop.mapreduce.InputSplit split = deserializer.deserialize(null); long pos = inFile.getPos(); if (splitBytesCounter != null) { splitBytesCounter.increment(pos - offset); } inFile.close(); return split; }
Example #5
Source File: MRInputUtils.java From tez with Apache License 2.0
@SuppressWarnings("unchecked") public static InputSplit getOldSplitDetailsFromDisk(TaskSplitIndex splitMetaInfo, JobConf jobConf, TezCounter splitBytesCounter) throws IOException { Path file = new Path(splitMetaInfo.getSplitLocation()); FileSystem fs = FileSystem.getLocal(jobConf); file = fs.makeQualified(file); LOG.info("Reading input split file from : " + file); long offset = splitMetaInfo.getStartOffset(); FSDataInputStream inFile = fs.open(file); inFile.seek(offset); String className = Text.readString(inFile); Class<org.apache.hadoop.mapred.InputSplit> cls; try { cls = (Class<org.apache.hadoop.mapred.InputSplit>) jobConf.getClassByName(className); } catch (ClassNotFoundException ce) { IOException wrap = new IOException("Split class " + className + " not found"); wrap.initCause(ce); throw wrap; } SerializationFactory factory = new SerializationFactory(jobConf); Deserializer<org.apache.hadoop.mapred.InputSplit> deserializer = (Deserializer<org.apache.hadoop.mapred.InputSplit>) factory .getDeserializer(cls); deserializer.open(inFile); org.apache.hadoop.mapred.InputSplit split = deserializer.deserialize(null); long pos = inFile.getPos(); if (splitBytesCounter != null) { splitBytesCounter.increment(pos - offset); } inFile.close(); return split; }
Example #6
Source File: TestMapProgress.java From hadoop with Apache License 2.0
public TestMapTask(String jobFile, TaskAttemptID taskId, int partition,
    TaskSplitIndex splitIndex, int numSlotsRequired) {
  super(jobFile, taskId, partition, splitIndex, numSlotsRequired);
}
Example #7
Source File: MapTask.java From hadoop with Apache License 2.0
public MapTask(String jobFile, TaskAttemptID taskId, int partition,
    TaskSplitIndex splitIndex, int numSlotsRequired) {
  super(jobFile, taskId, partition, numSlotsRequired);
  this.splitMetaInfo = splitIndex;
}
Example #8
Source File: MapTask.java From hadoop with Apache License 2.0
@SuppressWarnings("unchecked") private <INKEY,INVALUE,OUTKEY,OUTVALUE> void runOldMapper(final JobConf job, final TaskSplitIndex splitIndex, final TaskUmbilicalProtocol umbilical, TaskReporter reporter ) throws IOException, InterruptedException, ClassNotFoundException { InputSplit inputSplit = getSplitDetails(new Path(splitIndex.getSplitLocation()), splitIndex.getStartOffset()); updateJobWithSplit(job, inputSplit); reporter.setInputSplit(inputSplit); RecordReader<INKEY,INVALUE> in = isSkipping() ? new SkippingRecordReader<INKEY,INVALUE>(umbilical, reporter, job) : new TrackedRecordReader<INKEY,INVALUE>(reporter, job); job.setBoolean(JobContext.SKIP_RECORDS, isSkipping()); int numReduceTasks = conf.getNumReduceTasks(); LOG.info("numReduceTasks: " + numReduceTasks); MapOutputCollector<OUTKEY, OUTVALUE> collector = null; if (numReduceTasks > 0) { collector = createSortingCollector(job, reporter); } else { collector = new DirectMapOutputCollector<OUTKEY, OUTVALUE>(); MapOutputCollector.Context context = new MapOutputCollector.Context(this, job, reporter); collector.init(context); } MapRunnable<INKEY,INVALUE,OUTKEY,OUTVALUE> runner = ReflectionUtils.newInstance(job.getMapRunnerClass(), job); try { runner.run(in, new OldOutputCollector(collector, conf), reporter); mapPhase.complete(); // start the sort phase only if there are reducers if (numReduceTasks > 0) { setPhase(TaskStatus.Phase.SORT); } statusUpdate(umbilical); collector.flush(); in.close(); in = null; collector.close(); collector = null; } finally { closeQuietly(in); closeQuietly(collector); } }
Example #9
Source File: MapTask.java From hadoop with Apache License 2.0
@SuppressWarnings("unchecked") private <INKEY,INVALUE,OUTKEY,OUTVALUE> void runNewMapper(final JobConf job, final TaskSplitIndex splitIndex, final TaskUmbilicalProtocol umbilical, TaskReporter reporter ) throws IOException, ClassNotFoundException, InterruptedException { // make a task context so we can get the classes org.apache.hadoop.mapreduce.TaskAttemptContext taskContext = new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job, getTaskID(), reporter); // make a mapper org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper = (org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>) ReflectionUtils.newInstance(taskContext.getMapperClass(), job); // make the input format org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat = (org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>) ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job); // rebuild the input split org.apache.hadoop.mapreduce.InputSplit split = null; split = getSplitDetails(new Path(splitIndex.getSplitLocation()), splitIndex.getStartOffset()); LOG.info("Processing split: " + split); org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input = new NewTrackingRecordReader<INKEY,INVALUE> (split, inputFormat, reporter, taskContext); job.setBoolean(JobContext.SKIP_RECORDS, isSkipping()); org.apache.hadoop.mapreduce.RecordWriter output = null; // get an output object if (job.getNumReduceTasks() == 0) { output = new NewDirectOutputCollector(taskContext, job, umbilical, reporter); } else { output = new NewOutputCollector(taskContext, job, umbilical, reporter); } org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE> mapContext = new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(), input, output, committer, reporter, split); org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context mapperContext = new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext( mapContext); try { input.initialize(split, mapperContext); mapper.run(mapperContext); mapPhase.complete(); setPhase(TaskStatus.Phase.SORT); statusUpdate(umbilical); input.close(); input = null; output.close(mapperContext); output = null; } finally { closeQuietly(input); closeQuietly(output, mapperContext); } }
Example #10
Source File: TestMapProgress.java From big-c with Apache License 2.0
public TestMapTask(String jobFile, TaskAttemptID taskId, int partition,
    TaskSplitIndex splitIndex, int numSlotsRequired) {
  super(jobFile, taskId, partition, splitIndex, numSlotsRequired);
}
Example #11
Source File: MapTask.java From big-c with Apache License 2.0
public MapTask(String jobFile, TaskAttemptID taskId, int partition,
    TaskSplitIndex splitIndex, int numSlotsRequired) {
  super(jobFile, taskId, partition, numSlotsRequired);
  this.splitMetaInfo = splitIndex;
}
Example #12
Source File: MapTask.java From big-c with Apache License 2.0
@SuppressWarnings("unchecked") private <INKEY,INVALUE,OUTKEY,OUTVALUE> void runOldMapper(final JobConf job, final TaskSplitIndex splitIndex, final TaskUmbilicalProtocol umbilical, TaskReporter reporter ) throws IOException, InterruptedException, ClassNotFoundException { InputSplit inputSplit = getSplitDetails(new Path(splitIndex.getSplitLocation()), splitIndex.getStartOffset()); updateJobWithSplit(job, inputSplit); reporter.setInputSplit(inputSplit); RecordReader<INKEY,INVALUE> in = isSkipping() ? new SkippingRecordReader<INKEY,INVALUE>(umbilical, reporter, job) : new TrackedRecordReader<INKEY,INVALUE>(reporter, job); job.setBoolean(JobContext.SKIP_RECORDS, isSkipping()); int numReduceTasks = conf.getNumReduceTasks(); LOG.info("numReduceTasks: " + numReduceTasks); MapOutputCollector<OUTKEY, OUTVALUE> collector = null; if (numReduceTasks > 0) { collector = createSortingCollector(job, reporter); } else { collector = new DirectMapOutputCollector<OUTKEY, OUTVALUE>(); MapOutputCollector.Context context = new MapOutputCollector.Context(this, job, reporter); collector.init(context); } MapRunnable<INKEY,INVALUE,OUTKEY,OUTVALUE> runner = ReflectionUtils.newInstance(job.getMapRunnerClass(), job); try { runner.run(in, new OldOutputCollector(collector, conf), reporter); mapPhase.complete(); // start the sort phase only if there are reducers if (numReduceTasks > 0) { setPhase(TaskStatus.Phase.SORT); } statusUpdate(umbilical); collector.flush(); in.close(); in = null; collector.close(); collector = null; } finally { closeQuietly(in); closeQuietly(collector); } }
Example #13
Source File: MapTask.java From big-c with Apache License 2.0
@SuppressWarnings("unchecked") private <INKEY,INVALUE,OUTKEY,OUTVALUE> void runNewMapper(final JobConf job, final TaskSplitIndex splitIndex, final TaskUmbilicalProtocol umbilical, TaskReporter reporter ) throws IOException, ClassNotFoundException, InterruptedException { // make a task context so we can get the classes org.apache.hadoop.mapreduce.TaskAttemptContext taskContext = new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job, getTaskID(), reporter); // make a mapper org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper = (org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>) ReflectionUtils.newInstance(taskContext.getMapperClass(), job); // make the input format org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat = (org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>) ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job); // rebuild the input split org.apache.hadoop.mapreduce.InputSplit split = null; split = getSplitDetails(new Path(splitIndex.getSplitLocation()), splitIndex.getStartOffset()); LOG.info("Processing split: " + split); org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input = new NewTrackingRecordReader<INKEY,INVALUE> (split, inputFormat, reporter, taskContext); job.setBoolean(JobContext.SKIP_RECORDS, isSkipping()); org.apache.hadoop.mapreduce.RecordWriter output = null; // get an output object if (job.getNumReduceTasks() == 0) { output = new NewDirectOutputCollector(taskContext, job, umbilical, reporter); } else { output = new NewOutputCollector(taskContext, job, umbilical, reporter); } org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE> mapContext = new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(), input, output, committer, reporter, split); org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context mapperContext = new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext( mapContext); try { input.initialize(split, mapperContext); mapper.run(mapperContext); mapPhase.complete(); setPhase(TaskStatus.Phase.SORT); statusUpdate(umbilical); input.close(); input = null; output.close(mapperContext); output = null; } finally { closeQuietly(input); closeQuietly(output, mapperContext); } }
Example #14
Source File: MRInput.java From tez with Apache License 2.0
@Private
void initializeInternal() throws IOException {
  // Primarily for visibility
  rrLock.lock();
  try {
    if (splitInfoViaEvents) {
      if (useNewApi) {
        mrReader = new MRReaderMapReduce(jobConf, getContext().getCounters(),
            inputRecordCounter, getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber(), getContext());
      } else {
        mrReader = new MRReaderMapred(jobConf, getContext().getCounters(),
            inputRecordCounter, getContext());
      }
    } else {
      TaskSplitMetaInfo thisTaskMetaInfo = MRInputUtils.getSplits(jobConf,
          getContext().getTaskIndex());
      TaskSplitIndex splitMetaInfo = new TaskSplitIndex(thisTaskMetaInfo.getSplitLocation(),
          thisTaskMetaInfo.getStartOffset());
      long splitLength = -1;
      if (useNewApi) {
        org.apache.hadoop.mapreduce.InputSplit newInputSplit = MRInputUtils
            .getNewSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                .findCounter(TaskCounter.SPLIT_RAW_BYTES));
        try {
          splitLength = newInputSplit.getLength();
        } catch (InterruptedException e) {
          LOG.warn("Got interrupted while reading split length: ", e);
        }
        mrReader = new MRReaderMapReduce(jobConf, newInputSplit, getContext().getCounters(),
            inputRecordCounter, getContext().getApplicationId().getClusterTimestamp(),
            getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
            getContext().getTaskIndex(), getContext().getTaskAttemptNumber(), getContext());
      } else {
        org.apache.hadoop.mapred.InputSplit oldInputSplit = MRInputUtils
            .getOldSplitDetailsFromDisk(splitMetaInfo, jobConf, getContext().getCounters()
                .findCounter(TaskCounter.SPLIT_RAW_BYTES));
        splitLength = oldInputSplit.getLength();
        mrReader = new MRReaderMapred(jobConf, oldInputSplit, getContext().getCounters(),
            inputRecordCounter, getContext());
      }
      if (splitLength != -1) {
        getContext().getCounters().findCounter(TaskCounter.INPUT_SPLIT_LENGTH_BYTES)
            .increment(splitLength);
      }
    }
  } finally {
    rrLock.unlock();
  }
  LOG.info("Initialized MRInput: " + getContext().getSourceVertexName());
}