org.apache.tez.mapreduce.input.MRInput Java Examples

The following examples show how to use org.apache.tez.mapreduce.input.MRInput. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: POSimpleTezLoad.java From spork with Apache License 2.0

6 votes

/**
 * Binds this loader to the logical input registered under {@code inputKey} and opens its reader.
 *
 * @param inputs logical inputs, keyed by name
 * @param conf configuration stored on this operator
 * @throws ExecException if the input under {@code inputKey} is absent or is not an
 *         {@code MRInput}, or if an {@code IOException} occurs while setting up the reader
 */
@Override
public void attachInputs(Map<String, LogicalInput> inputs,
        Configuration conf)
        throws ExecException {
    this.conf = conf;
    LogicalInput candidate = inputs.get(inputKey);
    // instanceof evaluates to false for null, so a missing input is rejected here too.
    if (!(candidate instanceof MRInput)) {
        throw new ExecException("POSimpleTezLoad only accepts MRInputs");
    }
    input = (MRInput) candidate;
    try {
        reader = input.getReader();
        if (!(reader instanceof MRReader)) {
            return;
        }
        // Publish the split index: MergeCoGroup needs it, and this input is the only
        // input of the MergeCoGroup vertex.
        MRReader mrReader = (MRReader) reader;
        PigSplit split = (PigSplit) mrReader.getSplit();
        int splitIndex = split.getSplitIndex();
        PigMapReduce.sJobContext.getConfiguration().setInt(PigImplConstants.PIG_SPLIT_INDEX, splitIndex);
    } catch (IOException e) {
        throw new ExecException(e);
    }
}

Example #2

Source File: WordCount.java From incubator-tez with Apache License 2.0

6 votes

/**
 * Splits every value read from the single input into tokens and writes each token
 * together with {@code one} to the single output.
 *
 * @throws Exception propagated from the reader or writer
 */
@Override
public void run() throws Exception {
  Preconditions.checkArgument(getInputs().size() == 1);
  Preconditions.checkArgument(getOutputs().size() == 1);
  MRInput mrInput = (MRInput) getInputs().values().iterator().next();
  KeyValueReader lineReader = mrInput.getReader();
  Output sink = getOutputs().values().iterator().next();
  KeyValueWriter tokenWriter = (KeyValueWriter) sink.getWriter();
  while (lineReader.next()) {
    String line = lineReader.getCurrentValue().toString();
    for (StringTokenizer tokens = new StringTokenizer(line); tokens.hasMoreTokens(); ) {
      word.set(tokens.nextToken());
      tokenWriter.write(word, one);
    }
  }
}

Example #3

Source File: YARNRunner.java From tez with Apache License 2.0

6 votes

/**
 * Creates a data source descriptor for {@code MRInput} or {@code MRInputLegacy} whose user
 * payload is built from {@code conf}.
 *
 * @param conf configuration serialized into the input payload
 * @param useLegacyInput when true the descriptor names {@code MRInputLegacy}, otherwise
 *        {@code MRInput}
 * @return the data source descriptor; history text is attached when the corresponding
 *         runtime configuration flag is enabled
 * @throws TezUncheckedException if building the payload fails with an {@code IOException}
 */
@Private
private static DataSourceDescriptor configureMRInputWithLegacySplitsGenerated(Configuration conf,
                                                                              boolean useLegacyInput) {
  String inputClassName =
      useLegacyInput ? MRInputLegacy.class.getName() : MRInput.class.getName();

  InputDescriptor inputDescriptor;
  try {
    inputDescriptor = InputDescriptor.create(inputClassName)
        .setUserPayload(MRInputHelpersInternal.createMRInputPayload(conf, null));
  } catch (IOException e) {
    throw new TezUncheckedException(e);
  }

  DataSourceDescriptor dsd = DataSourceDescriptor.create(inputDescriptor, null, null);

  boolean attachHistoryText = conf.getBoolean(
      TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT,
      TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT_DEFAULT);
  if (attachHistoryText) {
    dsd.getInputDescriptor().setHistoryText(TezUtils.convertToHistoryText(conf));
  }
  return dsd;
}

Example #4

Source File: WordCount.java From incubator-tez with Apache License 2.0

5 votes

/**
 * Builds the two-vertex "WordCount" DAG: a "tokenizer" vertex reading {@code inputPath}
 * through MRInput, connected to a "summer" vertex writing {@code outputPath} through MROutput.
 *
 * @param fs not used by this method
 * @param tezConf base configuration copied for the input and output configurations
 * @param localResources not used by this method
 * @param stagingDir not used by this method
 * @param inputPath value set for the input directory key
 * @param outputPath value set for the output directory key
 * @return the assembled DAG
 * @throws IOException declared by the signature
 */
private DAG createDAG(FileSystem fs, TezConfiguration tezConf,
    Map<String, LocalResource> localResources, Path stagingDir,
    String inputPath, String outputPath) throws IOException {

  // Source side: text input.
  Configuration sourceConf = new Configuration(tezConf);
  sourceConf.set(FileInputFormat.INPUT_DIR, inputPath);
  InputDescriptor sourceDescriptor = new InputDescriptor(MRInput.class.getName())
      .setUserPayload(MRInput.createUserPayload(sourceConf,
          TextInputFormat.class.getName(), true, true));

  // Sink side: text output.
  Configuration sinkConf = new Configuration(tezConf);
  sinkConf.set(FileOutputFormat.OUTDIR, outputPath);
  OutputDescriptor sinkDescriptor = new OutputDescriptor(MROutput.class.getName())
      .setUserPayload(MROutput.createUserPayload(
          sinkConf, TextOutputFormat.class.getName(), true));

  ProcessorDescriptor tokenProcessor = new ProcessorDescriptor(TokenProcessor.class.getName());
  Vertex tokenizerVertex =
      new Vertex("tokenizer", tokenProcessor, -1, MRHelpers.getMapResource(tezConf));
  tokenizerVertex.addInput("MRInput", sourceDescriptor, MRInputAMSplitGenerator.class);

  ProcessorDescriptor sumProcessor = new ProcessorDescriptor(SumProcessor.class.getName());
  Vertex summerVertex =
      new Vertex("summer", sumProcessor, 1, MRHelpers.getReduceResource(tezConf));
  summerVertex.addOutput("MROutput", sinkDescriptor, MROutputCommitter.class);

  // Text keys and IntWritable values, hash-partitioned between the two vertices.
  OrderedPartitionedKVEdgeConfigurer shuffleConf = OrderedPartitionedKVEdgeConfigurer
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName(), null)
      .build();

  DAG dag = new DAG("WordCount");
  dag.addVertex(tokenizerVertex)
      .addVertex(summerVertex)
      .addEdge(
          new Edge(tokenizerVertex, summerVertex, shuffleConf.createDefaultEdgeProperty()));
  return dag;
}

Example #5

Source File: TestHistoryParser.java From tez with Apache License 2.0

4 votes

/**
 * Builds a two-vertex word-count DAG, submits it through a {@code TezClient}, waits for it
 * to complete and returns the DAG id as a string.
 *
 * <p>The client is stopped in a {@code finally} block so that it is released even when
 * submission or waiting for completion fails.
 *
 * @param tokenizerProcessor class name of the processor used by the tokenizer vertex
 * @param summationProcessor class name of the processor used by the summation vertex
 * @param dagName name given to the DAG
 * @param withTimeline forwarded to {@code getTezClient}
 * @return string form of the {@code TezDAGID} built from the application id and index 1
 * @throws Exception propagated from building, submitting or waiting for the DAG
 */
private String runWordCount(String tokenizerProcessor, String summationProcessor,
    String dagName, boolean withTimeline)
    throws Exception {
  //HDFS path
  Path outputLoc = new Path("/tmp/outPath_" + System.currentTimeMillis());

  DataSourceDescriptor dataSource = MRInput.createConfigBuilder(conf,
      TextInputFormat.class, inputLoc.toString()).build();

  DataSinkDescriptor dataSink =
      MROutput.createConfigBuilder(conf, TextOutputFormat.class, outputLoc.toString()).build();

  Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create(
      tokenizerProcessor)).addDataSource(INPUT, dataSource);

  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName()).build();

  Vertex summationVertex = Vertex.create(SUMMATION,
      ProcessorDescriptor.create(summationProcessor), 1).addDataSink(OUTPUT, dataSink);

  // Create DAG and add the vertices. Connect the producer and consumer vertices via the edge
  DAG dag = DAG.create(dagName);
  dag.addVertex(tokenizerVertex).addVertex(summationVertex).addEdge(
      Edge.create(tokenizerVertex, summationVertex, edgeConf.createDefaultEdgeProperty()));

  TezClient tezClient = getTezClient(withTimeline);
  try {
    // Update Caller Context
    CallerContext callerContext = CallerContext.create("TezExamples", "Tez WordCount Example Job");
    ApplicationId appId = tezClient.getAppMasterApplicationId();
    if (appId == null) {
      // Fall back to a fixed application id when the client does not report one.
      appId = ApplicationId.newInstance(1001L, 1);
    }
    callerContext.setCallerIdAndType(appId.toString(), "TezApplication");
    dag.setCallerContext(callerContext);

    DAGClient client = tezClient.submitDAG(dag);
    client.waitForCompletionWithStatusUpdates(Sets.newHashSet(StatusGetOpts.GET_COUNTERS));
    TezDAGID tezDAGID = TezDAGID.getInstance(tezClient.getAppMasterApplicationId(), 1);
    return tezDAGID.toString();
  } finally {
    // The client was already dereferenced above, so no null check is needed here.
    tezClient.stop();
  }
}

Example #6

Source File: TopK.java From sequenceiq-samples with Apache License 2.0

4 votes

/**
 * Builds the three-vertex "topk" DAG: tokenizer, sum and writer.
 *
 * @param tezConf base configuration copied for the data source and the data sink
 * @param inputPath input location given to the MRInput config builder
 * @param outputPath output location given to the MROutput config builder
 * @param columnIndex integer, as text, placed in the tokenizer processor payload
 * @param top integer, as text, placed in the sum and writer processor payloads
 * @param numPartitions integer, as text, used as the task count of the sum vertex
 * @return the assembled DAG
 * @throws IOException declared by the signature
 * @throws NumberFormatException if one of the textual integers cannot be parsed
 */
private DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath,
        String columnIndex, String top, String numPartitions) throws IOException {

    DataSourceDescriptor textSource = MRInput
            .createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath)
            .build();

    DataSinkDescriptor textSink = MROutput
            .createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath)
            .build();

    ProcessorDescriptor tokenProcessor = ProcessorDescriptor.create(TokenProcessor.class.getName())
            .setUserPayload(createPayload(Integer.valueOf(columnIndex)));
    Vertex tokenizerVertex = Vertex.create(TOKENIZER, tokenProcessor)
            .addDataSource(INPUT, textSource);

    int topK = Integer.valueOf(top);
    ProcessorDescriptor sumProcessor = ProcessorDescriptor.create(SumProcessor.class.getName())
            .setUserPayload(createPayload(topK));
    Vertex sumVertex = Vertex.create(SUM, sumProcessor, Integer.valueOf(numPartitions));

    // The writer runs as a single task because it needs the global picture of the data
    // set: several writer tasks would each emit the top K of their own partition and so
    // produce several top-K lists.
    ProcessorDescriptor writerProcessor = ProcessorDescriptor.create(Writer.class.getName())
            .setUserPayload(createPayload(topK));
    Vertex writerVertex = Vertex.create(WRITER, writerProcessor, 1)
            .addDataSink(OUTPUT, textSink);

    OrderedPartitionedKVEdgeConfig tokenSumEdge = OrderedPartitionedKVEdgeConfig
            .newBuilder(Text.class.getName(), IntWritable.class.getName(),
                    HashPartitioner.class.getName())
            .build();

    UnorderedKVEdgeConfig sumWriterEdge = UnorderedKVEdgeConfig
            .newBuilder(IntWritable.class.getName(), Text.class.getName())
            .build();

    return DAG.create("topk")
            .addVertex(tokenizerVertex)
            .addVertex(sumVertex)
            .addVertex(writerVertex)
            .addEdge(Edge.create(tokenizerVertex, sumVertex, tokenSumEdge.createDefaultEdgeProperty()))
            .addEdge(Edge.create(sumVertex, writerVertex, sumWriterEdge.createDefaultBroadcastEdgeProperty()));
}

Example #7

Source File: TestMRInputAMSplitGenerator.java From tez with Apache License 2.0

4 votes

/**
 * Runs {@code MRInputAMSplitGenerator} over 50 test splits and checks the emitted events for
 * the given combination of split grouping and split sorting.
 *
 * @param groupSplitsEnabled passed to the config builder's {@code groupSplits}; when true each
 *        deserialized split is expected to be a {@code TezGroupedSplit}
 * @param sortSplitsEnabled passed to the config builder's {@code sortSplits}; when true split
 *        lengths must be non-increasing, otherwise at least one increase must be observed
 * @throws Exception propagated from the split generator or payload parsing
 */
private void testGroupSplitsAndSortSplits(boolean groupSplitsEnabled,
    boolean sortSplitsEnabled) throws Exception {
  Configuration conf = new Configuration();
  // Split lengths 1000, 2000, ..., 50000, handed over through the SPLITS_LENGTHS key.
  String[] splitLengths = new String[50];
  for (int i = 0; i < splitLengths.length; i++) {
    splitLengths[i] = Integer.toString(1000 * (i + 1));
  }
  conf.setStrings(SPLITS_LENGTHS, splitLengths);
  DataSourceDescriptor dataSource = MRInput.createConfigBuilder(
      conf, InputFormatForTest.class).
      groupSplits(groupSplitsEnabled).sortSplits(sortSplitsEnabled).build();
  UserPayload userPayload = dataSource.getInputDescriptor().getUserPayload();

  InputInitializerContext context =
      new TezTestUtils.TezRootInputInitializerContextForTest(userPayload, new Configuration(false));
  MRInputAMSplitGenerator splitGenerator =
      new MRInputAMSplitGenerator(context);

  List<Event> events = splitGenerator.initialize();

  // The first event configures the vertex tasks; every following event carries one split.
  assertTrue(events.get(0) instanceof InputConfigureVertexTasksEvent);
  boolean shuffled = false;
  InputSplit previousIs = null;
  int numRawInputSplits = 0;
  for (int i = 1; i < events.size(); i++) {
    assertTrue(events.get(i) instanceof InputDataInformationEvent);
    InputDataInformationEvent diEvent = (InputDataInformationEvent) (events.get(i));
    assertNull(diEvent.getDeserializedUserPayload());
    assertNotNull(diEvent.getUserPayload());
    // Rebuild the split from the serialized payload of the event.
    MRSplitProto eventProto = MRSplitProto.parseFrom(ByteString.copyFrom(
        diEvent.getUserPayload()));
    InputSplit is = MRInputUtils.getNewSplitDetailsFromEvent(
        eventProto, new Configuration());
    if (groupSplitsEnabled) {
      // A grouped split contributes as many raw splits as it wraps.
      numRawInputSplits += ((TezGroupedSplit)is).getGroupedSplits().size();
      for (InputSplit inputSplit : ((TezGroupedSplit)is).getGroupedSplits()) {
        assertTrue(inputSplit instanceof InputSplitForTest);
      }
      assertTrue(((TezGroupedSplit)is).getGroupedSplits().get(0)
          instanceof InputSplitForTest);
    } else {
      numRawInputSplits++;
      assertTrue(is instanceof InputSplitForTest);
    }
    // The splits in the list returned from InputFormat has ascending
    // size in order.
    // If sortSplitsEnabled is true, MRInputAMSplitGenerator will sort the
    // splits in descending order.
    // If sortSplitsEnabled is false, MRInputAMSplitGenerator will shuffle
    // the splits.
    if (previousIs != null) {
      if (sortSplitsEnabled) {
        assertTrue(is.getLength() <= previousIs.getLength());
      } else {
        // Any increase in length between neighbours counts as evidence of shuffling.
        shuffled |= (is.getLength() > previousIs.getLength());
      }
    }
    previousIs = is;
  }
  // Every raw split must be accounted for, whether grouped or not.
  assertEquals(splitLengths.length, numRawInputSplits);
  if (!sortSplitsEnabled) {
    assertTrue(shuffled);
  }
}

Example #8

Source File: MRInputBase.java From tez with Apache License 2.0

4 votes

/**
 * Builds {@code jobConf} from the container configuration and the MRInput user payload, then
 * records task, vertex, DAG and application identifiers from the context in it.
 *
 * @return always {@code null}
 * @throws IOException declared by the signature; presumably raised by payload parsing or
 *         credential lookup — confirm against the called helpers
 * @throws IllegalArgumentException if the user payload carries split information
 */
public List<Event> initialize() throws IOException {
  getContext().requestInitialMemory(0l, null); // mandatory call
  MRRuntimeProtos.MRInputUserPayloadProto mrUserPayload =
      MRInputHelpers.parseMRInputPayload(getContext().getUserPayload());
  boolean isGrouped = mrUserPayload.getGroupingEnabled();
  Preconditions.checkArgument(mrUserPayload.hasSplits() == false,
      "Split information not expected in " + this.getClass().getName());

  // Container configuration first, then the configuration bytes from the payload on top.
  Configuration conf = new JobConf(getContext().getContainerConfiguration());
  TezUtils.addToConfFromByteString(conf, mrUserPayload.getConfigurationBytes());
  this.jobConf = new JobConf(conf);
  useNewApi = this.jobConf.getUseNewMapper();
  if (isGrouped) {
    // With grouping enabled the input format is replaced by the grouped-splits input
    // format of the API (mapreduce vs. mapred) in use.
    if (useNewApi) {
      jobConf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR,
          org.apache.hadoop.mapreduce.split.TezGroupedSplitsInputFormat.class.getName());
    } else {
      jobConf.set("mapred.input.format.class",
          org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat.class.getName());
    }
  }


  // Add tokens to the jobConf - in case they are accessed within the RR / IF
  jobConf.getCredentials().mergeAll(UserGroupInformation.getCurrentUser().getCredentials());

  // MapReduce-style attempt id derived from the application id, task index and attempt number.
  TaskAttemptID taskAttemptId = new TaskAttemptID(
      new TaskID(
          Long.toString(getContext().getApplicationId().getClusterTimestamp()),
          getContext().getApplicationId().getId(), TaskType.MAP,
          getContext().getTaskIndex()),
      getContext().getTaskAttemptNumber());

  jobConf.set(MRJobConfig.TASK_ATTEMPT_ID,
      taskAttemptId.toString());
  jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID,
      getContext().getDAGAttemptNumber());
  // Tez-specific indices and names taken from the context.
  jobConf.setInt(MRInput.TEZ_MAPREDUCE_DAG_INDEX, getContext().getDagIdentifier());
  jobConf.setInt(MRInput.TEZ_MAPREDUCE_VERTEX_INDEX, getContext().getTaskVertexIndex());
  jobConf.setInt(MRInput.TEZ_MAPREDUCE_TASK_INDEX, getContext().getTaskIndex());
  jobConf.setInt(MRInput.TEZ_MAPREDUCE_TASK_ATTEMPT_INDEX, getContext().getTaskAttemptNumber());
  jobConf.set(MRInput.TEZ_MAPREDUCE_DAG_NAME, getContext().getDAGName());
  jobConf.set(MRInput.TEZ_MAPREDUCE_VERTEX_NAME, getContext().getTaskVertexName());
  jobConf.setInt(MRInput.TEZ_MAPREDUCE_INPUT_INDEX, getContext().getInputIndex());
  jobConf.set(MRInput.TEZ_MAPREDUCE_INPUT_NAME, getContext().getSourceVertexName());
  jobConf.set(MRInput.TEZ_MAPREDUCE_APPLICATION_ID, getContext().getApplicationId().toString());
  jobConf.set(MRInput.TEZ_MAPREDUCE_UNIQUE_IDENTIFIER, getContext().getUniqueIdentifier());
  jobConf.setInt(MRInput.TEZ_MAPREDUCE_DAG_ATTEMPT_NUMBER, getContext().getDAGAttemptNumber());

  // Tez ids are built from the outside in: DAG, vertex, task, task attempt.
  TezDAGID tezDAGID = TezDAGID.getInstance(getContext().getApplicationId(), getContext().getDagIdentifier());
  TezVertexID tezVertexID = TezVertexID.getInstance(tezDAGID, getContext().getTaskVertexIndex());
  TezTaskID tezTaskID = TezTaskID.getInstance(tezVertexID, getContext().getTaskIndex());
  TezTaskAttemptID tezTaskAttemptID = TezTaskAttemptID.getInstance(tezTaskID, getContext().getTaskAttemptNumber());
  jobConf.set(MRInput.TEZ_MAPREDUCE_DAG_ID, tezDAGID.toString());
  jobConf.set(MRInput.TEZ_MAPREDUCE_VERTEX_ID, tezVertexID.toString());
  jobConf.set(MRInput.TEZ_MAPREDUCE_TASK_ID, tezTaskID.toString());
  jobConf.set(MRInput.TEZ_MAPREDUCE_TASK_ATTEMPT_ID, tezTaskAttemptID.toString());

  this.inputRecordCounter = getContext().getCounters().findCounter(
      TaskCounter.INPUT_RECORDS_PROCESSED);


  return null;
}

Example #9

Source File: MapProcessor.java From tez with Apache License 2.0

4 votes

/**
 * Keeps the given input and the reader obtained from it.
 *
 * @param in the input whose reader is used
 * @throws IOException if the reader cannot be obtained from {@code in}
 */
private NewRecordReader(MRInput in) throws IOException {
  this.reader = in.getReader();
  this.in = in;
}

Example #10

Source File: UnionExample.java From tez with Apache License 2.0

4 votes

/**
 * Builds the "UnionExample" DAG: three map vertices reading the same data source, a vertex
 * group over the first two of them, and a single-task checker vertex fed by the group and
 * by the third map vertex.
 *
 * @param fs not used by this method
 * @param tezConf base configuration copied for every input/output configuration
 * @param localResources not used by this method
 * @param stagingDir not used by this method
 * @param inputPath value set for the input directory key
 * @param outputPath output directory; also the prefix of the "-all-parts" and "-parts" outputs
 * @return the assembled DAG
 * @throws IOException declared by the signature
 */
private DAG createDAG(FileSystem fs, TezConfiguration tezConf,
    Map<String, LocalResource> localResources, Path stagingDir,
    String inputPath, String outputPath) throws IOException {
  DAG dag = DAG.create("UnionExample");

  // Task-count argument shared by all three map vertices.
  int numMaps = -1;

  // The input format is supplied through configuration keys, so no class is passed
  // to the config builder.
  Configuration sourceConf = new Configuration(tezConf);
  sourceConf.setBoolean("mapred.mapper.new-api", false);
  sourceConf.set("mapred.input.format.class", TextInputFormat.class.getName());
  sourceConf.set(FileInputFormat.INPUT_DIR, inputPath);
  MRInput.MRInputConfigBuilder sourceBuilder = MRInput.createConfigBuilder(sourceConf, null);
  DataSourceDescriptor dataSource = sourceBuilder.generateSplitsInAM(false).build();

  ProcessorDescriptor map1Processor = ProcessorDescriptor.create(TokenProcessor.class.getName());
  Vertex mapVertex1 = Vertex.create("map1", map1Processor, numMaps)
      .addDataSource("MRInput", dataSource);

  ProcessorDescriptor map2Processor = ProcessorDescriptor.create(TokenProcessor.class.getName());
  Vertex mapVertex2 = Vertex.create("map2", map2Processor, numMaps)
      .addDataSource("MRInput", dataSource);

  ProcessorDescriptor map3Processor = ProcessorDescriptor.create(TokenProcessor.class.getName());
  Vertex mapVertex3 = Vertex.create("map3", map3Processor, numMaps)
      .addDataSource("MRInput", dataSource);

  ProcessorDescriptor checkerProcessor =
      ProcessorDescriptor.create(UnionProcessor.class.getName());
  Vertex checkerVertex = Vertex.create("checker", checkerProcessor, 1);

  // Sink "union" on the checker: the output format is again supplied through
  // configuration keys.
  Configuration unionSinkConf = new Configuration(tezConf);
  unionSinkConf.setBoolean("mapred.reducer.new-api", false);
  unionSinkConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
  unionSinkConf.set(FileOutputFormat.OUTDIR, outputPath);
  DataSinkDescriptor unionSink = MROutput.createConfigBuilder(unionSinkConf, null).build();
  checkerVertex.addDataSink("union", unionSink);

  // Sink "all-parts" on the checker.
  Configuration allPartsConf = new Configuration(tezConf);
  DataSinkDescriptor allPartsSink = MROutput.createConfigBuilder(allPartsConf,
      TextOutputFormat.class, outputPath + "-all-parts").build();
  checkerVertex.addDataSink("all-parts", allPartsSink);

  // Sink "parts" on the vertex group formed by map1 and map2.
  Configuration partsConf = new Configuration(tezConf);
  DataSinkDescriptor partsSink = MROutput.createConfigBuilder(partsConf,
      TextOutputFormat.class, outputPath + "-parts").build();
  VertexGroup unionVertex = dag.createVertexGroup("union", mapVertex1, mapVertex2);
  unionVertex.addDataSink("parts", partsSink);

  OrderedPartitionedKVEdgeConfig shuffleConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName())
      .build();

  dag.addVertex(mapVertex1)
      .addVertex(mapVertex2)
      .addVertex(mapVertex3)
      .addVertex(checkerVertex)
      .addEdge(
          Edge.create(mapVertex3, checkerVertex, shuffleConf.createDefaultEdgeProperty()))
      .addEdge(
          GroupInputEdge.create(unionVertex, checkerVertex,
              shuffleConf.createDefaultEdgeProperty(),
              InputDescriptor.create(ConcatenatedMergedKeyValuesInput.class.getName())));
  return dag;
}

Example #11

Source File: FilterByWordInputProcessor.java From tez with Apache License 2.0

4 votes

/**
 * Reads records from the single {@code MRInputLegacy} input and writes every value that
 * contains {@code filterWord} to the single {@code UnorderedKVOutput}, paired with the source
 * file name and the record key.
 *
 * <p>The input type check tests for {@code MRInputLegacy}, the type the input is cast to, so
 * an unsupported input is reported as an {@code IllegalStateException} rather than failing
 * later with a {@code ClassCastException}.
 *
 * @param _inputs logical inputs; exactly one is expected
 * @param _outputs logical outputs; exactly one is expected
 * @throws IllegalStateException if the number or the type of the inputs/outputs is unsupported
 * @throws Exception propagated from starting, reading or writing the inputs/outputs
 */
@Override
public void run(Map<String, LogicalInput> _inputs,
    Map<String, LogicalOutput> _outputs) throws Exception {
  this.inputs = _inputs;
  this.outputs = _outputs;
  this.progressHelper = new ProgressHelper(this.inputs, getContext(),this.getClass().getSimpleName());
  if (_inputs.size() != 1) {
    throw new IllegalStateException("FilterByWordInputProcessor processor can only work with a single input");
  }

  if (_outputs.size() != 1) {
    throw new IllegalStateException("FilterByWordInputProcessor processor can only work with a single output");
  }

  for (LogicalInput input : _inputs.values()) {
    input.start();
  }
  for (LogicalOutput output : _outputs.values()) {
    output.start();
  }

  LogicalInput li = _inputs.values().iterator().next();
  // init() and getConfigUpdates() are called on MRInputLegacy below, so check for that type.
  if (! (li instanceof MRInputLegacy)) {
    throw new IllegalStateException("FilterByWordInputProcessor processor can only work with MRInputLegacy");
  }

  LogicalOutput lo = _outputs.values().iterator().next();
  if (! (lo instanceof UnorderedKVOutput)) {
    throw new IllegalStateException("FilterByWordInputProcessor processor can only work with UnorderedKVOutput");
  }
  progressHelper.scheduleProgressTaskService(0, 100);
  MRInputLegacy mrInput = (MRInputLegacy) li;
  mrInput.init();
  UnorderedKVOutput kvOutput = (UnorderedKVOutput) lo;

  // Use the input file name from the config updates when available, else a placeholder.
  Configuration updatedConf = mrInput.getConfigUpdates();
  Text srcFile = new Text();
  srcFile.set("UNKNOWN_FILENAME_IN_PROCESSOR");
  if (updatedConf != null) {
    String fileName = updatedConf.get(MRJobConfig.MAP_INPUT_FILE);
    if (fileName != null) {
      LOG.info("Processing file: " + fileName);
      srcFile.set(fileName);
    }
  }

  KeyValueReader kvReader = mrInput.getReader();
  KeyValueWriter kvWriter = kvOutput.getWriter();

  while (kvReader.next()) {
    Object key = kvReader.getCurrentKey();
    Object val = kvReader.getCurrentValue();

    Text valText = (Text) val;
    String readVal = valText.toString();
    if (readVal.contains(filterWord)) {
      LongWritable lineNum = (LongWritable) key;
      TextLongPair outVal = new TextLongPair(srcFile, lineNum);
      kvWriter.write(valText, outVal);
    }
  }
}

Example #12

Source File: OrderedWordCount.java From tez with Apache License 2.0

4 votes

/**
 * Builds the ordered word-count DAG: tokenizer, summation and sorter vertices connected by
 * two ordered, hash-partitioned edges.
 *
 * @param tezConf base configuration; also applied to both edge configurations
 * @param inputPath input location given to the MRInput config builder
 * @param outputPath output location given to the MROutput config builder
 * @param numPartitions task count of the summation vertex
 * @param disableSplitGrouping when true, split grouping is turned off on the data source
 * @param isGenerateSplitInClient when true, splits are not generated in the AM
 * @param dagName name given to the DAG
 * @return the assembled DAG
 * @throws IOException declared by the signature
 */
public static DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath,
    int numPartitions, boolean disableSplitGrouping, boolean isGenerateSplitInClient, String dagName) throws IOException {

  boolean groupSplits = !disableSplitGrouping;
  boolean splitsInAM = !isGenerateSplitInClient;
  DataSourceDescriptor textSource = MRInput
      .createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath)
      .groupSplits(groupSplits)
      .generateSplitsInAM(splitsInAM)
      .build();

  DataSinkDescriptor textSink = MROutput
      .createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath)
      .build();

  ProcessorDescriptor tokenProcessor = ProcessorDescriptor.create(TokenProcessor.class.getName());
  Vertex tokenizerVertex = Vertex.create(TOKENIZER, tokenProcessor);
  tokenizerVertex.addDataSource(INPUT, textSource);

  // Text key / IntWritable value: the counts of one word end up in the same partition.
  // setFromConfiguration is optional; it lets command line parameters override the
  // config options.
  OrderedPartitionedKVEdgeConfig wordToCountEdge = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();

  // Reads intermediate data from an input edge and writes intermediate data to an
  // output edge.
  ProcessorDescriptor sumProcessor = ProcessorDescriptor.create(SumProcessor.class.getName());
  Vertex summationVertex = Vertex.create(SUMMATION, sumProcessor, numPartitions);

  // IntWritable key / Text value: words with the same count end up in the same
  // partition, ordered by count and grouped by count. setFromConfiguration is optional
  // here as well.
  OrderedPartitionedKVEdgeConfig countToWordEdge = OrderedPartitionedKVEdgeConfig
      .newBuilder(IntWritable.class.getName(), Text.class.getName(),
          HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();

  // A single task gathers all data in one place for a global sort order; with one
  // partition the NoOpSorter is enough to produce globally ordered output.
  ProcessorDescriptor sortProcessor = ProcessorDescriptor.create(NoOpSorter.class.getName());
  Vertex sorterVertex = Vertex.create(SORTER, sortProcessor, 1);
  sorterVertex.addDataSink(OUTPUT, textSink);

  // The jar containing this class is assumed to be part of the tez jars, so it is not added.

  DAG dag = DAG.create(dagName);
  dag.addVertex(tokenizerVertex)
      .addVertex(summationVertex)
      .addVertex(sorterVertex)
      .addEdge(
          Edge.create(tokenizerVertex, summationVertex,
              wordToCountEdge.createDefaultEdgeProperty()))
      .addEdge(
          Edge.create(summationVertex, sorterVertex,
              countToWordEdge.createDefaultEdgeProperty()));
  return dag;
}

Example #13

Source File: JoinValidate.java From tez with Apache License 2.0

4 votes

/**
 * Builds the join-validation DAG: an lhs and an rhs forwarding vertex, each reading one text
 * input, both feeding the "joinvalidate" vertex through the same edge configuration.
 *
 * @param tezConf base configuration for the data sources and the edge configuration
 * @param lhs path read by the lhs vertex
 * @param rhs path read by the rhs vertex
 * @param numPartitions task count of the "joinvalidate" vertex
 * @return the assembled DAG
 * @throws IOException declared by the signature
 */
@VisibleForTesting
DAG createDag(TezConfiguration tezConf, Path lhs, Path rhs, int numPartitions)
    throws IOException {
  DAG dag = DAG.create(getDagName());
  if (getDefaultExecutionContext() != null) {
    dag.setExecutionContext(getDefaultExecutionContext());
  }

  // Intermediate-output configuration shared by both input vertices. It should only set
  // selected keys from the underlying conf; to be fixed once there is a better mechanism
  // to configure the IOs. setFromConfiguration is optional and lets command line
  // parameters override the config options.
  OrderedPartitionedKVEdgeConfig sharedEdgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), NullWritable.class.getName(),
          HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();

  ProcessorDescriptor lhsProcessor =
      ProcessorDescriptor.create(ForwardingProcessor.class.getName());
  Vertex lhsVertex = Vertex.create(LHS_INPUT_NAME, lhsProcessor).addDataSource("lhs",
      MRInput
          .createConfigBuilder(new Configuration(tezConf), TextInputFormat.class,
              lhs.toUri().toString())
          .groupSplits(!isDisableSplitGrouping())
          .generateSplitsInAM(!isGenerateSplitInClient())
          .build());
  setVertexExecutionContext(lhsVertex, getLhsExecutionContext());

  ProcessorDescriptor rhsProcessor =
      ProcessorDescriptor.create(ForwardingProcessor.class.getName());
  Vertex rhsVertex = Vertex.create(RHS_INPUT_NAME, rhsProcessor).addDataSource("rhs",
      MRInput
          .createConfigBuilder(new Configuration(tezConf), TextInputFormat.class,
              rhs.toUri().toString())
          .groupSplits(!isDisableSplitGrouping())
          .generateSplitsInAM(!isGenerateSplitInClient())
          .build());
  setVertexExecutionContext(rhsVertex, getRhsExecutionContext());

  ProcessorDescriptor validateProcessor =
      ProcessorDescriptor.create(JoinValidateProcessor.class.getName());
  Vertex joinValidateVertex = Vertex.create("joinvalidate", validateProcessor, numPartitions);
  setVertexExecutionContext(joinValidateVertex, getValidateExecutionContext());

  Edge lhsEdge =
      Edge.create(lhsVertex, joinValidateVertex, sharedEdgeConf.createDefaultEdgeProperty());
  Edge rhsEdge =
      Edge.create(rhsVertex, joinValidateVertex, sharedEdgeConf.createDefaultEdgeProperty());

  dag.addVertex(lhsVertex)
      .addVertex(rhsVertex)
      .addVertex(joinValidateVertex)
      .addEdge(lhsEdge)
      .addEdge(rhsEdge);
  return dag;
}

Example #14

Source File: FilterByWordInputProcessor.java From incubator-tez with Apache License 2.0

4 votes

/**
 * Reads records from the single {@code MRInputLegacy} input and writes every value that
 * contains {@code filterWord} to the single {@code OnFileUnorderedKVOutput}, paired with the
 * source file name and the record key.
 *
 * <p>The input type check tests for {@code MRInputLegacy}, the type the input is cast to, so
 * an unsupported input is reported as an {@code IllegalStateException} rather than failing
 * later with a {@code ClassCastException}.
 *
 * @param inputs logical inputs; exactly one is expected
 * @param outputs logical outputs; exactly one is expected
 * @throws IllegalStateException if the number or the type of the inputs/outputs is unsupported
 * @throws Exception propagated from starting, reading or writing the inputs/outputs
 */
@Override
public void run(Map<String, LogicalInput> inputs,
    Map<String, LogicalOutput> outputs) throws Exception {

  if (inputs.size() != 1) {
    throw new IllegalStateException("FilterByWordInputProcessor processor can only work with a single input");
  }

  if (outputs.size() != 1) {
    throw new IllegalStateException("FilterByWordInputProcessor processor can only work with a single output");
  }

  for (LogicalInput input : inputs.values()) {
    input.start();
  }
  for (LogicalOutput output : outputs.values()) {
    output.start();
  }

  LogicalInput li = inputs.values().iterator().next();
  // init() and getConfigUpdates() are called on MRInputLegacy below, so check for that type.
  if (! (li instanceof MRInputLegacy)) {
    throw new IllegalStateException("FilterByWordInputProcessor processor can only work with MRInputLegacy");
  }

  LogicalOutput lo = outputs.values().iterator().next();
  if (! (lo instanceof OnFileUnorderedKVOutput)) {
    throw new IllegalStateException("FilterByWordInputProcessor processor can only work with OnFileUnorderedKVOutput");
  }

  MRInputLegacy mrInput = (MRInputLegacy) li;
  mrInput.init();
  OnFileUnorderedKVOutput kvOutput = (OnFileUnorderedKVOutput) lo;

  // Use the input file name from the config updates when available, else a placeholder.
  Configuration updatedConf = mrInput.getConfigUpdates();
  Text srcFile = new Text();
  srcFile.set("UNKNOWN_FILENAME_IN_PROCESSOR");
  if (updatedConf != null) {
    String fileName = updatedConf.get(MRJobConfig.MAP_INPUT_FILE);
    if (fileName != null) {
      LOG.info("Processing file: " + fileName);
      srcFile.set(fileName);
    }
  }

  KeyValueReader kvReader = mrInput.getReader();
  KeyValueWriter kvWriter = kvOutput.getWriter();

  while (kvReader.next()) {
    Object key = kvReader.getCurrentKey();
    Object val = kvReader.getCurrentValue();

    Text valText = (Text) val;
    String readVal = valText.toString();
    if (readVal.contains(filterWord)) {
      LongWritable lineNum = (LongWritable) key;
      TextLongPair outVal = new TextLongPair(srcFile, lineNum);
      kvWriter.write(valText, outVal);
    }
  }
}

Example #15

Source File: IntersectValidate.java From incubator-tez with Apache License 2.0

4 votes

/**
 * Builds the "IntersectValidate" DAG: an lhs and an rhs forwarding vertex, each with one
 * MRInput, both feeding the "intersectvalidate" vertex through the same edge configuration.
 *
 * @param tezConf base configuration for the inputs and for resource lookup
 * @param lhs path set as the input directory of the lhs input
 * @param rhs path set as the input directory of the rhs input
 * @param numPartitions task count of the "intersectvalidate" vertex
 * @return the assembled DAG
 * @throws IOException declared by the signature
 */
private DAG createDag(TezConfiguration tezConf, Path lhs, Path rhs, int numPartitions)
    throws IOException {
  DAG dag = new DAG("IntersectValidate");

  // Payload for the lhs source.
  Configuration lhsInputConf = new Configuration(tezConf);
  lhsInputConf.set(FileInputFormat.INPUT_DIR, lhs.toUri().toString());
  byte[] lhsInputPayload = MRInput.createUserPayload(lhsInputConf,
      TextInputFormat.class.getName(), true, false);

  // Payload for the rhs source.
  Configuration rhsInputConf = new Configuration(tezConf);
  rhsInputConf.set(FileInputFormat.INPUT_DIR, rhs.toUri().toString());
  byte[] rhsInputPayload = MRInput.createUserPayload(rhsInputConf,
      TextInputFormat.class.getName(), true, false);

  // Intermediate-output configuration shared by both input vertices. It should only set
  // selected keys from the underlying conf; to be fixed once there is a better mechanism
  // to configure the IOs.
  OrderedPartitionedKVEdgeConfigurer sharedEdgeConf = OrderedPartitionedKVEdgeConfigurer
      .newBuilder(Text.class.getName(), NullWritable.class.getName(),
          HashPartitioner.class.getName(), null)
      .build();

  // TODO carried over from the original: change the way resources are set up - no MRHelpers.
  ProcessorDescriptor lhsProcessor =
      new ProcessorDescriptor(ForwardingProcessor.class.getName());
  Vertex lhsVertex = new Vertex(LHS_INPUT_NAME, lhsProcessor, -1,
      MRHelpers.getMapResource(tezConf))
      .addInput("lhs",
          new InputDescriptor(MRInput.class.getName()).setUserPayload(lhsInputPayload),
          MRInputAMSplitGenerator.class);

  ProcessorDescriptor rhsProcessor =
      new ProcessorDescriptor(ForwardingProcessor.class.getName());
  Vertex rhsVertex = new Vertex(RHS_INPUT_NAME, rhsProcessor, -1,
      MRHelpers.getMapResource(tezConf))
      .addInput("rhs",
          new InputDescriptor(MRInput.class.getName()).setUserPayload(rhsInputPayload),
          MRInputAMSplitGenerator.class);

  ProcessorDescriptor validateProcessor =
      new ProcessorDescriptor(IntersectValidateProcessor.class.getName());
  Vertex intersectValidateVertex = new Vertex("intersectvalidate", validateProcessor,
      numPartitions, MRHelpers.getReduceResource(tezConf));

  Edge lhsEdge =
      new Edge(lhsVertex, intersectValidateVertex, sharedEdgeConf.createDefaultEdgeProperty());
  Edge rhsEdge =
      new Edge(rhsVertex, intersectValidateVertex, sharedEdgeConf.createDefaultEdgeProperty());

  dag.addVertex(lhsVertex)
      .addVertex(rhsVertex)
      .addVertex(intersectValidateVertex)
      .addEdge(lhsEdge)
      .addEdge(rhsEdge);
  return dag;
}

Example #16

Source File: IntersectExample.java From incubator-tez with Apache License 2.0

4 votes

/**
 * Assembles the "IntersectExample" DAG.
 *
 * <p>Two vertices running {@code ForwardingProcessor} ("partitioner1" and "partitioner2")
 * each read one text input through an {@code MRInput}. Both are connected, through an
 * unordered partitioned key/value edge using {@code HashPartitioner}, to a single
 * "intersect" vertex running {@code IntersectProcessor}, which writes through an
 * {@code MROutput} named "finalOutput".
 *
 * @param tezConf base configuration; a fresh copy is made for every input/output payload
 * @param streamPath input location read by the "partitioner1" vertex
 * @param hashPath input location read by the "partitioner2" vertex
 * @param outPath output location written by the "intersect" vertex
 * @param numPartitions parallelism passed to the "intersect" vertex
 * @return the assembled DAG (three vertices, two edges)
 * @throws IOException if one of the Tez/MapReduce helper calls used here fails
 */
private DAG createDag(TezConfiguration tezConf, Path streamPath, Path hashPath, Path outPath,
    int numPartitions) throws IOException {
  DAG intersectDag = new DAG("IntersectExample");

  // Payload for the first source (consumed by "partitioner1").
  Configuration streamConf = new Configuration(tezConf);
  String streamDir = streamPath.toUri().toString();
  streamConf.set(FileInputFormat.INPUT_DIR, streamDir);
  byte[] streamPayload = MRInput.createUserPayload(
      streamConf, TextInputFormat.class.getName(), true, false);

  // Payload for the second source (consumed by "partitioner2").
  Configuration hashConf = new Configuration(tezConf);
  String hashDir = hashPath.toUri().toString();
  hashConf.set(FileInputFormat.INPUT_DIR, hashDir);
  byte[] hashPayload = MRInput.createUserPayload(
      hashConf, TextInputFormat.class.getName(), true, false);

  // Intermediate data movement, shared by both producer vertices.
  // Known limitation carried over from the original: this should only be setting selective
  // keys from the underlying conf; revisit once there is a better mechanism to configure IOs.
  UnorderedPartitionedKVEdgeConfigurer shuffleConf = UnorderedPartitionedKVEdgeConfigurer
      .newBuilder(
          Text.class.getName(),
          NullWritable.class.getName(),
          HashPartitioner.class.getName(),
          null)
      .build();

  // Payload for the final sink attached to the "intersect" vertex.
  Configuration sinkConf = new Configuration(tezConf);
  String sinkDir = outPath.toUri().toString();
  sinkConf.set(FileOutputFormat.OUTDIR, sinkDir);
  byte[] sinkPayload = MROutput.createUserPayload(
      sinkConf, TextOutputFormat.class.getName(), true);

  // TODO (carried over from the original): change the way resources are set up - no MRHelpers.
  Vertex streamVertex = new Vertex("partitioner1",
      new ProcessorDescriptor(ForwardingProcessor.class.getName()), -1,
      MRHelpers.getMapResource(tezConf));
  streamVertex.addInput("streamfile",
      new InputDescriptor(MRInput.class.getName()).setUserPayload(streamPayload),
      MRInputAMSplitGenerator.class);

  Vertex hashVertex = new Vertex("partitioner2",
      new ProcessorDescriptor(ForwardingProcessor.class.getName()), -1,
      MRHelpers.getMapResource(tezConf));
  hashVertex.addInput("hashfile",
      new InputDescriptor(MRInput.class.getName()).setUserPayload(hashPayload),
      MRInputAMSplitGenerator.class);

  Vertex joinVertex = new Vertex("intersect",
      new ProcessorDescriptor(IntersectProcessor.class.getName()), numPartitions,
      MRHelpers.getReduceResource(tezConf));
  joinVertex.addOutput("finalOutput",
      new OutputDescriptor(MROutput.class.getName()).setUserPayload(sinkPayload),
      MROutputCommitter.class);

  // Each producer gets its own edge (and its own edge property instance) into the consumer.
  Edge streamToJoin = new Edge(streamVertex, joinVertex,
      shuffleConf.createDefaultEdgeProperty());
  Edge hashToJoin = new Edge(hashVertex, joinVertex,
      shuffleConf.createDefaultEdgeProperty());

  intersectDag.addVertex(streamVertex)
      .addVertex(hashVertex)
      .addVertex(joinVertex)
      .addEdge(streamToJoin)
      .addEdge(hashToJoin);
  return intersectDag;
}

Example #17

Source File: UnionExample.java From incubator-tez with Apache License 2.0

4 votes

/**
 * Builds the "UnionExample" DAG.
 *
 * <p>Topology, as wired below:
 * <ul>
 *   <li>"map1", "map2" and "map3" run {@code TokenProcessor} and share one {@code MRInput}
 *       descriptor reading {@code inputPath}.</li>
 *   <li>"map1" and "map2" form the vertex group "union", which has its own output "parts"
 *       written to {@code outputPath + "-parts"}.</li>
 *   <li>"checker" runs {@code UnionProcessor} with a parallelism of 1 and has two outputs:
 *       "union" ({@code outputPath}) and "all-parts" ({@code outputPath + "-all-parts"}).</li>
 *   <li>"map3" feeds "checker" through a plain edge; the vertex group feeds it through a
 *       {@code GroupInputEdge} merged with {@code ConcatenatedMergedKeyValuesInput}.</li>
 * </ul>
 *
 * <p>The {@code fs}, {@code localResources} and {@code stagingDir} parameters are not
 * referenced by this method body.
 *
 * @param fs unused here
 * @param tezConf base configuration; copied for every input/output payload
 * @param localResources unused here
 * @param stagingDir unused here
 * @param inputPath value stored under {@code FileInputFormat.INPUT_DIR}
 * @param outputPath base value stored under {@code FileOutputFormat.OUTDIR}
 * @return the assembled DAG
 * @throws IOException if one of the Tez/MapReduce helper calls used here fails
 */
private DAG createDAG(FileSystem fs, TezConfiguration tezConf,
    Map<String, LocalResource> localResources, Path stagingDir,
    String inputPath, String outputPath) throws IOException {
  DAG unionDag = new DAG("UnionExample");

  // Every map vertex is created with -1 in the parallelism slot.
  int mapParallelism = -1;

  // One input descriptor, reused by all three map vertices.
  Configuration sourceConf = new Configuration(tezConf);
  sourceConf.set(FileInputFormat.INPUT_DIR, inputPath);
  byte[] sourcePayload = MRInput.createUserPayload(
      sourceConf, TextInputFormat.class.getName(), true, true);
  InputDescriptor sourceDescriptor =
      new InputDescriptor(MRInput.class.getName()).setUserPayload(sourcePayload);

  Vertex firstMapper = new Vertex("map1",
      new ProcessorDescriptor(TokenProcessor.class.getName()),
      mapParallelism, MRHelpers.getMapResource(tezConf))
      .addInput("MRInput", sourceDescriptor, MRInputAMSplitGenerator.class);

  Vertex secondMapper = new Vertex("map2",
      new ProcessorDescriptor(TokenProcessor.class.getName()),
      mapParallelism, MRHelpers.getMapResource(tezConf))
      .addInput("MRInput", sourceDescriptor, MRInputAMSplitGenerator.class);

  Vertex thirdMapper = new Vertex("map3",
      new ProcessorDescriptor(TokenProcessor.class.getName()),
      mapParallelism, MRHelpers.getMapResource(tezConf))
      .addInput("MRInput", sourceDescriptor, MRInputAMSplitGenerator.class);

  Vertex checker = new Vertex("checker",
      new ProcessorDescriptor(UnionProcessor.class.getName()),
      1, MRHelpers.getReduceResource(tezConf));

  // Checker output #1: "union" -> outputPath.
  Configuration unionSinkConf = new Configuration(tezConf);
  unionSinkConf.set(FileOutputFormat.OUTDIR, outputPath);
  byte[] unionSinkPayload = MROutput.createUserPayload(
      unionSinkConf, TextOutputFormat.class.getName(), true);
  checker.addOutput("union",
      new OutputDescriptor(MROutput.class.getName()).setUserPayload(unionSinkPayload),
      MROutputCommitter.class);

  // Checker output #2: "all-parts" -> outputPath + "-all-parts".
  Configuration allPartsSinkConf = new Configuration(tezConf);
  allPartsSinkConf.set(FileOutputFormat.OUTDIR, outputPath + "-all-parts");
  byte[] allPartsSinkPayload = MROutput.createUserPayload(
      allPartsSinkConf, TextOutputFormat.class.getName(), true);
  checker.addOutput("all-parts",
      new OutputDescriptor(MROutput.class.getName()).setUserPayload(allPartsSinkPayload),
      MROutputCommitter.class);

  // Output attached to the vertex group itself: "parts" -> outputPath + "-parts".
  Configuration partsSinkConf = new Configuration(tezConf);
  partsSinkConf.set(FileOutputFormat.OUTDIR, outputPath + "-parts");

  // Only map1 and map2 belong to the group; map3 is wired to the checker separately.
  VertexGroup mapperGroup = unionDag.createVertexGroup("union", firstMapper, secondMapper);
  byte[] partsSinkPayload = MROutput.createUserPayload(
      partsSinkConf, TextOutputFormat.class.getName(), true);
  mapperGroup.addOutput("parts",
      new OutputDescriptor(MROutput.class.getName()).setUserPayload(partsSinkPayload),
      MROutputCommitter.class);

  // Ordered, hash-partitioned (Text, IntWritable) edge configuration used for both edges.
  OrderedPartitionedKVEdgeConfigurer shuffleConf = OrderedPartitionedKVEdgeConfigurer
      .newBuilder(
          Text.class.getName(),
          IntWritable.class.getName(),
          HashPartitioner.class.getName(),
          null)
      .build();

  unionDag.addVertex(firstMapper)
      .addVertex(secondMapper)
      .addVertex(thirdMapper)
      .addVertex(checker)
      .addEdge(new Edge(thirdMapper, checker, shuffleConf.createDefaultEdgeProperty()))
      .addEdge(new GroupInputEdge(mapperGroup, checker,
          shuffleConf.createDefaultEdgeProperty(),
          new InputDescriptor(ConcatenatedMergedKeyValuesInput.class.getName())));
  return unionDag;
}

Example #18

Source File: MapProcessor.java From incubator-tez with Apache License 2.0

4 votes

/**
 * Wraps the given {@code MRInput} and stores the reader it hands out.
 *
 * @param in the input to wrap
 * @throws IOException if {@code in.getReader()} fails
 */
private NewRecordReader(MRInput in) throws IOException {
  this.in = in;
  // Fetch the reader once, at construction time, from the input just stored.
  this.reader = this.in.getReader();
}

Example #19

Source File: WordCount.java From tez with Apache License 2.0

4 votes

/**
 * Builds the two-vertex "WordCount" DAG: a tokenizer vertex that reads text from
 * {@code inputPath}, connected by an ordered, hash-partitioned edge to a summation vertex
 * that writes text to {@code outputPath}.
 *
 * <p>No jar is added for this class because it is assumed to be part of the Tez jars;
 * otherwise the jars for this code would have to be added as local files to the vertices.
 *
 * @param tezConf base configuration; copied for the data source and the data sink, and
 *     also applied to the edge configuration via {@code setFromConfiguration}
 * @param inputPath location of the text to read
 * @param outputPath location the summation vertex writes to
 * @param numPartitions parallelism of the summation vertex
 * @return the assembled DAG
 * @throws IOException if one of the Tez/MapReduce helper calls used here fails
 */
private DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath,
    int numPartitions) throws IOException {

  // Describe the input data to Tez: MRInput reads text from inputPath using TextInputFormat.
  // Split grouping and AM-side split generation follow the corresponding settings of this
  // class.
  DataSourceDescriptor textSource = MRInput
      .createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath)
      .groupSplits(!isDisableSplitGrouping())
      .generateSplitsInAM(!isGenerateSplitInClient())
      .build();

  // Describe the output data to Tez: MROutput writes text to outputPath using
  // TextOutputFormat.
  DataSinkDescriptor textSink = MROutput
      .createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath)
      .build();

  // Producer vertex: reads from the data source and tokenizes it with TokenProcessor. No
  // parallelism is given here; the number of tasks is decided using the information
  // provided by the data source descriptor.
  Vertex tokenizer = Vertex.create(TOKENIZER,
      ProcessorDescriptor.create(TokenProcessor.class.getName()));
  tokenizer.addDataSource(INPUT, textSource);

  // Edge between the tokenizer (producer) and the summation (consumer). To sum in parallel,
  // the tokenized data is partitioned by word so that a given word always lands in the same
  // partition, and the counts are grouped together per word. OrderedPartitionedKVEdgeConfig
  // is a helper that sets up a matching Tez input/output pair handling this partitioning and
  // grouping of key-value data. The key type is Text (the word), the value type is
  // IntWritable (the count), and a hash-based partitioner is used. The same edge could be
  // configured without this helper by configuring the input and output individually. The
  // setFromConfiguration call is optional; it allows overriding the config options with
  // command line parameters.
  OrderedPartitionedKVEdgeConfig shuffleConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(
          Text.class.getName(),
          IntWritable.class.getName(),
          HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();

  // Consumer vertex: reads the tokenized data and computes the sums with SumProcessor. Its
  // task count depends on the number of partitions used to distribute the sum processing,
  // which is made configurable through the numPartitions parameter.
  Vertex summation = Vertex.create(SUMMATION,
      ProcessorDescriptor.create(SumProcessor.class.getName()), numPartitions);
  summation.addDataSink(OUTPUT, textSink);

  // Create the DAG, add both vertices and connect producer to consumer via the edge.
  return DAG.create("WordCount")
      .addVertex(tokenizer)
      .addVertex(summation)
      .addEdge(Edge.create(tokenizer, summation, shuffleConf.createDefaultEdgeProperty()));
}

Example #20

Source File: CartesianProduct.java From tez with Apache License 2.0

4 votes

/**
 * Builds the "CartesianProduct" DAG: three {@code TokenProcessor} vertices, each reading
 * one text input, feed a single {@code JoinProcessor} vertex that writes text output.
 *
 * <p>The first two source vertices are connected to the join vertex through a custom edge
 * managed by {@code CartesianProductEdgeManager}; the third is connected through a
 * broadcast edge. The join vertex uses {@code CartesianProductVertexManager}, configured
 * with the same user payload as the edge manager.
 *
 * @param tezConf base configuration; copied for every data source and for the data sink
 * @param inputPath1 text input of the first source vertex
 * @param inputPath2 text input of the second source vertex
 * @param inputPath3 text input of the third (broadcast) source vertex
 * @param outputPath location the join vertex writes to
 * @param isPartitioned when {@code true}, the cartesian product is configured from a
 *     per-source partition count and uses an unordered partitioned edge with
 *     {@code CustomPartitioner}; otherwise it is configured from the source list alone and
 *     uses a plain unordered edge
 * @return the assembled DAG
 * @throws IOException if one of the Tez/MapReduce helper calls used here fails
 */
private DAG createDAG(TezConfiguration tezConf, String inputPath1, String inputPath2,
                      String inputPath3, String outputPath, boolean isPartitioned)
  throws IOException {
  // Split grouping is turned off for every source so that each input file incurs one task.
  Vertex firstSource =
    Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName()))
      .addDataSource(INPUT,
        MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath1)
          .groupSplits(false).build());
  Vertex secondSource =
    Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName()))
      .addDataSource(INPUT,
        MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath2)
          .groupSplits(false).build());
  Vertex broadcastSource =
    Vertex.create(VERTEX3, ProcessorDescriptor.create(TokenProcessor.class.getName()))
      .addDataSource(INPUT,
        MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath3)
          .groupSplits(false).build());

  // Cartesian product configuration, built from the class-level cpSources
  // (presumably the names of the source vertices taking part in the product - confirm).
  CartesianProductConfig productConfig;
  if (!isPartitioned) {
    productConfig = new CartesianProductConfig(Arrays.asList(cpSources));
  } else {
    // Every source gets the same partition count.
    Map<String, Integer> partitionsPerSource = new HashMap<>();
    for (String source : cpSources) {
      partitionsPerSource.put(source, numPartition);
    }
    productConfig = new CartesianProductConfig(partitionsPerSource);
  }
  // The same payload configures both the vertex manager and the edge manager.
  UserPayload productPayload = productConfig.toUserPayload(tezConf);

  Vertex joiner = Vertex.create(VERTEX4, ProcessorDescriptor.create(JoinProcessor.class.getName()));
  joiner.addDataSink(OUTPUT,
    MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath)
      .build());
  joiner.setVertexManagerPlugin(
    VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName())
      .setUserPayload(productPayload));

  EdgeManagerPluginDescriptor productEdgeManager =
    EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
  productEdgeManager.setUserPayload(productPayload);

  // Edge property shared by the two edges that take part in the cartesian product.
  EdgeProperty productEdgeProperty;
  if (!isPartitioned) {
    UnorderedKVEdgeConfig unpartitionedConf =
      UnorderedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName()).build();
    productEdgeProperty = unpartitionedConf.createDefaultCustomEdgeProperty(productEdgeManager);
  } else {
    UnorderedPartitionedKVEdgeConfig partitionedConf =
      UnorderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(),
        IntWritable.class.getName(), CustomPartitioner.class.getName()).build();
    productEdgeProperty = partitionedConf.createDefaultCustomEdgeProperty(productEdgeManager);
  }

  // The third source is broadcast to the join vertex rather than joined via the product edge.
  EdgeProperty broadcastProperty =
    UnorderedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName())
      .build()
      .createDefaultBroadcastEdgeProperty();

  DAG productDag = DAG.create("CartesianProduct");
  productDag
    .addVertex(firstSource).addVertex(secondSource).addVertex(broadcastSource).addVertex(joiner)
    .addEdge(Edge.create(firstSource, joiner, productEdgeProperty))
    .addEdge(Edge.create(secondSource, joiner, productEdgeProperty))
    .addEdge(Edge.create(broadcastSource, joiner, broadcastProperty));
  return productDag;
}

Example #21