org.apache.tez.mapreduce.output.MROutput Java Examples

The following examples show how to use org.apache.tez.mapreduce.output.MROutput. Each example is taken from an open-source project; the source file, project, and license are noted above the code.
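Before the individual examples, here is a minimal sketch of the pattern most of them follow: on the DAG-construction side, MROutput.createConfigBuilder wraps a Hadoop OutputFormat and an output path into a DataSinkDescriptor that is attached to a vertex; on the task side, the processor casts its LogicalOutput to MROutput and writes through a KeyValueWriter. The sketch is illustrative only; MyProcessor, the sink name "Output", outputPath and tezConf are hypothetical placeholders, not taken from the projects below.

// DAG-construction side: wrap TextOutputFormat and an output path into a data sink
// and attach it to the vertex that produces the final output.
// (MyProcessor, outputPath and tezConf are placeholders for illustration.)
DataSinkDescriptor dataSink = MROutput.createConfigBuilder(new Configuration(tezConf),
    TextOutputFormat.class, outputPath).build();
Vertex writerVertex = Vertex.create("writer",
    ProcessorDescriptor.create(MyProcessor.class.getName()), 1);
writerVertex.addDataSink("Output", dataSink);

// Task side, inside the processor's run(): fetch the single output, cast it to
// MROutput and write key/value pairs through its KeyValueWriter. Depending on the
// processor base class, output.start() may need to be called first (as in
// Examples #2 and #6); committing is normally driven by the framework for data
// sinks, but can be done explicitly via canCommit()/commit() as Example #9 shows.
MROutput out = (MROutput) getOutputs().values().iterator().next();
KeyValueWriter writer = out.getWriter();
writer.write(new Text("word"), new IntWritable(1));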
Example #1
Source File: TopKDataGen.java    From sequenceiq-samples with Apache License 2.0
private DAG createDag(TezConfiguration tezConf, Path outPath, long outSize, int extraColumns, int numTasks)
        throws IOException {

    long largeOutSizePerTask = outSize / numTasks;

    DAG dag = DAG.create("TopK DataGen");

    Vertex genDataVertex = Vertex.create("datagen", ProcessorDescriptor.create(
                    GenDataProcessor.class.getName()).setUserPayload(
                    UserPayload.create(ByteBuffer.wrap(GenDataProcessor.createConfiguration(largeOutSizePerTask, extraColumns)))),
            numTasks);
    genDataVertex.addDataSink(OUTPUT,
            MROutput.createConfigBuilder(new Configuration(tezConf),
                    TextOutputFormat.class, outPath.toUri().toString()).build());
    dag.addVertex(genDataVertex);

    return dag;
}
 
Example #2
Source File: PigProcessor.java    From spork with Apache License 2.0
private void initializeOutputs(Map<String, LogicalOutput> outputs) throws Exception {

    for (Entry<String, LogicalOutput> entry : outputs.entrySet()) {
        LogicalOutput output = entry.getValue();
        LOG.info("Starting output " + output + " to vertex " + entry.getKey());
        output.start();
        if (output instanceof MROutput) {
            MROutput mrOut = (MROutput) output;
            fileOutputs.add(mrOut);
        }
    }
    LinkedList<TezOutput> tezOuts = PlanHelper.getPhysicalOperators(execPlan, TezOutput.class);
    for (TezOutput tezOut : tezOuts) {
        tezOut.attachOutputs(outputs, conf);
    }
}
 
Example #3
Source File: POStoreTez.java    From spork with Apache License 2.0
@Override
public void attachOutputs(Map<String, LogicalOutput> outputs,
        Configuration conf)
        throws ExecException {
    LogicalOutput logicalOut = outputs.get(outputKey);
    if (logicalOut == null || !(logicalOut instanceof MROutput)) {
        throw new ExecException("POStoreTez only accepts MROutput. key =  "
                + getOperatorKey() + ", outputs = " + outputs);
    }
    output = (MROutput) logicalOut;
    try {
        writer = output.getWriter();
    } catch (IOException e) {
        throw new ExecException(e);
    }
}
 
Example #4
Source File: WordCount.java    From incubator-tez with Apache License 2.0
@Override
public void run() throws Exception {
  Preconditions.checkArgument(getInputs().size() == 1);
  MROutput out = (MROutput) getOutputs().values().iterator().next();
  KeyValueWriter kvWriter = out.getWriter();
  KeyValuesReader kvReader = (KeyValuesReader) getInputs().values().iterator().next()
      .getReader();
  while (kvReader.next()) {
    Text word = (Text) kvReader.getCurrentKey();
    int sum = 0;
    for (Object value : kvReader.getCurrentValues()) {
      sum += ((IntWritable) value).get();
    }
    kvWriter.write(word, new IntWritable(sum));
  }
}
 
Example #5
Source File: IntersectDataGen.java    From incubator-tez with Apache License 2.0
private DAG createDag(TezConfiguration tezConf, Path largeOutPath, Path smallOutPath,
    Path expectedOutputPath, int numTasks, long largeOutSize, long smallOutSize)
    throws IOException {

  long largeOutSizePerTask = largeOutSize / numTasks;
  long smallOutSizePerTask = smallOutSize / numTasks;

  DAG dag = new DAG("IntersectDataGen");

  byte[] streamOutputPayload = createPayloadForOutput(largeOutPath, tezConf);
  byte[] hashOutputPayload = createPayloadForOutput(smallOutPath, tezConf);
  byte[] expectedOutputPayload = createPayloadForOutput(expectedOutputPath, tezConf);

  Vertex genDataVertex = new Vertex("datagen", new ProcessorDescriptor(
      GenDataProcessor.class.getName()).setUserPayload(GenDataProcessor.createConfiguration(
      largeOutSizePerTask, smallOutSizePerTask)), numTasks, MRHelpers.getMapResource(tezConf));
  genDataVertex.addOutput(STREAM_OUTPUT_NAME,
      new OutputDescriptor(MROutput.class.getName()).setUserPayload(streamOutputPayload),
      MROutputCommitter.class);
  genDataVertex.addOutput(HASH_OUTPUT_NAME,
      new OutputDescriptor(MROutput.class.getName()).setUserPayload(hashOutputPayload),
      MROutputCommitter.class);
  genDataVertex.addOutput(EXPECTED_OUTPUT_NAME,
      new OutputDescriptor(MROutput.class.getName()).setUserPayload(expectedOutputPayload),
      MROutputCommitter.class);

  dag.addVertex(genDataVertex);

  return dag;
}
 
Example #6
Source File: FilterByWordOutputProcessor.java    From tez with Apache License 2.0
@Override
public void run() throws Exception {
  
  if (inputs.size() != 1) {
    throw new IllegalStateException("FilterByWordOutputProcessor processor can only work with a single input");
  }

  if (outputs.size() != 1) {
    throw new IllegalStateException("FilterByWordOutputProcessor processor can only work with a single output");
  }

  for (LogicalInput input : inputs.values()) {
    input.start();
  }
  for (LogicalOutput output : outputs.values()) {
    output.start();
  }

  LogicalInput li = inputs.values().iterator().next();
  if (! (li instanceof UnorderedKVInput)) {
    throw new IllegalStateException("FilterByWordOutputProcessor processor can only work with ShuffledUnorderedKVInput");
  }

  LogicalOutput lo = outputs.values().iterator().next();
  if (! (lo instanceof MROutput)) {
    throw new IllegalStateException("FilterByWordOutputProcessor processor can only work with MROutput");
  }

  UnorderedKVInput kvInput = (UnorderedKVInput) li;
  MROutput mrOutput = (MROutput) lo;

  KeyValueReader kvReader = kvInput.getReader();
  KeyValueWriter kvWriter = mrOutput.getWriter();
  while (kvReader.next()) {
    Object key = kvReader.getCurrentKey();
    Object value = kvReader.getCurrentValue();

    kvWriter.write(key, value);
  }
}
 
Example #7
Source File: JoinDataGen.java    From tez with Apache License 2.0
private DAG createDag(TezConfiguration tezConf, Path largeOutPath, Path smallOutPath,
    Path expectedOutputPath, int numTasks, long largeOutSize, long smallOutSize)
    throws IOException {

  long largeOutSizePerTask = largeOutSize / numTasks;
  long smallOutSizePerTask = smallOutSize / numTasks;

  DAG dag = DAG.create("JoinDataGen");

  Vertex genDataVertex = Vertex.create("datagen", ProcessorDescriptor.create(
      GenDataProcessor.class.getName()).setUserPayload(
      UserPayload.create(ByteBuffer.wrap(GenDataProcessor.createConfiguration(largeOutSizePerTask,
          smallOutSizePerTask)))), numTasks);
  genDataVertex.addDataSink(STREAM_OUTPUT_NAME, 
      MROutput.createConfigBuilder(new Configuration(tezConf),
          TextOutputFormat.class, largeOutPath.toUri().toString()).build());
  genDataVertex.addDataSink(HASH_OUTPUT_NAME, 
      MROutput.createConfigBuilder(new Configuration(tezConf),
          TextOutputFormat.class, smallOutPath.toUri().toString()).build());
  genDataVertex.addDataSink(EXPECTED_OUTPUT_NAME, 
      MROutput.createConfigBuilder(new Configuration(tezConf),
          TextOutputFormat.class, expectedOutputPath.toUri().toString()).build());

  dag.addVertex(genDataVertex);

  return dag;
}
 
Example #8
Source File: WordCount.java    From incubator-tez with Apache License 2.0
private DAG createDAG(FileSystem fs, TezConfiguration tezConf,
    Map<String, LocalResource> localResources, Path stagingDir,
    String inputPath, String outputPath) throws IOException {

  Configuration inputConf = new Configuration(tezConf);
  inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
  InputDescriptor id = new InputDescriptor(MRInput.class.getName())
      .setUserPayload(MRInput.createUserPayload(inputConf,
          TextInputFormat.class.getName(), true, true));

  Configuration outputConf = new Configuration(tezConf);
  outputConf.set(FileOutputFormat.OUTDIR, outputPath);
  OutputDescriptor od = new OutputDescriptor(MROutput.class.getName())
    .setUserPayload(MROutput.createUserPayload(
        outputConf, TextOutputFormat.class.getName(), true));

  Vertex tokenizerVertex = new Vertex("tokenizer", new ProcessorDescriptor(
      TokenProcessor.class.getName()), -1, MRHelpers.getMapResource(tezConf));
  tokenizerVertex.addInput("MRInput", id, MRInputAMSplitGenerator.class);

  Vertex summerVertex = new Vertex("summer",
      new ProcessorDescriptor(
          SumProcessor.class.getName()), 1, MRHelpers.getReduceResource(tezConf));
  summerVertex.addOutput("MROutput", od, MROutputCommitter.class);

  OrderedPartitionedKVEdgeConfigurer edgeConf = OrderedPartitionedKVEdgeConfigurer
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName(), null).build();

  DAG dag = new DAG("WordCount");
  dag.addVertex(tokenizerVertex)
      .addVertex(summerVertex)
      .addEdge(
          new Edge(tokenizerVertex, summerVertex, edgeConf.createDefaultEdgeProperty()));
  return dag;  
}
 
Example #9
Source File: IntersectExample.java    From incubator-tez with Apache License 2.0
@Override
public void run() throws Exception {
  Preconditions.checkState(getInputs().size() == 2);
  Preconditions.checkState(getOutputs().size() == 1);
  LogicalInput streamInput = getInputs().get("partitioner1");
  LogicalInput hashInput = getInputs().get("partitioner2");
  Reader rawStreamReader = streamInput.getReader();
  Reader rawHashReader = hashInput.getReader();
  Preconditions.checkState(rawStreamReader instanceof KeyValueReader);
  Preconditions.checkState(rawHashReader instanceof KeyValueReader);
  LogicalOutput lo = getOutputs().values().iterator().next();
  Preconditions.checkState(lo instanceof MROutput);
  MROutput output = (MROutput) lo;
  KeyValueWriter writer = output.getWriter();

  KeyValueReader hashKvReader = (KeyValueReader) rawHashReader;
  Set<Text> keySet = new HashSet<Text>();
  while (hashKvReader.next()) {
    keySet.add(new Text((Text) hashKvReader.getCurrentKey()));
  }

  KeyValueReader streamKvReader = (KeyValueReader) rawStreamReader;
  while (streamKvReader.next()) {
    Text key = (Text) streamKvReader.getCurrentKey();
    if (keySet.contains(key)) {
      writer.write(key, NullWritable.get());
    }
  }

  LOG.info("Completed Processing. Trying to commit");
  while (!getContext().canCommit()) {
    Thread.sleep(100l);
  }
  output.commit();
}
 
Example #10
Source File: PigProcessor.java    From spork with Apache License 2.0
private void abortOutput() {
    for (MROutput fileOutput : fileOutputs){
        try {
            fileOutput.abort();
        } catch (IOException e) {
            LOG.error("Encountered exception while aborting output", e);
        }
    }
}
 
Example #11
Source File: IntersectExample.java    From incubator-tez with Apache License 2.0
private DAG createDag(TezConfiguration tezConf, Path streamPath, Path hashPath, Path outPath,
    int numPartitions) throws IOException {
  DAG dag = new DAG("IntersectExample");

  // Configuration for src1
  Configuration streamInputConf = new Configuration(tezConf);
  streamInputConf.set(FileInputFormat.INPUT_DIR, streamPath.toUri().toString());
  byte[] streamInputPayload = MRInput.createUserPayload(streamInputConf,
      TextInputFormat.class.getName(), true, false);

  // Configuration for src2
  Configuration hashInputConf = new Configuration(tezConf);
  hashInputConf.set(FileInputFormat.INPUT_DIR, hashPath.toUri().toString());
  byte[] hashInputPayload = MRInput.createUserPayload(hashInputConf,
      TextInputFormat.class.getName(), true, false);

  // Configuration for intermediate output - shared by Vertex1 and Vertex2
  // This should only be setting selective keys from the underlying conf. Fix after there's a
  // better mechanism to configure the IOs.

  UnorderedPartitionedKVEdgeConfigurer edgeConf =
      UnorderedPartitionedKVEdgeConfigurer
          .newBuilder(Text.class.getName(), NullWritable.class.getName(),
              HashPartitioner.class.getName(), null).build();

  Configuration finalOutputConf = new Configuration(tezConf);
  finalOutputConf.set(FileOutputFormat.OUTDIR, outPath.toUri().toString());
  byte[] finalOutputPayload = MROutput.createUserPayload(finalOutputConf,
      TextOutputFormat.class.getName(), true);

  // Change the way resources are setup - no MRHelpers
  Vertex streamFileVertex = new Vertex("partitioner1",
      new ProcessorDescriptor(ForwardingProcessor.class.getName()), -1,
      MRHelpers.getMapResource(tezConf)).addInput("streamfile",
      new InputDescriptor(MRInput.class.getName())
          .setUserPayload(streamInputPayload), MRInputAMSplitGenerator.class);

  Vertex hashFileVertex = new Vertex("partitioner2", new ProcessorDescriptor(
      ForwardingProcessor.class.getName()), -1,
      MRHelpers.getMapResource(tezConf)).addInput("hashfile",
      new InputDescriptor(MRInput.class.getName())
          .setUserPayload(hashInputPayload), MRInputAMSplitGenerator.class);

  Vertex intersectVertex = new Vertex("intersect", new ProcessorDescriptor(
      IntersectProcessor.class.getName()), numPartitions,
      MRHelpers.getReduceResource(tezConf)).addOutput("finalOutput",
      new OutputDescriptor(MROutput.class.getName())
          .setUserPayload(finalOutputPayload), MROutputCommitter.class);

  Edge e1 = new Edge(streamFileVertex, intersectVertex, edgeConf.createDefaultEdgeProperty());

  Edge e2 = new Edge(hashFileVertex, intersectVertex, edgeConf.createDefaultEdgeProperty());

  dag.addVertex(streamFileVertex).addVertex(hashFileVertex).addVertex(intersectVertex)
      .addEdge(e1).addEdge(e2);
  return dag;
}
 
Example #12
Source File: TopK.java    From sequenceiq-samples with Apache License 2.0
private DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath,
        String columnIndex, String top, String numPartitions) throws IOException {

    DataSourceDescriptor dataSource = MRInput.createConfigBuilder(new Configuration(tezConf),
            TextInputFormat.class, inputPath).build();

    DataSinkDescriptor dataSink = MROutput.createConfigBuilder(new Configuration(tezConf),
            TextOutputFormat.class, outputPath).build();

    Vertex tokenizerVertex = Vertex.create(TOKENIZER,
            ProcessorDescriptor.create(TokenProcessor.class.getName())
                    .setUserPayload(createPayload(Integer.valueOf(columnIndex))))
            .addDataSource(INPUT, dataSource);

    int topK = Integer.valueOf(top);
    Vertex sumVertex = Vertex.create(SUM,
            ProcessorDescriptor.create(SumProcessor.class.getName())
                    .setUserPayload(createPayload(topK)), Integer.valueOf(numPartitions));

    // parallelism must be set to 1 because the writer needs to see the global picture of
    // the data set; multiple writer tasks would each produce their own top K list,
    // since every task only sees the top K elements of its own partition
    Vertex writerVertex = Vertex.create(WRITER,
            ProcessorDescriptor.create(Writer.class.getName())
                    .setUserPayload(createPayload(topK)), 1)
            .addDataSink(OUTPUT, dataSink);

    OrderedPartitionedKVEdgeConfig tokenSumEdge = OrderedPartitionedKVEdgeConfig
            .newBuilder(Text.class.getName(), IntWritable.class.getName(),
                    HashPartitioner.class.getName()).build();

    UnorderedKVEdgeConfig sumWriterEdge = UnorderedKVEdgeConfig
            .newBuilder(IntWritable.class.getName(), Text.class.getName()).build();

    DAG dag = DAG.create("topk");
    return dag
            .addVertex(tokenizerVertex)
            .addVertex(sumVertex)
            .addVertex(writerVertex)
            .addEdge(Edge.create(tokenizerVertex, sumVertex, tokenSumEdge.createDefaultEdgeProperty()))
            .addEdge(Edge.create(sumVertex, writerVertex, sumWriterEdge.createDefaultBroadcastEdgeProperty()));
}
 
Example #13
Source File: IntersectDataGen.java    From incubator-tez with Apache License 2.0
private byte[] createPayloadForOutput(Path outputPath, Configuration srcConf) throws IOException {
  Configuration conf = new Configuration(srcConf);
  conf.set(FileOutputFormat.OUTDIR, outputPath.toUri().toString());
  byte[] payload = MROutput.createUserPayload(conf, TextOutputFormat.class.getName(), true);
  return payload;
}
 
Example #14
Source File: UnionExample.java    From incubator-tez with Apache License 2.0
private DAG createDAG(FileSystem fs, TezConfiguration tezConf,
    Map<String, LocalResource> localResources, Path stagingDir,
    String inputPath, String outputPath) throws IOException {
  DAG dag = new DAG("UnionExample");
  
  int numMaps = -1;
  Configuration inputConf = new Configuration(tezConf);
  inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
  InputDescriptor id = new InputDescriptor(MRInput.class.getName())
      .setUserPayload(MRInput.createUserPayload(inputConf,
          TextInputFormat.class.getName(), true, true));

  Vertex mapVertex1 = new Vertex("map1", new ProcessorDescriptor(
      TokenProcessor.class.getName()),
      numMaps, MRHelpers.getMapResource(tezConf));
  mapVertex1.addInput("MRInput", id, MRInputAMSplitGenerator.class);

  Vertex mapVertex2 = new Vertex("map2", new ProcessorDescriptor(
      TokenProcessor.class.getName()),
      numMaps, MRHelpers.getMapResource(tezConf));
  mapVertex2.addInput("MRInput", id, MRInputAMSplitGenerator.class);

  Vertex mapVertex3 = new Vertex("map3", new ProcessorDescriptor(
      TokenProcessor.class.getName()),
      numMaps, MRHelpers.getMapResource(tezConf));
  mapVertex3.addInput("MRInput", id, MRInputAMSplitGenerator.class);

  Vertex checkerVertex = new Vertex("checker",
      new ProcessorDescriptor(
          UnionProcessor.class.getName()),
              1, MRHelpers.getReduceResource(tezConf));

  Configuration outputConf = new Configuration(tezConf);
  outputConf.set(FileOutputFormat.OUTDIR, outputPath);
  OutputDescriptor od = new OutputDescriptor(MROutput.class.getName())
    .setUserPayload(MROutput.createUserPayload(
        outputConf, TextOutputFormat.class.getName(), true));
  checkerVertex.addOutput("union", od, MROutputCommitter.class);

  Configuration allPartsConf = new Configuration(tezConf);
  allPartsConf.set(FileOutputFormat.OUTDIR, outputPath+"-all-parts");
  OutputDescriptor od2 = new OutputDescriptor(MROutput.class.getName())
    .setUserPayload(MROutput.createUserPayload(
        allPartsConf, TextOutputFormat.class.getName(), true));
  checkerVertex.addOutput("all-parts", od2, MROutputCommitter.class);

  Configuration partsConf = new Configuration(tezConf);
  partsConf.set(FileOutputFormat.OUTDIR, outputPath+"-parts");
  
  VertexGroup unionVertex = dag.createVertexGroup("union", mapVertex1, mapVertex2);
  OutputDescriptor od1 = new OutputDescriptor(MROutput.class.getName())
    .setUserPayload(MROutput.createUserPayload(
        partsConf, TextOutputFormat.class.getName(), true));
  unionVertex.addOutput("parts", od1, MROutputCommitter.class);

  OrderedPartitionedKVEdgeConfigurer edgeConf = OrderedPartitionedKVEdgeConfigurer
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName(), null).build();

  dag.addVertex(mapVertex1)
      .addVertex(mapVertex2)
      .addVertex(mapVertex3)
      .addVertex(checkerVertex)
      .addEdge(
          new Edge(mapVertex3, checkerVertex, edgeConf.createDefaultEdgeProperty()))
      .addEdge(
          new GroupInputEdge(unionVertex, checkerVertex, edgeConf.createDefaultEdgeProperty(),
              new InputDescriptor(
                  ConcatenatedMergedKeyValuesInput.class.getName())));
  return dag;  
}
 
Example #15
Source File: CartesianProduct.java    From tez with Apache License 2.0
private DAG createDAG(TezConfiguration tezConf, String inputPath1, String inputPath2,
                      String inputPath3, String outputPath, boolean isPartitioned)
  throws IOException {
  Vertex v1 = Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  // turn off groupSplit so that each input file incurs one task
  v1.addDataSource(INPUT,
    MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath1)
           .groupSplits(false).build());
  Vertex v2 = Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v2.addDataSource(INPUT,
    MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath2)
            .groupSplits(false).build());
  Vertex v3 = Vertex.create(VERTEX3, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v3.addDataSource(INPUT,
    MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath3)
      .groupSplits(false).build());
  CartesianProductConfig cartesianProductConfig;
  if (isPartitioned) {
    Map<String, Integer> vertexPartitionMap = new HashMap<>();
    for (String vertex : cpSources) {
      vertexPartitionMap.put(vertex, numPartition);
    }
    cartesianProductConfig = new CartesianProductConfig(vertexPartitionMap);
  } else {
    cartesianProductConfig = new CartesianProductConfig(Arrays.asList(cpSources));
  }
  UserPayload userPayload = cartesianProductConfig.toUserPayload(tezConf);
  Vertex v4 = Vertex.create(VERTEX4, ProcessorDescriptor.create(JoinProcessor.class.getName()));
  v4.addDataSink(OUTPUT,
    MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath)
            .build());
  v4.setVertexManagerPlugin(
    VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName())
                                 .setUserPayload(userPayload));

  EdgeManagerPluginDescriptor cpEdgeManager =
    EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
  cpEdgeManager.setUserPayload(userPayload);
  EdgeProperty cpEdgeProperty;
  if (isPartitioned) {
    UnorderedPartitionedKVEdgeConfig cpEdgeConf =
      UnorderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(),
        IntWritable.class.getName(), CustomPartitioner.class.getName()).build();
    cpEdgeProperty = cpEdgeConf.createDefaultCustomEdgeProperty(cpEdgeManager);
  } else {
    UnorderedKVEdgeConfig edgeConf =
      UnorderedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName()).build();
    cpEdgeProperty = edgeConf.createDefaultCustomEdgeProperty(cpEdgeManager);
  }

  EdgeProperty broadcastEdgeProperty;
  UnorderedKVEdgeConfig broadcastEdgeConf =
    UnorderedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName()).build();
  broadcastEdgeProperty = broadcastEdgeConf.createDefaultBroadcastEdgeProperty();

  return DAG.create("CartesianProduct")
    .addVertex(v1).addVertex(v2).addVertex(v3).addVertex(v4)
    .addEdge(Edge.create(v1, v4, cpEdgeProperty))
    .addEdge(Edge.create(v2, v4, cpEdgeProperty))
    .addEdge(Edge.create(v3, v4, broadcastEdgeProperty));
}
 
Example #16
Source File: FilterByWordOutputProcessor.java    From incubator-tez with Apache License 2.0
@Override
public void run(Map<String, LogicalInput> inputs,
    Map<String, LogicalOutput> outputs) throws Exception {
  
  if (inputs.size() != 1) {
    throw new IllegalStateException("FilterByWordOutputProcessor processor can only work with a single input");
  }

  if (outputs.size() != 1) {
    throw new IllegalStateException("FilterByWordOutputProcessor processor can only work with a single output");
  }

  for (LogicalInput input : inputs.values()) {
    input.start();
  }
  for (LogicalOutput output : outputs.values()) {
    output.start();
  }

  LogicalInput li = inputs.values().iterator().next();
  if (! (li instanceof ShuffledUnorderedKVInput)) {
    throw new IllegalStateException("FilterByWordOutputProcessor processor can only work with ShuffledUnorderedKVInput");
  }

  LogicalOutput lo = outputs.values().iterator().next();
  if (! (lo instanceof MROutput)) {
    throw new IllegalStateException("FilterByWordOutputProcessor processor can only work with MROutput");
  }

  ShuffledUnorderedKVInput kvInput = (ShuffledUnorderedKVInput) li;
  MROutput mrOutput = (MROutput) lo;

  KeyValueReader kvReader = kvInput.getReader();
  KeyValueWriter kvWriter = mrOutput.getWriter();
  while (kvReader.next()) {
    Object key = kvReader.getCurrentKey();
    Object value = kvReader.getCurrentValue();

    kvWriter.write(key, value);
  }
  if (processorContext.canCommit()) {
    mrOutput.commit();
  } else {
    mrOutput.abort();
  }
}
 
Example #17
Source File: WordCount.java    From tez with Apache License 2.0
private DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath,
    int numPartitions) throws IOException {

  // Create the descriptor that describes the input data to Tez. Using MRInput to read text 
  // data from the given input path. The TextInputFormat is used to read the text data.
  DataSourceDescriptor dataSource = MRInput.createConfigBuilder(new Configuration(tezConf),
      TextInputFormat.class, inputPath).groupSplits(!isDisableSplitGrouping())
        .generateSplitsInAM(!isGenerateSplitInClient()).build();

  // Create a descriptor that describes the output data to Tez. Using MROutput to write text
  // data to the given output path. The TextOutputFormat is used to write the text data.
  DataSinkDescriptor dataSink = MROutput.createConfigBuilder(new Configuration(tezConf),
      TextOutputFormat.class, outputPath).build();

  // Create a vertex that reads the data from the data source and tokenizes it using the 
  // TokenProcessor. The number of tasks that will do the work for this vertex will be decided 
  // using the information provided by the data source descriptor.
  Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create(
      TokenProcessor.class.getName())).addDataSource(INPUT, dataSource);

  // Create the edge that represents the movement and semantics of data between the producer 
  // Tokenizer vertex and the consumer Summation vertex. In order to perform the summation in 
  // parallel the tokenized data will be partitioned by word such that a given word goes to the 
  // same partition. The counts for the words should be grouped together per word. To achieve this
  // we can use an edge that contains an input/output pair that handles partitioning and grouping 
  // of key value data. We use the helper OrderedPartitionedKVEdgeConfig to create such an
  // edge. Internally, it sets up matching Tez inputs and outputs that can perform this logic.
  // We specify the key, value and partitioner type. Here the key type is Text (for word), the 
  // value type is IntWritable (for count) and we are using a hash-based partitioner. This is a helper
  // object. The edge can be configured by configuring the input, output etc individually without
  // using this helper. The setFromConfiguration call is optional and allows overriding the config
  // options with command line parameters.
  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();

  // Create a vertex that reads the tokenized data and calculates the sum using the SumProcessor.
  // The number of tasks that do the work of this vertex depends on the number of partitions used 
  // to distribute the sum processing. In this case, it's been made configurable via the 
  // numPartitions parameter.
  Vertex summationVertex = Vertex.create(SUMMATION,
      ProcessorDescriptor.create(SumProcessor.class.getName()), numPartitions)
      .addDataSink(OUTPUT, dataSink);

  // No need to add jar containing this class as assumed to be part of the Tez jars. Otherwise 
  // we would have to add the jars for this code as local files to the vertices.
  
  // Create DAG and add the vertices. Connect the producer and consumer vertices via the edge
  DAG dag = DAG.create("WordCount");
  dag.addVertex(tokenizerVertex)
      .addVertex(summationVertex)
      .addEdge(
          Edge.create(tokenizerVertex, summationVertex, edgeConf.createDefaultEdgeProperty()));
  return dag;  
}
 
Example #18
Source File: OrderedWordCount.java    From tez with Apache License 2.0
public static DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath,
    int numPartitions, boolean disableSplitGrouping, boolean isGenerateSplitInClient, String dagName) throws IOException {

  DataSourceDescriptor dataSource = MRInput.createConfigBuilder(new Configuration(tezConf),
      TextInputFormat.class, inputPath).groupSplits(!disableSplitGrouping)
        .generateSplitsInAM(!isGenerateSplitInClient).build();

  DataSinkDescriptor dataSink = MROutput.createConfigBuilder(new Configuration(tezConf),
      TextOutputFormat.class, outputPath).build();

  Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create(
      TokenProcessor.class.getName()));
  tokenizerVertex.addDataSource(INPUT, dataSource);

  // Use Text key and IntWritable value to bring counts for each word in the same partition
  // The setFromConfiguration call is optional and allows overriding the config options with
  // command line parameters.
  OrderedPartitionedKVEdgeConfig summationEdgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();

  // This vertex will be reading intermediate data via an input edge and writing intermediate data
  // via an output edge.
  Vertex summationVertex = Vertex.create(SUMMATION, ProcessorDescriptor.create(
      SumProcessor.class.getName()), numPartitions);
  
  // Use IntWritable key and Text value to bring all words with the same count in the same 
  // partition. The data will be ordered by count and words grouped by count. The
  // setFromConfiguration call is optional and allows overriding the config options with
  // command line parameters.
  OrderedPartitionedKVEdgeConfig sorterEdgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(IntWritable.class.getName(), Text.class.getName(),
          HashPartitioner.class.getName())
      .setFromConfiguration(tezConf)
      .build();

  // Use 1 task to bring all the data in one place for global sorted order. Essentially the number
  // of partitions is 1. So the NoOpSorter can be used to produce the globally ordered output
  Vertex sorterVertex = Vertex.create(SORTER, ProcessorDescriptor.create(
      NoOpSorter.class.getName()), 1);
  sorterVertex.addDataSink(OUTPUT, dataSink);

  // No need to add jar containing this class as assumed to be part of the tez jars.
  
  DAG dag = DAG.create(dagName);
  dag.addVertex(tokenizerVertex)
      .addVertex(summationVertex)
      .addVertex(sorterVertex)
      .addEdge(
          Edge.create(tokenizerVertex, summationVertex,
              summationEdgeConf.createDefaultEdgeProperty()))
      .addEdge(
          Edge.create(summationVertex, sorterVertex, sorterEdgeConf.createDefaultEdgeProperty()));
  return dag;  
}
 
Example #19
Source File: UnionExample.java    From tez with Apache License 2.0
private DAG createDAG(FileSystem fs, TezConfiguration tezConf,
    Map<String, LocalResource> localResources, Path stagingDir,
    String inputPath, String outputPath) throws IOException {
  DAG dag = DAG.create("UnionExample");
  
  int numMaps = -1;
  Configuration inputConf = new Configuration(tezConf);
  inputConf.setBoolean("mapred.mapper.new-api", false);
  inputConf.set("mapred.input.format.class", TextInputFormat.class.getName());
  inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
  MRInput.MRInputConfigBuilder configurer = MRInput.createConfigBuilder(inputConf, null);
  DataSourceDescriptor dataSource = configurer.generateSplitsInAM(false).build();

  Vertex mapVertex1 = Vertex.create("map1", ProcessorDescriptor.create(
      TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

  Vertex mapVertex2 = Vertex.create("map2", ProcessorDescriptor.create(
      TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

  Vertex mapVertex3 = Vertex.create("map3", ProcessorDescriptor.create(
      TokenProcessor.class.getName()), numMaps).addDataSource("MRInput", dataSource);

  Vertex checkerVertex = Vertex.create("checker", ProcessorDescriptor.create(
      UnionProcessor.class.getName()), 1);

  Configuration outputConf = new Configuration(tezConf);
  outputConf.setBoolean("mapred.reducer.new-api", false);
  outputConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
  outputConf.set(FileOutputFormat.OUTDIR, outputPath);
  DataSinkDescriptor od = MROutput.createConfigBuilder(outputConf, null).build();
  checkerVertex.addDataSink("union", od);
  

  Configuration allPartsConf = new Configuration(tezConf);
  DataSinkDescriptor od2 = MROutput.createConfigBuilder(allPartsConf,
      TextOutputFormat.class, outputPath + "-all-parts").build();
  checkerVertex.addDataSink("all-parts", od2);

  Configuration partsConf = new Configuration(tezConf);    
  DataSinkDescriptor od1 = MROutput.createConfigBuilder(partsConf,
      TextOutputFormat.class, outputPath + "-parts").build();
  VertexGroup unionVertex = dag.createVertexGroup("union", mapVertex1, mapVertex2);
  unionVertex.addDataSink("parts", od1);

  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName()).build();

  dag.addVertex(mapVertex1)
      .addVertex(mapVertex2)
      .addVertex(mapVertex3)
      .addVertex(checkerVertex)
      .addEdge(
          Edge.create(mapVertex3, checkerVertex, edgeConf.createDefaultEdgeProperty()))
      .addEdge(
          GroupInputEdge.create(unionVertex, checkerVertex, edgeConf.createDefaultEdgeProperty(),
              InputDescriptor.create(
                  ConcatenatedMergedKeyValuesInput.class.getName())));
  return dag;  
}
 
Example #20
Source File: TestHistoryParser.java    From tez with Apache License 2.0
private String runWordCount(String tokenizerProcessor, String summationProcessor,
    String dagName, boolean withTimeline)
    throws Exception {
  //HDFS path
  Path outputLoc = new Path("/tmp/outPath_" + System.currentTimeMillis());

  DataSourceDescriptor dataSource = MRInput.createConfigBuilder(conf,
      TextInputFormat.class, inputLoc.toString()).build();

  DataSinkDescriptor dataSink =
      MROutput.createConfigBuilder(conf, TextOutputFormat.class, outputLoc.toString()).build();

  Vertex tokenizerVertex = Vertex.create(TOKENIZER, ProcessorDescriptor.create(
      tokenizerProcessor)).addDataSource(INPUT, dataSource);

  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName()).build();

  Vertex summationVertex = Vertex.create(SUMMATION,
      ProcessorDescriptor.create(summationProcessor), 1).addDataSink(OUTPUT, dataSink);

  // Create DAG and add the vertices. Connect the producer and consumer vertices via the edge
  DAG dag = DAG.create(dagName);
  dag.addVertex(tokenizerVertex).addVertex(summationVertex).addEdge(
      Edge.create(tokenizerVertex, summationVertex, edgeConf.createDefaultEdgeProperty()));

  TezClient tezClient = getTezClient(withTimeline);

  // Update Caller Context
  CallerContext callerContext = CallerContext.create("TezExamples", "Tez WordCount Example Job");
  ApplicationId appId = tezClient.getAppMasterApplicationId();
  if (appId == null) {
    appId = ApplicationId.newInstance(1001l, 1);
  }
  callerContext.setCallerIdAndType(appId.toString(), "TezApplication");
  dag.setCallerContext(callerContext);

  DAGClient client = tezClient.submitDAG(dag);
  client.waitForCompletionWithStatusUpdates(Sets.newHashSet(StatusGetOpts.GET_COUNTERS));
  TezDAGID tezDAGID = TezDAGID.getInstance(tezClient.getAppMasterApplicationId(), 1);

  if (tezClient != null) {
    tezClient.stop();
  }
  return tezDAGID.toString();
}
 
Example #21
Source File: MRHelpers.java    From incubator-tez with Apache License 2.0
/**
 * Convenience method to add an MR Output to the specified vertex. The name of
 * the Output is "MROutput" </p>
 * 
 * This should only be called for one vertex in a DAG
 * 
 * @param vertex
 * @param userPayload
 */
public static void addMROutput(Vertex vertex, byte[] userPayload) {
  OutputDescriptor od = new OutputDescriptor(MROutput.class.getName())
      .setUserPayload(userPayload);
  vertex.addOutput("MROutput", od, MROutputCommitter.class);
}