org.apache.crunch.PipelineResult Java Examples

The following examples show how to use org.apache.crunch.PipelineResult. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JoinFilterExampleCrunch.java    From hadoop-arch-book with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the "foo" and "bar" text inputs, inner-joins them, filters the joined rows, and
 * writes the result as text, overwriting any existing output.
 *
 * <p>Expected arguments, in order: foo input path, bar input path, output path, the
 * threshold passed to {@code FooFilter}, the threshold passed to {@code JoinFilter}, and
 * the reducer count passed to {@code DefaultJoinStrategy}.
 *
 * @param args the six positional arguments described above; extra arguments are ignored
 * @return 0 if the pipeline reports success; 1 if it does not or if arguments are missing
 * @throws Exception if a numeric argument cannot be parsed, or if any pipeline call throws
 */
public int run(String[] args) throws Exception {

    // Report a usage error instead of failing with ArrayIndexOutOfBoundsException below.
    if (args == null || args.length < 6) {
        System.err.println("Usage: JoinFilterExampleCrunch <fooInputPath> <barInputPath>"
            + " <outputPath> <fooValMax> <joinValMax> <numberOfReducers>");
        return 1;
    }

    String fooInputPath = args[0];
    String barInputPath = args[1];
    String outputPath = args[2];
    int fooValMax = Integer.parseInt(args[3]);
    int joinValMax = Integer.parseInt(args[4]);
    int numberOfReducers = Integer.parseInt(args[5]);

    Pipeline pipeline = new MRPipeline(JoinFilterExampleCrunch.class, getConf()); //<1>

    PCollection<String> fooLines = pipeline.readTextFile(fooInputPath);  //<2>
    PCollection<String> barLines = pipeline.readTextFile(barInputPath);

    // Turn each foo line into a (Long key, (Long, Integer)) table entry.
    PTable<Long, Pair<Long, Integer>> fooTable = fooLines.parallelDo(  //<3>
        new FooIndicatorFn(),
        Avros.tableOf(Avros.longs(),
        Avros.pairs(Avros.longs(), Avros.ints())));

    // Filter the foo side before the join.
    fooTable = fooTable.filter(new FooFilter(fooValMax));  //<4>

    // Turn each bar line into a (Long key, Integer) table entry.
    PTable<Long, Integer> barTable = barLines.parallelDo(new BarIndicatorFn(),
        Avros.tableOf(Avros.longs(), Avros.ints()));

    DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer> joinStrategy =   //<5>
        new DefaultJoinStrategy
          <Long, Pair<Long, Integer>, Integer>
          (numberOfReducers);

    PTable<Long, Pair<Pair<Long, Integer>, Integer>> joinedTable = joinStrategy //<6>
        .join(fooTable, barTable, JoinType.INNER_JOIN);

    // Second filter runs on the joined rows.
    PTable<Long, Pair<Pair<Long, Integer>, Integer>> filteredTable = joinedTable.filter(new JoinFilter(joinValMax));

    filteredTable.write(At.textFile(outputPath), WriteMode.OVERWRITE); //<7>

    // done() executes the pipeline; map its outcome to a process exit code.
    PipelineResult result = pipeline.done();

    return result.succeeded() ? 0 : 1;
}
 
Example #2
Source File: StagingToPersistent.java    From kite-examples with Apache License 2.0 5 votes vote down vote up
/**
 * Appends every staging record whose "timestamp" is before the start of today to the
 * persistent logs dataset, then deletes those records from staging if the run succeeded.
 *
 * @param args not read by this method
 * @return 0 if the pipeline succeeded, 1 otherwise
 * @throws Exception if any dataset or pipeline call throws
 */
@Override
public int run(String[] args) throws Exception {
  final long todayStart = startOfDay();

  // destination: where the staged records end up
  final Dataset<Record> persistentLogs =
      Datasets.load("dataset:file:/tmp/data/logs", Record.class);

  // source: the slice of staging that is older than today
  final Dataset<Record> stagingLogs =
      Datasets.load("dataset:file:/tmp/data/logs_staging", Record.class);
  final View<Record> readyView = stagingLogs.toBefore("timestamp", todayStart);

  final ReadableSource<Record> stagedSource = CrunchDatasets.asSource(readyView);
  final PCollection<Record> stagedRecords = read(stagedSource);

  getPipeline().write(
      stagedRecords,
      CrunchDatasets.asTarget(persistentLogs),
      Target.WriteMode.APPEND);

  final PipelineResult outcome = run();
  if (!outcome.succeeded()) {
    return 1;
  }

  // only drop the staged data once it has been copied successfully
  readyView.deleteAll();
  return 0;
}
 
Example #3
Source File: WordCount.java    From tutorials with MIT License 5 votes vote down vote up
/**
 * Counts the non-stop-words in a text file and writes the counts to a text output.
 *
 * @param args exactly two entries: the input path followed by the output path
 * @return 0 if the pipeline succeeded; 1 if it failed or the argument count is wrong
 * @throws Exception if any pipeline call throws
 */
public int run(String[] args) throws Exception {

    final boolean wrongArgCount = args.length != 2;
    if (wrongArgCount) {
        System.err.println("Usage: hadoop jar crunch-1.0.0-SNAPSHOT-job.jar" + " [generic options] input output");
        System.err.println();
        GenericOptionsParser.printGenericCommandUsage(System.err);
        return 1;
    }

    final String source = args[0];
    final String destination = args[1];

    // The pipeline object coordinates planning and execution of the job.
    final Pipeline wordCountPipeline = new MRPipeline(WordCount.class, getConf());

    // lines -> words (serialized as Writable strings) -> words minus stop words -> counts
    final PTable<String, Long> wordCounts = wordCountPipeline
        .readTextFile(source)
        .parallelDo(new Tokenizer(), Writables.strings())
        .filter(new StopWordFilter())
        .count();

    // Register the text output for the counts.
    wordCountPipeline.writeTextFile(wordCounts, destination);

    // done() runs the pipeline as MapReduce and reports the outcome.
    final PipelineResult outcome = wordCountPipeline.done();
    if (outcome.succeeded()) {
        return 0;
    }
    return 1;
}
 
Example #4
Source File: LegacyHdfs2Cass.java    From hdfs2cass with Apache License 2.0 5 votes vote down vote up
/**
 * Reads {@code ByteBuffer} records from Avro files and writes them to the Cassandra target
 * named by the {@code output} URI, using either the Thrift or the CQL path depending on
 * the URI scheme.
 *
 * @param args command-line arguments, parsed into this object's fields by JCommander
 * @return 0 if the pipeline succeeded, 1 otherwise
 * @throws Exception if argument parsing or any pipeline call throws
 */
@Override
public int run(String[] args) throws Exception {

  // Populates the annotated fields of this command from the raw arguments.
  new JCommander(this, args);

  final URI targetUri = URI.create(output);

  // The job runs as a MapReduce pipeline.
  final Pipeline mrPipeline = new MRPipeline(LegacyHdfs2Cass.class, getConf());

  // Cluster details are derived from the output URI.
  final CassandraParams cassandra = CassandraParams.parse(targetUri);

  // Input: ByteBuffer records stored in Avro files.
  final PCollection<ByteBuffer> input =
      mrPipeline.read(From.avroFile(inputList(this.input), Avros.records(ByteBuffer.class)));

  // The URI scheme selects the write path; any other scheme registers no output.
  final String scheme = targetUri.getScheme();
  final boolean viaThrift = scheme.equalsIgnoreCase("thrift");
  if (viaThrift) {
    // ByteBuffer -> ThriftRecord -> keyed pair -> grouped -> Thrift target
    input
        .parallelDo(new LegacyHdfsToThrift(), ThriftRecord.PTYPE)
        .parallelDo(new ThriftRecord.AsPair(), ThriftRecord.AsPair.PTYPE)
        .groupByKey(cassandra.createGroupingOptions())
        .write(new ThriftTarget(targetUri, cassandra));
  } else if (scheme.equalsIgnoreCase("cql")) {
    // ByteBuffer -> CQLRecord -> keyed by the params' key function -> grouped -> CQL target
    input
        .parallelDo(new LegacyHdfsToCQL(), CQLRecord.PTYPE)
        .by(cassandra.getKeyFn(), Avros.bytes())
        .groupByKey(cassandra.createGroupingOptions())
        .write(new CQLTarget(targetUri, cassandra));
  }

  // Run the pipeline and translate its outcome into an exit code.
  final PipelineResult outcome = mrPipeline.done();
  if (outcome.succeeded()) {
    return 0;
  }
  return 1;
}
 
Example #5
Source File: InputFormatImportCommand.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Runs a task with the given {@link ClassLoader} as the context loader.
 *
 * <p>The task is executed on a dedicated thread named "transform-task" whose context
 * class loader is {@code loader}; the calling thread blocks until the task finishes.
 * The single-thread executor used to host that thread is shut down as soon as the task
 * has been handed over, so its worker thread is released once the task completes.
 *
 * @param task a {@link TransformTask}
 * @param loader a {@link ClassLoader}
 * @return the result of {@link TransformTask#run}
 * @throws IOException if the task throws an IOException
 * @throws InterruptedException if the task execution is interrupted
 */
private static PipelineResult runTaskWithClassLoader(
    final TransformTask task, final ClassLoader loader)
    throws IOException, InterruptedException {
  RunnableFuture<PipelineResult> future = new FutureTask<PipelineResult>(
      new Callable<PipelineResult>() {
        @Override
        public PipelineResult call() throws Exception {
          return task.run();
        }
      });

  // Fully qualified so that this method does not depend on an additional import.
  java.util.concurrent.ExecutorService executor = Executors.newSingleThreadExecutor(
      new ThreadFactory() {
        @Override
        public Thread newThread(Runnable r) {
          Thread taskThread = new Thread(r, "transform-task");
          taskThread.setContextClassLoader(loader);
          return taskThread;
        }
      });

  try {
    executor.execute(future);
  } finally {
    // shutdown() still runs the already-submitted task, then lets the worker thread
    // exit; without it the executor's thread is never released.
    executor.shutdown();
  }

  try {
    return future.get();
  } catch (ExecutionException e) {
    // Surface the task's own failure: IOException as-is, anything else unchecked.
    Throwables.propagateIfInstanceOf(e.getCause(), IOException.class);
    throw Throwables.propagate(e.getCause());
  }
}
 
Example #6
Source File: CompactCommand.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Compacts the single dataset or view named on the command line.
 *
 * @return 0 if the compaction pipeline succeeded, 1 otherwise
 * @throws IOException if loading the view or running the task throws
 */
@Override
public int run() throws IOException {
  Preconditions.checkArgument(datasets.size() == 1,
      "Cannot compact multiple datasets");

  final String requested = datasets.get(0);
  final View<Record> resolved = load(requested, Record.class);

  // When a URI was given, make sure it resolved to the view that was asked for.
  if (isDatasetOrViewUri(requested)) {
    Preconditions.checkArgument(viewMatches(resolved.getUri(), requested),
        "Resolved view does not match requested view: " + resolved.getUri());
  }

  CompactionTask compaction = new CompactionTask<Record>(resolved);
  compaction.setConf(getConf());

  // Optional tuning: each setting is applied only when the guard below holds.
  if (numWriters >= 0) {
    compaction.setNumWriters(numWriters);
  }
  if (filesPerPartition > 0) {
    compaction.setFilesPerPartition(filesPerPartition);
  }

  final PipelineResult outcome = compaction.run();
  if (!outcome.succeeded()) {
    return 1;
  }

  console.info("Compacted {} records in \"{}\"",
      compaction.getCount(), requested);
  return 0;
}
 
Example #7
Source File: Hdfs2Cass.java    From hdfs2cass with Apache License 2.0 4 votes vote down vote up
/**
 * Reads generic Avro records and writes them to the Cassandra target named by the
 * {@code output} URI, using either the Thrift or the CQL path depending on the URI scheme.
 * Speculative execution is switched off for both map and reduce tasks.
 *
 * @param args command-line arguments, parsed into this object's fields by JCommander
 * @return 0 if the pipeline succeeded, 1 otherwise
 * @throws Exception if argument parsing or any pipeline call throws
 */
@Override
public int run(String[] args) throws Exception {

  // Populates the annotated fields of this command from the raw arguments.
  new JCommander(this, args);

  final URI targetUri = URI.create(output);

  // The job runs as a MapReduce pipeline with speculative execution disabled.
  final Configuration jobConf = getConf();
  jobConf.setBoolean(MRJobConfig.MAP_SPECULATIVE, Boolean.FALSE);
  jobConf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, Boolean.FALSE);
  final Pipeline mrPipeline = new MRPipeline(Hdfs2Cass.class, jobConf);

  // Cluster details are derived from the output URI.
  final CassandraParams cassandra = CassandraParams.parse(targetUri);

  // The Avro source is read untyped, then viewed as GenericRecord via a raw cast.
  final PCollection<GenericRecord> input =
      ((PCollection<GenericRecord>)(PCollection) mrPipeline.read(From.avroFile(inputList(this.input))));

  // The URI scheme selects the write path; any other scheme registers no output.
  final String scheme = targetUri.getScheme();
  final boolean viaThrift = scheme.equalsIgnoreCase("thrift");
  if (viaThrift) {
    // GenericRecord -> ThriftRecord -> keyed pair -> grouped -> Thrift target
    input
        .parallelDo(new AvroToThrift(rowkey, timestamp, ttl, ignore), ThriftRecord.PTYPE)
        .parallelDo(new ThriftRecord.AsPair(), ThriftRecord.AsPair.PTYPE)
        .groupByKey(cassandra.createGroupingOptions())
        .write(new ThriftTarget(targetUri, cassandra));
  } else if (scheme.equalsIgnoreCase("cql")) {
    // GenericRecord -> CQLRecord -> keyed by the params' key function -> grouped -> CQL target
    input
        .parallelDo(new AvroToCQL(rowkey, timestamp, ttl, ignore), CQLRecord.PTYPE)
        .by(cassandra.getKeyFn(), Avros.bytes())
        .groupByKey(cassandra.createGroupingOptions())
        .write(new CQLTarget(targetUri, cassandra));
  }

  // Run the pipeline and translate its outcome into an exit code.
  final PipelineResult outcome = mrPipeline.done();
  if (outcome.succeeded()) {
    return 0;
  }
  return 1;
}
 
Example #8
Source File: CompactionTask.java    From kite with Apache License 2.0 4 votes vote down vote up
/**
 * Delegates to the wrapped {@code task} and hands back its result unchanged.
 *
 * @return whatever {@code task.run()} returns
 * @throws IOException if the wrapped task throws it
 */
public PipelineResult run() throws IOException {
  final PipelineResult outcome = task.run();
  return outcome;
}
 
Example #9
Source File: TransformTask.java    From kite with Apache License 2.0 4 votes vote down vote up
/**
 * Builds and runs a MapReduce pipeline that reads {@code from}, applies {@code transform},
 * checks each entity with {@code CheckEntityClass}, and writes the result to {@code to}.
 *
 * <p>Before the pipeline is created the configuration is adjusted: if either dataset is
 * local, a copy of the configuration with {@code mapreduce.framework.name=local} is
 * installed; if either side is a Hive dataset, the configuration is passed through
 * {@code addHiveDelegationToken} and, when not local, the jars needed for metastore
 * interaction are added to the job.
 *
 * <p>After the run, {@code count} is taken from the {@code MAP_INPUT_RECORDS} counter of
 * the first stage result, when one exists.
 *
 * @return the result reported by the pipeline
 * @throws IOException declared for callers; see the methods invoked here
 * @throws DatasetOperationException if neither set of thrift classes can be loaded
 */
public PipelineResult run() throws IOException {
  boolean isLocal = (isLocal(from.getDataset()) || isLocal(to.getDataset()));
  if (isLocal) {
    // copy to avoid making changes to the caller's configuration
    Configuration conf = new Configuration(getConf());
    conf.set("mapreduce.framework.name", "local");
    setConf(conf);
  }

  if (isHive(from) || isHive(to)) {
    setConf(addHiveDelegationToken(getConf()));

    // add jars needed for metastore interaction to the classpath
    if (!isLocal) {
      Class<?> fb303Class, thriftClass;
      try {
        // attempt to use libfb303 and libthrift 0.9.2 when async was added.
        // Class.forName needs the binary name, so the nested class is written
        // with '$'; the dotted form could never resolve and always fell back.
        fb303Class = Class.forName(
            "com.facebook.fb303.FacebookService$AsyncProcessor");
        thriftClass = Class.forName(
            "org.apache.thrift.TBaseAsyncProcessor");
      } catch (ClassNotFoundException e) {
        try {
          // fallback to 0.9.0 or earlier
          fb303Class = Class.forName(
              "com.facebook.fb303.FacebookBase");
          thriftClass = Class.forName(
              "org.apache.thrift.TBase");
        } catch (ClassNotFoundException real) {
          throw new DatasetOperationException(
              "Cannot find thrift dependencies", real);
        }
      }

      TaskUtil.configure(getConf())
          .addJarForClass(Encoder.class) // commons-codec
          .addJarForClass(Log.class) // commons-logging
          .addJarForClass(CompressorInputStream.class) // commons-compress
          .addJarForClass(ApiAdapter.class) // datanucleus-core
          .addJarForClass(JDOAdapter.class) // datanucleus-api-jdo
          .addJarForClass(SQLQuery.class) // datanucleus-rdbms
          .addJarForClass(JDOHelper.class) // jdo-api
          .addJarForClass(Transaction.class) // jta
          .addJarForClass(fb303Class) // libfb303
          .addJarForClass(thriftClass) // libthrift
          .addJarForClass(HiveMetaStore.class) // hive-metastore
          .addJarForClass(HiveConf.class); // hive-exec
    }
  }

  PType<T> toPType = ptype(to);
  MapFn<T, T> validate = new CheckEntityClass<T>(to.getType());

  // created only after all configuration changes above have been applied
  Pipeline pipeline = new MRPipeline(getClass(), getConf());

  PCollection<T> collection = pipeline.read(CrunchDatasets.asSource(from))
      .parallelDo(transform, toPType).parallelDo(validate, toPType);

  if (compact) {
    // the transform must be run before partitioning
    collection = CrunchDatasets.partition(collection, to, numWriters, numPartitionWriters);
  }

  pipeline.write(collection, CrunchDatasets.asTarget(to), mode);

  PipelineResult result = pipeline.done();

  // record the number of input records from the first stage, if any stage ran
  StageResult sr = Iterables.getFirst(result.getStageResults(), null);
  if (sr != null && MAP_INPUT_RECORDS != null) {
    this.count = sr.getCounterValue(MAP_INPUT_RECORDS);
  }

  return result;
}
 
Example #10
Source File: CopyCommand.java    From kite with Apache License 2.0 4 votes vote down vote up
/**
 * Copies the first named dataset into the second, reading the source with the
 * destination's schema.
 *
 * @return 0 if the copy pipeline succeeded, 1 otherwise
 * @throws IOException if loading the views or running the task throws
 */
@Override
public int run() throws IOException {
  Preconditions.checkArgument(datasets != null && datasets.size() > 1,
      "Source and target datasets are required");
  Preconditions.checkArgument(datasets.size() == 2,
      "Cannot copy multiple datasets");

  // The destination is loaded first because the source is projected onto its schema.
  final View<GenericRecord> destination = load(datasets.get(1));
  final View<GenericRecord> origin =
      load(datasets.get(0)).asSchema(destination.getSchema());

  CopyTask copy = new CopyTask<GenericRecord>(origin, destination);
  copy.setConf(getConf());

  // Optional settings, each applied only when its flag or value asks for it.
  if (noCompaction) {
    copy.noCompaction();
  }
  if (numWriters >= 0) {
    copy.setNumWriters(numWriters);
  }
  if (filesPerPartition > 0) {
    copy.setFilesPerPartition(filesPerPartition);
  }
  if (overwrite) {
    copy.setWriteMode(Target.WriteMode.OVERWRITE);
  }

  final PipelineResult outcome = copy.run();
  if (!outcome.succeeded()) {
    return 1;
  }

  console.info("Added {} records to \"{}\"",
      copy.getCount(), datasets.get(1));
  return 0;
}
 
Example #11
Source File: JSONImportCommand.java    From kite with Apache License 2.0 4 votes vote down vote up
/**
 * Imports JSON files into a target dataset.
 *
 * <p>The JSON path is exposed as a temporary "json"-format dataset that uses the target's
 * schema with embedded column mappings and partition strategy removed. Its records are
 * then copied into the target, or run through the configured {@code transform} class
 * when one is set. The temporary repository is deleted whether or not the import
 * succeeds.
 *
 * @return 0 if the import pipeline succeeded, 1 otherwise
 * @throws IOException if a file system, dataset, or task call throws
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() == 2,
      "JSON path and target dataset name are required.");

  final Path jsonPath = qualifiedPath(targets.get(0));
  final FileSystem jsonFS = jsonPath.getFileSystem(getConf());
  Preconditions.checkArgument(jsonFS.exists(jsonPath),
      "JSON path does not exist: " + jsonPath);

  final String targetName = targets.get(1);
  final View<Record> targetView = load(targetName, Record.class);
  final Schema targetSchema = targetView.getDataset().getDescriptor().getSchema();

  // Describe the JSON files using the target schema without its embedded metadata.
  final Schema plainSchema = ColumnMappingParser.removeEmbeddedMapping(
      PartitionStrategyParser.removeEmbeddedStrategy(targetSchema));
  final DatasetDescriptor jsonDescriptor = new DatasetDescriptor.Builder()
      .location(jsonPath.toUri())
      .schema(plainSchema)
      .format("json")
      .build();

  final TemporaryFileSystemDatasetRepository tempRepo =
      new TemporaryFileSystemDatasetRepository(getConf(),
          // ensure the same FS as the file source is used
          jsonFS.makeQualified(new Path("/tmp/" + UUID.randomUUID())),
          targetView.getDataset().getNamespace(),
          UUID.randomUUID().toString());

  try {
    final FileSystemDataset<Record> jsonDataset =
        (FileSystemDataset) tempRepo.create("import", "json", jsonDescriptor);

    // Refuse to run when the JSON location contains no data files at all.
    final Iterator<Path> dataFiles = jsonDataset.pathIterator().iterator();
    Preconditions.checkArgument(dataFiles.hasNext(),
        "JSON path has no data files: " + jsonPath);

    TaskUtil.configure(getConf()).addJars(jars);

    TransformTask importTask;
    if (transform == null) {
      // No transform configured: plain copy.
      importTask = new CopyTask<Record>(jsonDataset, targetView);
    } else {
      // Instantiate the user-supplied DoFn through its no-arg constructor.
      DoFn<Record, Record> userFn;
      try {
        DynConstructors.Ctor<DoFn<Record, Record>> noArgCtor =
            new DynConstructors.Builder(DoFn.class)
                .loader(loaderForJars(jars))
                .impl(transform)
                .buildChecked();
        userFn = noArgCtor.newInstance();
      } catch (NoSuchMethodException e) {
        throw new DatasetException(
            "Cannot find no-arg constructor for class: " + transform, e);
      }
      importTask = new TransformTask<Record, Record>(
          jsonDataset, targetView, userFn);
    }

    importTask.setConf(getConf());

    // Optional settings, each applied only when its flag or value asks for it.
    if (noCompaction) {
      importTask.noCompaction();
    }
    if (numWriters >= 0) {
      importTask.setNumWriters(numWriters);
    }
    if (filesPerPartition > 0) {
      importTask.setFilesPerPartition(filesPerPartition);
    }
    if (overwrite) {
      importTask.setWriteMode(Target.WriteMode.OVERWRITE);
    }

    final PipelineResult outcome = importTask.run();
    if (!outcome.succeeded()) {
      return 1;
    }

    // Only report when at least one record was counted.
    final long added = importTask.getCount();
    if (added > 0) {
      console.info("Added {} records to \"{}\"", added, targetName);
    }
    return 0;
  } finally {
    // clean up the temporary repository
    tempRepo.delete();
  }
}
 
Example #12
Source File: TransformCommand.java    From kite with Apache License 2.0 4 votes vote down vote up
/**
 * Copies records from the first named dataset to the second, running them through the
 * configured {@code transform} class when one is set.
 *
 * @return 0 if the pipeline succeeded, 1 otherwise
 * @throws IOException if loading the views or running the task throws
 */
@Override
public int run() throws IOException {
  Preconditions.checkArgument(datasets != null && datasets.size() > 1,
      "Source and target datasets are required");
  Preconditions.checkArgument(datasets.size() == 2,
      "Cannot copy multiple datasets");

  final View<Record> origin = load(datasets.get(0), Record.class);
  final View<Record> destination = load(datasets.get(1), Record.class);

  TaskUtil.configure(getConf()).addJars(jars);

  TransformTask work;
  if (transform == null) {
    // No transform configured: plain copy.
    work = new CopyTask<Record>(origin, destination);
  } else {
    // Instantiate the user-supplied DoFn through its no-arg constructor.
    DoFn<Record, Record> userFn;
    try {
      DynConstructors.Ctor<DoFn<Record, Record>> noArgCtor =
          new DynConstructors.Builder(DoFn.class)
              .loader(loaderForJars(jars))
              .impl(transform)
              .buildChecked();
      userFn = noArgCtor.newInstance();
    } catch (NoSuchMethodException e) {
      throw new DatasetException(
          "Cannot find no-arg constructor for class: " + transform, e);
    }
    work = new TransformTask<Record, Record>(origin, destination, userFn);
  }

  work.setConf(getConf());

  // Optional settings, each applied only when its flag or value asks for it.
  if (noCompaction) {
    work.noCompaction();
  }
  if (numWriters >= 0) {
    work.setNumWriters(numWriters);
  }
  if (filesPerPartition > 0) {
    work.setFilesPerPartition(filesPerPartition);
  }
  if (overwrite) {
    work.setWriteMode(Target.WriteMode.OVERWRITE);
  }

  final PipelineResult outcome = work.run();
  if (!outcome.succeeded()) {
    return 1;
  }

  console.info("Added {} records to \"{}\"",
      work.getCount(), datasets.get(1));
  return 0;
}