org.apache.crunch.Pair Java Examples

The following examples show how to use org.apache.crunch.Pair. Each snippet is taken from an open source project; the source file, project, and license are noted above it.
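Before the project snippets, a quick orientation: a Pair is an immutable two-element tuple. It is normally created with the static Pair.of factory or the two-argument constructor, and its elements are read back with first() and second(). The following is a minimal, self-contained sketch; the class name, variable names, and values are illustrative and do not come from any of the projects below.

import org.apache.crunch.Pair;

public class PairSketch {
  public static void main(String[] args) {
    // Create a pair with the static factory, and with the constructor used in Example #1.
    Pair<String, Integer> clicks = Pair.of("clicks", 42);
    Pair<String, Integer> sameClicks = new Pair<String, Integer>("clicks", 42);

    // Read the elements back positionally.
    String name = clicks.first();     // "clicks"
    Integer count = clicks.second();  // 42

    // Pairs are value objects: equality is element-wise.
    System.out.println(name + " -> " + count + ", equal: " + clicks.equals(sameClicks));
  }
}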
Example #1
Source File: JoinFilterExampleCrunch.java    From hadoop-arch-book with Apache License 2.0
@Override
public Pair<Long, Pair<Long, Integer>> map(String input) {
  // Split the pipe-delimited record into its fields.
  String[] cells = StringUtils.split(input, "|");

  // Value: the foo record's id and value.
  Pair<Long, Integer> valuePair = new Pair<Long, Integer>(
      Long.parseLong(cells[FOO_ID_INX]),
      Integer.parseInt(cells[FOO_VALUE_INX]));

  // Key on the bar id so the record can later be joined against the bar table.
  return new Pair<Long, Pair<Long, Integer>>(
      Long.parseLong(cells[FOO_BAR_ID_INX]), valuePair);
}
 
Example #2
Source File: JoinFilterExampleCrunch.java    From hadoop-arch-book with Apache License 2.0
public int run(String[] args) throws Exception {

    String fooInputPath = args[0];
    String barInputPath = args[1];
    String outputPath = args[2];
    int fooValMax = Integer.parseInt(args[3]);
    int joinValMax = Integer.parseInt(args[4]);
    int numberOfReducers = Integer.parseInt(args[5]);

    Pipeline pipeline = new MRPipeline(JoinFilterExampleCrunch.class, getConf()); //<1>
    
    PCollection<String> fooLines = pipeline.readTextFile(fooInputPath);  //<2>
    PCollection<String> barLines = pipeline.readTextFile(barInputPath);

    PTable<Long, Pair<Long, Integer>> fooTable = fooLines.parallelDo(  //<3>
        new FooIndicatorFn(),
        Avros.tableOf(Avros.longs(),
            Avros.pairs(Avros.longs(), Avros.ints())));

    fooTable = fooTable.filter(new FooFilter(fooValMax));  //<4>

    PTable<Long, Integer> barTable = barLines.parallelDo(new BarIndicatorFn(),
        Avros.tableOf(Avros.longs(), Avros.ints()));

    DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer> joinStrategy =   //<5>
        new DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer>(numberOfReducers);

    PTable<Long, Pair<Pair<Long, Integer>, Integer>> joinedTable = joinStrategy //<6>
        .join(fooTable, barTable, JoinType.INNER_JOIN);

    PTable<Long, Pair<Pair<Long, Integer>, Integer>> filteredTable = joinedTable.filter(new JoinFilter(joinValMax));

    filteredTable.write(At.textFile(outputPath), WriteMode.OVERWRITE); //<7>

    PipelineResult result = pipeline.done();

    return result.succeeded() ? 0 : 1;
  }
 
Example #3
Source File: JoinFilterExampleCrunch.java    From hadoop-arch-book with Apache License 2.0
@Override
public Pair<Long, Integer> map(String input) {
  // Split the pipe-delimited record and key it by bar id with the bar value.
  String[] cells = StringUtils.split(input, "|");

  return new Pair<Long, Integer>(Long.parseLong(cells[BAR_ID_INX]),
      Integer.parseInt(cells[BAR_VALUE_INX]));
}
 
Example #4
Source File: CreateSessions.java    From kite-examples with Apache License 2.0
@Override
public void process(
    Pair<String, Iterable<StandardEvent>> keyAndEvents,
    Emitter<Session> emitter) {
  final Iterator<StandardEvent> events = keyAndEvents.second().iterator();
  if (!events.hasNext()) {
    return;
  }

  // Initialize the values needed to create a session for this group
  final StandardEvent firstEvent = events.next();
  long startTime = firstEvent.getTimestamp();
  long endTime = firstEvent.getTimestamp();
  int numEvents = 1;

  // Inspect each event and keep track of start time, end time, and count
  while (events.hasNext()) {
    final StandardEvent event = events.next();
    startTime = Math.min(startTime, event.getTimestamp());
    endTime = Math.max(endTime, event.getTimestamp());
    numEvents += 1;
  }

  // Create a session. Use the first event for fields that do not change
  emitter.emit(Session.newBuilder()             // same on all events:
      .setUserId(firstEvent.getUserId())        // the user id (grouped by)
      .setSessionId(firstEvent.getSessionId())  // session id (grouped by)
      .setIp(firstEvent.getIp())                // the source IP address
      .setStartTimestamp(startTime)
      .setDuration(endTime - startTime)
      .setSessionEventCount(numEvents)
      .build());
}
 
Example #5
Source File: CrunchDatasets.java    From kite with Apache License 2.0
@Override
public Pair<GenericData.Record, Integer> map(E entity) {
  // Round-robin marker: spreads the entities that share one storage key
  // across numPartitionWriters groups, and therefore across that many writers.
  int marker = count % numPartitionWriters;
  count += 1;
  return Pair.<GenericData.Record, Integer>of(key.reuseFor(entity, provided, accessor), marker);
}
 
Example #6
Source File: CrunchDatasets.java    From kite with Apache License 2.0
@Override
public void process(E entity, Emitter<Pair<E, Void>> emitter) {
  emitter.emit(Pair.of(entity, (Void) null));
}
 
Example #7
Source File: CrunchDatasets.java    From kite with Apache License 2.0
/**
 * Partitions {@code collection} to be stored efficiently in {@code View}.
 * <p>
 * This restructures the parallel collection so that all of the entities that
 * will be stored in a given partition will be evenly distributed across a specified
 * {@code numPartitionWriters}.
 * <p>
 * If the dataset is not partitioned, then this will structure all of the
 * entities to produce a number of files equal to {@code numWriters}.
 *
 * @param collection a collection of entities
 * @param view a {@link View} of a dataset to partition the collection for
 * @param numWriters the number of writers that should be used
 * @param numPartitionWriters the number of writers data for a single partition will be distributed across
 * @param <E> the type of entities in the collection and underlying dataset
 * @return an equivalent collection of entities partitioned for the view
 * @see #partition(PCollection, View)
 *
 * @since 1.1.0
 */
public static <E> PCollection<E> partition(PCollection<E> collection,
                                           View<E> view,
                                           int numWriters, int numPartitionWriters) {
  //ensure the number of writers is honored whether it is per partition or total.
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  if (descriptor.isPartitioned()) {
    GetStorageKey<E> getKey = new GetStorageKey<E>(view, numPartitionWriters);
    PTable<Pair<GenericData.Record, Integer>, E> table = collection
        .by(getKey, Avros.pairs(Avros.generics(getKey.schema()), Avros.ints()));
    PGroupedTable<Pair<GenericData.Record, Integer>, E> grouped =
        numWriters > 0 ? table.groupByKey(numWriters) : table.groupByKey();
    return grouped.ungroup().values();
  } else {
    return partition(collection, numWriters);
  }
}
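A hedged usage sketch of the helper above: given a Crunch PCollection of Avro generic records and the target Kite view, a caller repartitions the collection before writing it. The variable names and writer counts are assumptions for illustration, not code from the Kite project.

// Assume 'events' is a PCollection<GenericData.Record> already loaded in a Crunch pipeline
// and 'view' is the View<GenericData.Record> the data will be written to (both illustrative).
// Ask for ten writers in total, spreading the data of each dataset partition across two of them.
PCollection<GenericData.Record> repartitioned =
    CrunchDatasets.partition(events, view, 10, 2);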
 
Example #8
Source File: ThriftConverter.java    From hdfs2cass with Apache License 2.0
@Override
public Collection<Mutation> outputValue(final Pair<ByteBuffer, Collection<Mutation>> value) {
  return value.second();
}
 
Example #9
Source File: ThriftConverter.java    From hdfs2cass with Apache License 2.0
@Override
public ByteBuffer outputKey(final Pair<ByteBuffer, Collection<Mutation>> value) {
  return value.first();
}
 
Example #10
Source File: ThriftConverter.java    From hdfs2cass with Apache License 2.0
@Override
public Pair<ByteBuffer, Iterable<Collection<Mutation>>> convertIterableInput(final ByteBuffer key, final Iterable<Collection<Mutation>> value) {
  return Pair.of(key, value);
}
 
Example #11
Source File: ThriftConverter.java    From hdfs2cass with Apache License 2.0
@Override
public Pair<ByteBuffer, Collection<Mutation>> convertInput(final ByteBuffer key, final Collection<Mutation> value) {
  return Pair.of(key, value);
}
 
Example #12
Source File: ThriftRecord.java    From hdfs2cass with Apache License 2.0
@Override
public Pair<ByteBuffer, Collection<Mutation>> map(final ThriftRecord input) {
  return input.asPair();
}
 
Example #13
Source File: ThriftRecord.java    From hdfs2cass with Apache License 2.0
public Pair<ByteBuffer, Collection<Mutation>> asPair() {
  Collection<Mutation> collection = values;
  return Pair.of(key, collection);
}
 
Example #14
Source File: CQLConverter.java    From hdfs2cass with Apache License 2.0
@Override
public CQLRecord outputValue(final Pair<ByteBuffer, CQLRecord> value) {
  return value.second();
}
 
Example #15
Source File: CQLConverter.java    From hdfs2cass with Apache License 2.0
@Override
public ByteBuffer outputKey(final Pair<ByteBuffer, CQLRecord> value) {
  return value.first();
}
 
Example #16
Source File: CQLConverter.java    From hdfs2cass with Apache License 2.0
@Override
public Pair<ByteBuffer, Iterable<CQLRecord>> convertIterableInput(
    final ByteBuffer k,
    final Iterable<CQLRecord> v) {
  return Pair.of(k, v);
}
 
Example #17
Source File: CQLConverter.java    From hdfs2cass with Apache License 2.0
@Override
public Pair<ByteBuffer, CQLRecord> convertInput(final ByteBuffer k, final CQLRecord v) {
  return Pair.of(k, v);
}
 
Example #18
Source File: JoinFilterExampleCrunch.java    From hadoop-arch-book with Apache License 2.0
@Override
public boolean accept(Pair<Long, Pair<Pair<Long, Integer>, Integer>> input) {
  // input is (joinKey, ((fooId, fooValue), barValue)); keep rows whose
  // fooValue + barValue is within joinValMax.
  return input.second().first().second() + input.second().second() <= joinValMax;
}
 
Example #19
Source File: JoinFilterExampleCrunch.java    From hadoop-arch-book with Apache License 2.0
@Override
public boolean accept(Pair<Long, Pair<Long, Integer>> input) {
  // input is (fooBarId, (fooId, fooValue)); keep rows whose fooValue is within fooValMax.
  return input.second().second() <= fooValMax;
}