org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple Java Examples

The following examples show how to use org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple. Each example is drawn from an open-source project; the source file, project, and license are noted above the code.
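Before the project examples, here is a minimal sketch of the common pattern, assuming two keyed inputs; the class, method, tag names, and key/value types below are placeholders, not taken from any of the projects listed. Tag each keyed PCollection, bundle the tagged collections into a KeyedPCollectionTuple, apply CoGroupByKey, and then read each input's values back out of the CoGbkResult by its TupleTag.

import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.join.CoGbkResult;
import org.apache.beam.sdk.transforms.join.CoGroupByKey;
import org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TupleTag;

public class KeyedPCollectionTupleSketch {

  // Placeholder method: joins two keyed inputs and formats each group as a string.
  static PCollection<String> joinByKey(
      PCollection<KV<Long, String>> left, PCollection<KV<Long, String>> right) {

    final TupleTag<String> leftTag = new TupleTag<>();
    final TupleTag<String> rightTag = new TupleTag<>();

    // Tag each keyed PCollection, then co-group all values that share a key.
    PCollection<KV<Long, CoGbkResult>> grouped =
        KeyedPCollectionTuple.of(leftTag, left)
            .and(rightTag, right)
            .apply(CoGroupByKey.create());

    // Pull each input's values back out of the CoGbkResult by its TupleTag.
    return grouped.apply(
        ParDo.of(
            new DoFn<KV<Long, CoGbkResult>, String>() {
              @ProcessElement
              public void processElement(ProcessContext c) {
                Iterable<String> leftValues = c.element().getValue().getAll(leftTag);
                Iterable<String> rightValues = c.element().getValue().getAll(rightTag);
                c.output(c.element().getKey() + ": " + leftValues + " | " + rightValues);
              }
            }));
  }
}

Any number of additional inputs can be appended with further .and(tag, collection) calls, as long as every input is keyed by the same key type with a compatible key coder.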
Example #1
Source File: MusicBrainzTransforms.java    From bigquery-etl-dataflow-sample with Apache License 2.0
private static PCollection<KV<Long, CoGbkResult>> group(String name,
                                                        PCollection<KV<Long, MusicBrainzDataObject>> first,
                                                        PCollection<KV<Long, MusicBrainzDataObject>> second,
                                                        TupleTag<MusicBrainzDataObject> firstTag,
                                                        TupleTag<MusicBrainzDataObject> secondTag
) {
  final CoGroupByKey<Long> grouper = CoGroupByKey.create();

  PCollection<KV<Long, CoGbkResult>> joinedResult;

  try {
    // Tag both inputs, bundle them into a KeyedPCollectionTuple, and co-group by key.
    joinedResult = KeyedPCollectionTuple
                    .of(firstTag, first)
                    .and(secondTag, second)
                    .apply("joinResult_" + name, grouper);
  } catch (Exception e) {
    logger.error("exception grouping.", e);
    return null;
  }
  return joinedResult;
}
 
Example #2
Source File: MultinomialLogisticRegression.java    From nemo with Apache License 2.0
@Override
public PCollection<KV<Integer, List<Double>>> expand(final PCollection<KV<Integer, List<Double>>> model) {
  // Model as a view.
  final PCollectionView<Map<Integer, List<Double>>> modelView = model.apply(View.asMap());

  // Find gradient.
  final PCollection<KV<Integer, List<Double>>> gradient = readInput
      .apply(ParDo.of(
          new CalculateGradient(modelView, numClasses, numFeatures)).withSideInputs(modelView))
      .apply(Combine.perKey(new CombineFunction()));

  // Tags for CoGroupByKey.
  final TupleTag<List<Double>> gradientTag = new TupleTag<>();
  final TupleTag<List<Double>> modelTag = new TupleTag<>();
  final KeyedPCollectionTuple<Integer> coGbkInput = KeyedPCollectionTuple
      .of(gradientTag, gradient)
      .and(modelTag, model);

  final PCollection<KV<Integer, CoGbkResult>> groupResult =
      coGbkInput.apply(CoGroupByKey.create());

  // Update the model
  return groupResult
      .apply(ParDo.of(new ApplyGradient(numFeatures, numClasses, iterationNum, gradientTag, modelTag)));
}
 
Example #3
Source File: VerifyBamId.java    From dataflow-java with Apache License 2.0
/**
 * Filter, pile up, and sample reads, then join against reference statistics.
 *
 * @param reads A PCollection of reads
 * @param samplingFraction Fraction of reads to keep
 * @param samplingPrefix A prefix used in generating hashes used in sampling
 * @param refFreq A PCollection mapping position to allele frequencies in
 *   a reference population.
 * @return A PCollection mapping Position to a ReadCounts proto
 */
static PCollection<KV<Position, ReadCounts>> combineReads(PCollection<Read> reads,
    double samplingFraction, String samplingPrefix,
    PCollection<KV<Position, AlleleFreq>> refFreq) {
  // Runs filters on input Reads, splits into individual aligned bases (emitting the
  // base and quality) and grabs a sample of them based on a hash mod of Position.
  PCollection<KV<Position, ReadBaseQuality>> joinReadCounts =
      reads.apply("IsOnChromosome", Filter.by(ReadFunctions.IS_ON_CHROMOSOME))
      .apply("IsNotQCFailure", Filter.by(ReadFunctions.IS_NOT_QC_FAILURE))
      .apply("IsNotDuplicate", Filter.by(ReadFunctions.IS_NOT_DUPLICATE))
      .apply("IsProperPlacement", Filter.by(ReadFunctions.IS_PROPER_PLACEMENT))
      .apply(ParDo.of(new SplitReads()))
      .apply(Filter.by(new SampleReads(samplingFraction, samplingPrefix)));

  TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>();
  TupleTag<AlleleFreq> refFreqTag = new TupleTag<>();
  // Pile up read counts, then join against reference stats.
  PCollection<KV<Position, CoGbkResult>> joined = KeyedPCollectionTuple
      .of(readCountsTag, joinReadCounts)
      .and(refFreqTag, refFreq)
      .apply(CoGroupByKey.<Position>create());
  return joined.apply(ParDo.of(new PileupAndJoinReads(readCountsTag, refFreqTag)));
}
 
Example #4
Source File: JoinTranslator.java    From beam with Apache License 2.0
@Override
PCollection<KV<KeyT, OutputT>> translate(
    Join<LeftT, RightT, KeyT, OutputT> operator,
    PCollection<LeftT> left,
    PCollection<KV<KeyT, LeftT>> leftKeyed,
    PCollection<RightT> right,
    PCollection<KV<KeyT, RightT>> rightKeyed) {
  final AccumulatorProvider accumulators =
      new LazyAccumulatorProvider(AccumulatorProvider.of(leftKeyed.getPipeline()));
  final TupleTag<LeftT> leftTag = new TupleTag<>();
  final TupleTag<RightT> rightTag = new TupleTag<>();
  final JoinFn<LeftT, RightT, KeyT, OutputT> joinFn =
      getJoinFn(operator, leftTag, rightTag, accumulators);
  return KeyedPCollectionTuple.of(leftTag, leftKeyed)
      .and(rightTag, rightKeyed)
      .apply("co-group-by-key", CoGroupByKey.create())
      .apply(joinFn.getFnName(), ParDo.of(joinFn));
}
 
Example #5
Source File: ValidateRunnerXlangTest.java    From beam with Apache License 2.0
@Test
@Category({ValidatesRunner.class, UsesCrossLanguageTransforms.class})
public void coGroupByKeyTest() {
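  // Run CoGroupByKey as a cross-language transform via the expansion service, then verify the grouped output.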
  PCollection<KV<Long, String>> col1 =
      testPipeline.apply("createCol1", Create.of(KV.of(0L, "1"), KV.of(0L, "2"), KV.of(1L, "3")));
  PCollection<KV<Long, String>> col2 =
      testPipeline.apply("createCol2", Create.of(KV.of(0L, "4"), KV.of(1L, "5"), KV.of(1L, "6")));
  PCollection<KV<Long, Iterable<String>>> cgbkCol =
      KeyedPCollectionTuple.of("col1", col1)
          .and("col2", col2)
          .apply(External.of(TEST_CGBK_URN, new byte[] {}, expansionAddr));
  PCollection<String> col =
      cgbkCol.apply(
          MapElements.into(TypeDescriptors.strings())
              .via(
                  (KV<Long, Iterable<String>> kv) -> {
                    String[] values = Iterables.toArray(kv.getValue(), String.class);
                    Arrays.sort(values);
                    return String.format("%s:%s", kv.getKey(), String.join(",", values));
                  }));
  PAssert.that(col).containsInAnyOrder("0:1,2,4", "1:3,5,6");
}
 
Example #6
Source File: CoGroup.java    From beam with Apache License 2.0
private JoinInformation(
    KeyedPCollectionTuple<Row> keyedPCollectionTuple,
    Map<String, PCollectionView<Map<Row, Iterable<Row>>>> sideInputs,
    Schema keySchema,
    Map<String, Schema> componentSchemas,
    Map<Integer, SerializableFunction<Object, Row>> toRows,
    List<String> sortedTags,
    Map<Integer, String> tagToKeyedTag) {
  this.keyedPCollectionTuple = keyedPCollectionTuple;
  this.sideInputs = sideInputs;
  this.keySchema = keySchema;
  this.componentSchemas = componentSchemas;
  this.toRows = toRows;
  this.sortedTags = sortedTags;
  this.tagToKeyedTag = tagToKeyedTag;
}
 
Example #7
Source File: TestExpansionService.java    From beam with Apache License 2.0
@Override
public PCollection<KV<Long, Iterable<String>>> expand(KeyedPCollectionTuple<Long> input) {
  Set<String> tagSet = ImmutableSet.of("col1", "col2");
  return input
      .apply(CoGroupByKey.create())
      .apply(
          ParDo.of(
              new DoFn<KV<Long, CoGbkResult>, KV<Long, Iterable<String>>>() {
                @ProcessElement
                public void processElement(
                    @Element KV<Long, CoGbkResult> kv,
                    OutputReceiver<KV<Long, Iterable<String>>> out) {
                  Iterable<String> iter =
                      () ->
                          tagSet.stream()
                              .flatMap(
                                  (String t) ->
                                      StreamSupport.stream(
                                          kv.getValue().<String>getAll(t).spliterator(),
                                          false))
                              .iterator();
                  out.output(KV.of(kv.getKey(), iter));
                }
              }));
}
 
Example #8
Source File: MultinomialLogisticRegression.java    From incubator-nemo with Apache License 2.0
@Override
public PCollection<KV<Integer, List<Double>>> expand(final PCollection<KV<Integer, List<Double>>> model) {
  // Model as a view.
  final PCollectionView<Map<Integer, List<Double>>> modelView = model.apply(View.asMap());

  // Find gradient.
  final PCollection<KV<Integer, List<Double>>> gradient = readInput
    .apply(ParDo.of(
      new CalculateGradient(modelView, numClasses, numFeatures)).withSideInputs(modelView))
    .apply(Combine.perKey(new CombineFunction()));

  // Tags for CoGroupByKey.
  final TupleTag<List<Double>> gradientTag = new TupleTag<>();
  final TupleTag<List<Double>> modelTag = new TupleTag<>();
  final KeyedPCollectionTuple<Integer> coGbkInput = KeyedPCollectionTuple
    .of(gradientTag, gradient)
    .and(modelTag, model);

  final PCollection<KV<Integer, CoGbkResult>> groupResult =
    coGbkInput.apply(CoGroupByKey.create());

  // Update the model
  return groupResult
    .apply(ParDo.of(new ApplyGradient(numFeatures, numClasses, iterationNum, gradientTag, modelTag)));
}
 
Example #9
Source File: TestExpansionService.java    From beam with Apache License 2.0
@Override
public KeyedPCollectionTuple<Long> createInput(
    Pipeline p, Map<String, PCollection<?>> inputs) {
  KeyedPCollectionTuple inputTuple = KeyedPCollectionTuple.empty(p);
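  // Add each named input to the tuple under a TupleTag whose id is the input's map key.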
  for (Map.Entry<String, PCollection<?>> entry : inputs.entrySet()) {
    inputTuple = inputTuple.and(new TupleTag(entry.getKey()), entry.getValue());
  }
  return inputTuple;
}
 
Example #10
Source File: VerifyBamIdTest.java    From dataflow-java with Apache License 2.0
@Test
public void testPileupAndJoinReadsWithChrPrefix() throws Exception {
  ReadBaseQuality srq = new ReadBaseQuality("A", 10);
  PCollection<KV<Position, ReadBaseQuality>> readCounts = p.apply(
      "createInput", Create.of(KV.of(position1chrPrefix, srq)));
  PAssert.that(readCounts).containsInAnyOrder(KV.of(position1chrPrefix, srq));

  PCollection<KV<Position, AlleleFreq>> refFreq = p.apply(Create.of(refCountList));
  PAssert.that(refFreq).containsInAnyOrder(refCountList);

  TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>();
  TupleTag<AlleleFreq> refFreqTag = new TupleTag<>();
  PCollection<KV<Position, CoGbkResult>> joined = KeyedPCollectionTuple
      .of(readCountsTag, readCounts)
      .and(refFreqTag, refFreq)
      .apply(CoGroupByKey.<Position>create());

  PCollection<KV<Position, ReadCounts>> result = joined.apply(
      ParDo.of(new PileupAndJoinReads(readCountsTag, refFreqTag)));

  KV<Position, ReadCounts> expectedResult1 = KV.of(position1, rc1);
  KV<Position, ReadCounts> expectedResult2 = KV.of(position2, rc2);
  KV<Position, ReadCounts> expectedResult3 = KV.of(position3, rc3);

  PAssert.that(result).containsInAnyOrder(expectedResult1, expectedResult2, expectedResult3);
  p.run();
}
 
Example #11
Source File: VerifyBamIdTest.java    From dataflow-java with Apache License 2.0
@Test
public void testPileupAndJoinReads() throws Exception {
  final ReadBaseQuality srq = new ReadBaseQuality("A", 10);
  PCollection<KV<Position, ReadBaseQuality>> readCounts = p.apply(
      "createInput", Create.of(KV.of(position1, srq)));
  PAssert.that(readCounts).containsInAnyOrder(KV.of(position1, srq));

  PCollection<KV<Position, AlleleFreq>> refFreq = p.apply(Create.of(refCountList));

  PAssert.that(refFreq).containsInAnyOrder(refCountList);

  final TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>();
  TupleTag<AlleleFreq> refFreqTag = new TupleTag<>();
  PCollection<KV<Position, CoGbkResult>> joined = KeyedPCollectionTuple
      .of(readCountsTag, readCounts)
      .and(refFreqTag, refFreq)
      .apply(CoGroupByKey.<Position>create());

  PCollection<KV<Position, ReadCounts>> result = joined.apply(
      ParDo.of(new PileupAndJoinReads(readCountsTag, refFreqTag)));

  KV<Position, ReadCounts> expectedResult1 = KV.of(position1, rc1);
  KV<Position, ReadCounts> expectedResult2 = KV.of(position2, rc2);
  KV<Position, ReadCounts> expectedResult3 = KV.of(position3, rc3);

  PAssert.that(result).containsInAnyOrder(expectedResult1, expectedResult2, expectedResult3);
  p.run();
}
 
Example #12
Source File: GroupWithoutRepartition.java    From beam with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public OutputT expand(InputT input) {
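  // Dispatch on the runtime input type: both PCollection and KeyedPCollectionTuple support apply().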
  if (input instanceof PCollection) {
    return (OutputT) ((PCollection) input).apply(transform);
  } else if (input instanceof KeyedPCollectionTuple) {
    return (OutputT) ((KeyedPCollectionTuple) input).apply(transform);
  } else {
    throw new RuntimeException(
        transform.getName()
            + " is not supported with "
            + GroupWithoutRepartition.class.getSimpleName());
  }
}
 
Example #13
Source File: CoGroupByKeyLoadTest.java    From beam with Apache License 2.0
@Override
void loadTest() throws IOException {
  SyntheticSourceOptions coSourceOptions =
      fromJsonString(options.getCoSourceOptions(), SyntheticSourceOptions.class);

  Optional<SyntheticStep> syntheticStep = createStep(options.getStepOptions());

  PCollection<KV<byte[], byte[]>> input =
      pipeline.apply("Read input", readFromSource(sourceOptions));
  input = input.apply("Collect start time metrics (input)", ParDo.of(runtimeMonitor));
  input = applyWindowing(input);
  input = applyStepIfPresent(input, "Synthetic step for input", syntheticStep);

  PCollection<KV<byte[], byte[]>> coInput =
      pipeline.apply("Read co-input", readFromSource(coSourceOptions));
  coInput = coInput.apply("Collect start time metrics (co-input)", ParDo.of(runtimeMonitor));
  coInput = applyWindowing(coInput, options.getCoInputWindowDurationSec());
  coInput = applyStepIfPresent(coInput, "Synthetic step for co-input", syntheticStep);

  KeyedPCollectionTuple.of(INPUT_TAG, input)
      .and(CO_INPUT_TAG, coInput)
      .apply("CoGroupByKey", CoGroupByKey.create())
      .apply("Ungroup and reiterate", ParDo.of(new UngroupAndReiterate(options.getIterations())))
      .apply(
          "Collect total bytes", ParDo.of(new ByteMonitor(METRICS_NAMESPACE, "totalBytes.count")))
      .apply("Collect end time metrics", ParDo.of(runtimeMonitor));
}
 
Example #14
Source File: Join.java    From beam with Apache License 2.0
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);

  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> coGbkResultCollection =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  return coGbkResultCollection
      .apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> e = c.element();

                  Iterable<V1> leftValuesIterable = e.getValue().getAll(v1Tuple);
                  Iterable<V2> rightValuesIterable = e.getValue().getAll(v2Tuple);

                  for (V1 leftValue : leftValuesIterable) {
                    for (V2 rightValue : rightValuesIterable) {
                      c.output(KV.of(e.getKey(), KV.of(leftValue, rightValue)));
                    }
                  }
                }
              }))
      .setCoder(
          KvCoder.of(
              ((KvCoder) leftCollection.getCoder()).getKeyCoder(),
              KvCoder.of(
                  ((KvCoder) leftCollection.getCoder()).getValueCoder(),
                  ((KvCoder) rightCollection.getCoder()).getValueCoder())));
}
 
Example #15
Source File: Task.java    From beam with Apache License 2.0
static PCollection<String> applyTransform(
    PCollection<String> fruits, PCollection<String> countries) {

  TupleTag<String> fruitsTag = new TupleTag<>();
  TupleTag<String> countriesTag = new TupleTag<>();

  MapElements<String, KV<String, String>> mapToAlphabetKv =
      MapElements.into(kvs(strings(), strings()))
          .via(word -> KV.of(word.substring(0, 1), word));

  PCollection<KV<String, String>> fruitsPColl = fruits.apply("Fruit to KV", mapToAlphabetKv);
  PCollection<KV<String, String>> countriesPColl = countries
      .apply("Country to KV", mapToAlphabetKv);

  return KeyedPCollectionTuple
      .of(fruitsTag, fruitsPColl)
      .and(countriesTag, countriesPColl)

      .apply(CoGroupByKey.create())

      .apply(ParDo.of(new DoFn<KV<String, CoGbkResult>, String>() {

        @ProcessElement
        public void processElement(
            @Element KV<String, CoGbkResult> element, OutputReceiver<String> out) {

          String alphabet = element.getKey();
          CoGbkResult coGbkResult = element.getValue();

          String fruit = coGbkResult.getOnly(fruitsTag);
          String country = coGbkResult.getOnly(countriesTag);

          out.output(new WordsAlphabet(alphabet, fruit, country).toString());
        }

      }));
}
 
Example #16
Source File: Snippets.java    From beam with Apache License 2.0
/** Using a CoGroupByKey transform. */
public static PCollection<String> coGroupByKeyTuple(
    TupleTag<String> emailsTag,
    TupleTag<String> phonesTag,
    PCollection<KV<String, String>> emails,
    PCollection<KV<String, String>> phones) {

  // [START CoGroupByKeyTuple]
  PCollection<KV<String, CoGbkResult>> results =
      KeyedPCollectionTuple.of(emailsTag, emails)
          .and(phonesTag, phones)
          .apply(CoGroupByKey.create());

  PCollection<String> contactLines =
      results.apply(
          ParDo.of(
              new DoFn<KV<String, CoGbkResult>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<String, CoGbkResult> e = c.element();
                  String name = e.getKey();
                  Iterable<String> emailsIter = e.getValue().getAll(emailsTag);
                  Iterable<String> phonesIter = e.getValue().getAll(phonesTag);
                  String formattedResult =
                      Snippets.formatCoGbkResults(name, emailsIter, phonesIter);
                  c.output(formattedResult);
                }
              }));
  // [END CoGroupByKeyTuple]
  return contactLines;
}
 
Example #17
Source File: CompareDatabases.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollection<Long> expand(PBegin begin) {

  final TupleTag<Struct> oneTag = new TupleTag<>();
  PCollection<KV<String, Struct>> rowsOne = begin.apply("Read one", new ReadAllRows(one));
  final TupleTag<Struct> twoTag = new TupleTag<>();
  PCollection<KV<String, Struct>> rowsTwo = begin.apply("Read two", new ReadAllRows(two));

  PCollection<KV<String, CoGbkResult>> cogroup =
      KeyedPCollectionTuple.of(oneTag, rowsOne).and(twoTag, rowsTwo).apply(CoGroupByKey.create());

  PCollection<String> fails =
      cogroup.apply(
          ParDo.of(
              new DoFn<KV<String, CoGbkResult>, String>() {

                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<String, CoGbkResult> element = c.element();
                  CoGbkResult gbk = element.getValue();
                  ArrayList<Struct> oneRows = Lists.newArrayList(gbk.getAll(oneTag));
                  ArrayList<Struct> twoRows = Lists.newArrayList(gbk.getAll(twoTag));

                  if (oneRows.size() != 1 || twoRows.size() != 1) {
                    c.output(element.getKey());
                    return;
                  }

                  Struct sOne = oneRows.get(0);
                  Struct sTwo = twoRows.get(0);

                  if (!sOne.equals(sTwo)) {
                    c.output(element.getKey());
                  }
                }
              }));

  return fails.apply(Count.globally());
}
 
Example #18
Source File: Query8.java    From beam with Apache License 2.0
@Override
public PCollection<IdNameReserve> expand(PCollection<Event> events) {
  // Window and key new people by their id.
  PCollection<KV<Long, Person>> personsById =
      events
          .apply(NexmarkQueryUtil.JUST_NEW_PERSONS)
          .apply(
              "Query8.WindowPersons",
              Window.into(FixedWindows.of(Duration.standardSeconds(configuration.windowSizeSec))))
          .apply("PersonById", NexmarkQueryUtil.PERSON_BY_ID);

  // Window and key new auctions by their id.
  PCollection<KV<Long, Auction>> auctionsBySeller =
      events
          .apply(NexmarkQueryUtil.JUST_NEW_AUCTIONS)
          .apply(
              "Query8.WindowAuctions",
              Window.into(FixedWindows.of(Duration.standardSeconds(configuration.windowSizeSec))))
          .apply("AuctionBySeller", NexmarkQueryUtil.AUCTION_BY_SELLER);

  // Join people and auctions and project the person id, name and auction reserve price.
  return KeyedPCollectionTuple.of(NexmarkQueryUtil.PERSON_TAG, personsById)
      .and(NexmarkQueryUtil.AUCTION_TAG, auctionsBySeller)
      .apply(CoGroupByKey.create())
      .apply(
          name + ".Select",
          ParDo.of(
              new DoFn<KV<Long, CoGbkResult>, IdNameReserve>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  @Nullable
                  Person person =
                      c.element().getValue().getOnly(NexmarkQueryUtil.PERSON_TAG, null);
                  if (person == null) {
                    // Person was not created in last window period.
                    return;
                  }
                  for (Auction auction :
                      c.element().getValue().getAll(NexmarkQueryUtil.AUCTION_TAG)) {
                    c.output(new IdNameReserve(person.id, person.name, auction.reserve));
                  }
                }
              }));
}
 
Example #19
Source File: WinningBids.java    From beam with Apache License 2.0
@Override
public PCollection<AuctionBid> expand(PCollection<Event> events) {
  // Window auctions and bids into custom auction windows. New people events will be discarded.
  // This will allow us to bring bids and auctions together irrespective of how long
  // each auction is open for.
  events = events.apply("Window", Window.into(auctionOrBidWindowFn));

  // Key auctions by their id.
  PCollection<KV<Long, Auction>> auctionsById =
      events
          .apply(NexmarkQueryUtil.JUST_NEW_AUCTIONS)
          .apply("AuctionById:", NexmarkQueryUtil.AUCTION_BY_ID);

  // Key bids by their auction id.
  PCollection<KV<Long, Bid>> bidsByAuctionId =
      events
          .apply(NexmarkQueryUtil.JUST_BIDS)
          .apply("BidByAuction", NexmarkQueryUtil.BID_BY_AUCTION);

  // Find the highest price valid bid for each closed auction.
  return
  // Join auctions and bids.
  KeyedPCollectionTuple.of(NexmarkQueryUtil.AUCTION_TAG, auctionsById)
      .and(NexmarkQueryUtil.BID_TAG, bidsByAuctionId)
      .apply(CoGroupByKey.create())
      // Filter and select.
      .apply(
          name + ".Join",
          ParDo.of(
              new DoFn<KV<Long, CoGbkResult>, AuctionBid>() {
                private final Counter noAuctionCounter = Metrics.counter(name, "noAuction");
                private final Counter underReserveCounter = Metrics.counter(name, "underReserve");
                private final Counter noValidBidsCounter = Metrics.counter(name, "noValidBids");

                @ProcessElement
                public void processElement(ProcessContext c) {
                  @Nullable
                  Auction auction =
                      c.element().getValue().getOnly(NexmarkQueryUtil.AUCTION_TAG, null);
                  if (auction == null) {
                    // We have bids without a matching auction. Give up.
                    noAuctionCounter.inc();
                    return;
                  }
                  // Find the current winning bid for auction.
                  // The earliest bid with the maximum price above the reserve wins.
                  Bid bestBid = null;
                  for (Bid bid : c.element().getValue().getAll(NexmarkQueryUtil.BID_TAG)) {
                    // Bids too late for their auction will have been
                    // filtered out by the window merge function.
                    checkState(bid.dateTime.compareTo(auction.expires) < 0);
                    if (bid.price < auction.reserve) {
                      // Bid price is below auction reserve.
                      underReserveCounter.inc();
                      continue;
                    }

                    if (bestBid == null
                        || Bid.PRICE_THEN_DESCENDING_TIME.compare(bid, bestBid) > 0) {
                      bestBid = bid;
                    }
                  }
                  if (bestBid == null) {
                    // We don't have any valid bids for auction.
                    noValidBidsCounter.inc();
                    return;
                  }
                  c.output(new AuctionBid(auction, bestBid));
                }
              }));
}
 
Example #20
Source File: TestExpansionService.java    From beam with Apache License 2.0
@Override
public PTransform<KeyedPCollectionTuple<Long>, PCollection<KV<Long, Iterable<String>>>>
    getTransform(RunnerApi.FunctionSpec spec) {
  return new TestCoGroupByKeyTransform();
}
 
Example #21
Source File: Join.java    From beam with Apache License 2.0
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);
  checkNotNull(nullValue);

  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> coGbkResultCollection =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  return coGbkResultCollection
      .apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> e = c.element();

                  Iterable<V1> leftValuesIterable = e.getValue().getAll(v1Tuple);
                  Iterable<V2> rightValuesIterable = e.getValue().getAll(v2Tuple);

                  for (V2 rightValue : rightValuesIterable) {
                    if (leftValuesIterable.iterator().hasNext()) {
                      for (V1 leftValue : leftValuesIterable) {
                        c.output(KV.of(e.getKey(), KV.of(leftValue, rightValue)));
                      }
                    } else {
                      c.output(KV.of(e.getKey(), KV.of(nullValue, rightValue)));
                    }
                  }
                }
              }))
      .setCoder(
          KvCoder.of(
              ((KvCoder) leftCollection.getCoder()).getKeyCoder(),
              KvCoder.of(
                  ((KvCoder) leftCollection.getCoder()).getValueCoder(),
                  ((KvCoder) rightCollection.getCoder()).getValueCoder())));
}
 
Example #22
Source File: CoGroup.java    From beam with Apache License 2.0
private static JoinInformation from(
    PCollectionTuple input,
    Function<String, FieldAccessDescriptor> getFieldAccessDescriptor,
    Function<String, Boolean> getIsSideInput) {
  KeyedPCollectionTuple<Row> keyedPCollectionTuple =
      KeyedPCollectionTuple.empty(input.getPipeline());

  List<String> sortedTags =
      input.getAll().keySet().stream()
          .map(TupleTag::getId)
          .sorted()
          .collect(Collectors.toList());

  // Keep this in a TreeMap so that it's sorted. This way we get a deterministic output
  // schema.
  TreeMap<String, Schema> componentSchemas = Maps.newTreeMap();
  Map<Integer, SerializableFunction<Object, Row>> toRows = Maps.newHashMap();

  Map<String, PCollectionView<Map<Row, Iterable<Row>>>> sideInputs = Maps.newHashMap();
  Map<Integer, String> tagToKeyedTag = Maps.newHashMap();
  Schema keySchema = null;
  for (Map.Entry<TupleTag<?>, PCollection<?>> entry : input.getAll().entrySet()) {
    String tag = entry.getKey().getId();
    int tagIndex = sortedTags.indexOf(tag);
    PCollection<?> pc = entry.getValue();
    Schema schema = pc.getSchema();
    componentSchemas.put(tag, schema);
    toRows.put(tagIndex, (SerializableFunction<Object, Row>) pc.getToRowFunction());
    FieldAccessDescriptor fieldAccessDescriptor = getFieldAccessDescriptor.apply(tag);
    if (fieldAccessDescriptor == null) {
      throw new IllegalStateException("No fields were set for input " + tag);
    }
    // Resolve the key schema, keeping the fields in the order specified by the user.
    // Otherwise, if different field names are specified for different PCollections, they
    // might not match up.
    // The key schema contains the field names from the first PCollection specified.
    FieldAccessDescriptor resolved = fieldAccessDescriptor.resolve(schema);
    Schema currentKeySchema = SelectHelpers.getOutputSchema(schema, resolved);
    if (keySchema == null) {
      keySchema = currentKeySchema;
    } else {
      keySchema = SchemaUtils.mergeWideningNullable(keySchema, currentKeySchema);
    }

    // Create a new tag for the output.
    TupleTag randomTag = new TupleTag<>();
    String keyedTag = tag + "_" + randomTag;
    tagToKeyedTag.put(tagIndex, keyedTag);
    PCollection<KV<Row, Row>> keyedPCollection =
        extractKey(pc, schema, keySchema, resolved, tag);
    if (getIsSideInput.apply(tag)) {
      sideInputs.put(
          keyedTag, keyedPCollection.apply("computeSideInputView" + tag, View.asMultimap()));
    } else {
      keyedPCollectionTuple = keyedPCollectionTuple.and(keyedTag, keyedPCollection);
    }
  }
  return new JoinInformation(
      keyedPCollectionTuple,
      sideInputs,
      keySchema,
      componentSchemas,
      toRows,
      sortedTags,
      tagToKeyedTag);
}
 
Example #23
Source File: Join.java    From beam with Apache License 2.0
@Override
public PCollection<KV<K, KV<V1, V2>>> expand(PCollection<KV<K, V1>> leftCollection) {
  checkNotNull(leftCollection);
  checkNotNull(rightCollection);
  checkNotNull(nullValue);
  final TupleTag<V1> v1Tuple = new TupleTag<>();
  final TupleTag<V2> v2Tuple = new TupleTag<>();

  PCollection<KV<K, CoGbkResult>> coGbkResultCollection =
      KeyedPCollectionTuple.of(v1Tuple, leftCollection)
          .and(v2Tuple, rightCollection)
          .apply("CoGBK", CoGroupByKey.create());

  return coGbkResultCollection
      .apply(
          "Join",
          ParDo.of(
              new DoFn<KV<K, CoGbkResult>, KV<K, KV<V1, V2>>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<K, CoGbkResult> e = c.element();

                  Iterable<V1> leftValuesIterable = e.getValue().getAll(v1Tuple);
                  Iterable<V2> rightValuesIterable = e.getValue().getAll(v2Tuple);

                  for (V1 leftValue : leftValuesIterable) {
                    if (rightValuesIterable.iterator().hasNext()) {
                      for (V2 rightValue : rightValuesIterable) {
                        c.output(KV.of(e.getKey(), KV.of(leftValue, rightValue)));
                      }
                    } else {
                      c.output(KV.of(e.getKey(), KV.of(leftValue, nullValue)));
                    }
                  }
                }
              }))
      .setCoder(
          KvCoder.of(
              ((KvCoder) leftCollection.getCoder()).getKeyCoder(),
              KvCoder.of(
                  ((KvCoder) leftCollection.getCoder()).getValueCoder(),
                  ((KvCoder) rightCollection.getCoder()).getValueCoder())));
}