Java Code Examples for org.apache.beam.sdk.values.PCollectionTuple#apply()

The following examples show how to use org.apache.beam.sdk.values.PCollectionTuple#apply(). The source file and project each example was taken from are noted above it.
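Before the examples, here is a minimal sketch of the pattern they all share: build a PCollectionTuple from one or more tagged PCollections, then call apply() with a transform that accepts the whole tuple as input. SqlTransform is used here because most examples below use it; the schema, table name "USERS", and sample rows are illustrative assumptions, not taken from any of the projects below.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.extensions.sql.SqlTransform;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.sdk.values.TupleTag;

public class PCollectionTupleApplyExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create();

    // Illustrative schema and rows; any PCollection<Row> with a schema would do.
    Schema schema = Schema.builder().addInt32Field("id").addStringField("name").build();
    PCollection<Row> users =
        pipeline.apply(
            "CreateUsers",
            Create.of(
                    Row.withSchema(schema).addValues(1, "alice").build(),
                    Row.withSchema(schema).addValues(2, "bob").build())
                .withRowSchema(schema));

    // Tag the input; the tag id becomes the SQL table name seen by SqlTransform.
    PCollectionTuple inputs = PCollectionTuple.of(new TupleTag<Row>("USERS") {}, users);

    // PCollectionTuple#apply(name, transform) hands the entire tuple to the transform.
    PCollection<Row> result =
        inputs.apply("RunSql", SqlTransform.query("SELECT name FROM USERS WHERE id = 1"));

    pipeline.run().waitUntilFinish();
  }
}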
Example 1
Source File: BeamSqlDslJoinTest.java    From beam with Apache License 2.0
@Test
public void testRejectsGlobalWindowsWithDefaultTriggerInUnboundedInput() throws Exception {

  String sql =
      "SELECT *  "
          + "FROM ORDER_DETAILS1 o1"
          + " JOIN ORDER_DETAILS2 o2"
          + " on "
          + " o1.order_id=o2.site_id AND o2.price=o1.site_id";

  PCollection<Row> orders = ordersUnbounded();
  PCollectionTuple inputs = tuple("ORDER_DETAILS1", orders, "ORDER_DETAILS2", orders);

  thrown.expect(UnsupportedOperationException.class);
  thrown.expectMessage(
      stringContainsInOrder(Arrays.asList("once per window", "default trigger")));

  inputs.apply("sql", SqlTransform.query(sql));

  pipeline.run();
}
 
Example 2
Source File: BeamSqlDslArrayTest.java    From beam with Apache License 2.0
@Test
public void testUnnestLiteral() {
  PCollection<Row> input =
      pipeline.apply(
          "boundedInput1",
          Create.empty(TypeDescriptor.of(Row.class)).withRowSchema(INPUT_SCHEMA));

  // Because we have a multi-part FROM the DSL considers it multi-input
  TupleTag<Row> mainTag = new TupleTag<Row>("main") {};
  PCollectionTuple inputTuple = PCollectionTuple.of(mainTag, input);

  Schema resultType = Schema.builder().addStringField("f_string").build();

  PCollection<Row> result =
      inputTuple.apply(
          "sqlQuery", SqlTransform.query("SELECT * FROM UNNEST (ARRAY ['a', 'b', 'c'])"));

  PAssert.that(result)
      .containsInAnyOrder(
          Row.withSchema(resultType).addValues("a").build(),
          Row.withSchema(resultType).addValues("b").build(),
          Row.withSchema(resultType).addValues("c").build());

  pipeline.run();
}
 
Example 3
Source File: BeamSqlDslArrayTest.java    From beam with Apache License 2.0
@Test
public void testUnnestNamedLiteral() {
  PCollection<Row> input =
      pipeline.apply(
          "boundedInput1",
          Create.empty(TypeDescriptor.of(Row.class)).withRowSchema(INPUT_SCHEMA));

  // Because we have a multi-part FROM the DSL considers it multi-input
  TupleTag<Row> mainTag = new TupleTag<Row>("main") {};
  PCollectionTuple inputTuple = PCollectionTuple.of(mainTag, input);

  Schema resultType = Schema.builder().addStringField("f_string").build();

  PCollection<Row> result =
      inputTuple.apply(
          "sqlQuery",
          SqlTransform.query("SELECT * FROM UNNEST (ARRAY ['a', 'b', 'c']) AS t(f_string)"));

  PAssert.that(result)
      .containsInAnyOrder(
          Row.withSchema(resultType).addValues("a").build(),
          Row.withSchema(resultType).addValues("b").build(),
          Row.withSchema(resultType).addValues("c").build());

  pipeline.run();
}
 
Example 4
Source File: CsvConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a JSON schema.
 */
@Test
public void testLineToFailsafeJsonNoHeadersJsonSchema() {

  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(RECORD_STRING).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJson",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(",")
              .setUdfFileSystemPath(null)
              .setUdfFunctionName(null)
              .setJsonSchemaPath(TEST_JSON_SCHEMA__PATH)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<String, String> result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(JSON_STRING_RECORD)));
            return null;
          });

  pipeline.run();
}
 
Example 5
Source File: CsvConvertersTest.java    From DataflowTemplates with Apache License 2.0
/** Tests that if different headers are found an exception is thrown. */
@Test(expected = RuntimeException.class)
public void testDifferentHeaders() {

  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> headers =
      pipeline.apply("CreateInput", Create.of(HEADER_STRING, "wrong,header,thing\n"));
  PCollection<String> lines = pipeline.apply("Create lines", Create.of(RECORD_STRING));

  PCollectionTuple readCsvHeadersOut =
      PCollectionTuple.of(CSV_HEADERS, headers).and(CSV_LINES, lines);

  PCollectionTuple test =
      readCsvHeadersOut.apply(
          "TestDifferentHeaders",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(",")
              .setUdfFileSystemPath(null)
              .setUdfFunctionName(null)
              .setJsonSchemaPath(null)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .setUdfOutputTag(PROCESSING_OUT)
              .build());

  pipeline.run();
}
 
Example 6
Source File: BeamSqlDslJoinTest.java    From beam with Apache License 2.0
@Test
public void testRejectsUnboundedWithinWindowsWithEndOfWindowTrigger() throws Exception {

  String sql =
      "SELECT o1.order_id, o1.price, o1.site_id, o2.order_id, o2.price, o2.site_id  "
          + "FROM ORDER_DETAILS1 o1"
          + " JOIN ORDER_DETAILS2 o2"
          + " on "
          + " o1.order_id=o2.site_id AND o2.price=o1.site_id";

  PCollection<Row> orders =
      ordersUnbounded()
          .apply(
              "window",
              Window.<Row>into(FixedWindows.of(Duration.standardSeconds(50)))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .withAllowedLateness(Duration.ZERO)
                  .accumulatingFiredPanes());
  PCollectionTuple inputs = tuple("ORDER_DETAILS1", orders, "ORDER_DETAILS2", orders);

  thrown.expect(UnsupportedOperationException.class);
  thrown.expectMessage(
      stringContainsInOrder(Arrays.asList("once per window", "default trigger")));

  inputs.apply("sql", SqlTransform.query(sql));

  pipeline.run();
}
 
Example 7
Source File: BeamSqlDslJoinTest.java    From beam with Apache License 2.0
@Test
public void testRejectsGlobalWindowsWithEndOfWindowTrigger() throws Exception {

  String sql =
      "SELECT o1.order_id, o1.price, o1.site_id, o2.order_id, o2.price, o2.site_id  "
          + "FROM ORDER_DETAILS1 o1"
          + " JOIN ORDER_DETAILS2 o2"
          + " on "
          + " o1.order_id=o2.site_id AND o2.price=o1.site_id";

  PCollection<Row> orders =
      ordersUnbounded()
          .apply(
              "window",
              Window.<Row>into(new GlobalWindows())
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .withAllowedLateness(Duration.ZERO)
                  .accumulatingFiredPanes());
  PCollectionTuple inputs = tuple("ORDER_DETAILS1", orders, "ORDER_DETAILS2", orders);

  thrown.expect(UnsupportedOperationException.class);
  thrown.expectMessage(
      stringContainsInOrder(Arrays.asList("once per window", "default trigger")));

  inputs.apply("sql", SqlTransform.query(sql));

  pipeline.run();
}
 
Example 8
Source File: BeamSqlDslJoinTest.java    From beam with Apache License 2.0
@Test
public void testRejectsNonGlobalWindowsWithRepeatingTrigger() throws Exception {

  String sql =
      "SELECT o1.order_id, o1.price, o1.site_id, o2.order_id, o2.price, o2.site_id  "
          + "FROM ORDER_DETAILS1 o1"
          + " JOIN ORDER_DETAILS2 o2"
          + " on "
          + " o1.order_id=o2.site_id AND o2.price=o1.site_id";

  PCollection<Row> orders =
      ordersUnbounded()
          .apply(
              "window",
              Window.<Row>into(FixedWindows.of(Duration.standardSeconds(203)))
                  .triggering(Repeatedly.forever(AfterWatermark.pastEndOfWindow()))
                  .withAllowedLateness(Duration.standardMinutes(2))
                  .accumulatingFiredPanes());
  PCollectionTuple inputs = tuple("ORDER_DETAILS1", orders, "ORDER_DETAILS2", orders);

  thrown.expect(UnsupportedOperationException.class);
  thrown.expectMessage(
      stringContainsInOrder(Arrays.asList("once per window", "default trigger")));

  inputs.apply("sql", SqlTransform.query(sql));

  pipeline.run();
}
 
Example 9
Source File: BeamSqlDslArrayTest.java    From beam with Apache License 2.0
@Test
public void testUnnestCrossJoin() {
  Row row1 =
      Row.withSchema(INPUT_SCHEMA)
          .addValues(42)
          .addArray(Arrays.asList("111", "222", "333"))
          .build();

  Row row2 =
      Row.withSchema(INPUT_SCHEMA).addValues(13).addArray(Arrays.asList("444", "555")).build();

  PCollection<Row> input =
      pipeline.apply("boundedInput1", Create.of(row1, row2).withRowSchema(INPUT_SCHEMA));

  // Because we have a multi-part FROM the DSL considers it multi-input
  TupleTag<Row> mainTag = new TupleTag<Row>("main") {};
  PCollectionTuple inputTuple = PCollectionTuple.of(mainTag, input);

  Schema resultType = Schema.builder().addInt32Field("f_int").addStringField("f_string").build();

  PCollection<Row> result =
      inputTuple.apply(
          "sqlQuery",
          SqlTransform.query(
              "SELECT f_int, arrElems.f_string FROM main "
                  + " CROSS JOIN UNNEST (main.f_stringArr) AS arrElems(f_string)"));

  PAssert.that(result)
      .containsInAnyOrder(
          Row.withSchema(resultType).addValues(42, "111").build(),
          Row.withSchema(resultType).addValues(42, "222").build(),
          Row.withSchema(resultType).addValues(42, "333").build(),
          Row.withSchema(resultType).addValues(13, "444").build(),
          Row.withSchema(resultType).addValues(13, "555").build());

  pipeline.run();
}
 
Example 10
Source File: PipelineTest.java    From beam with Apache License 2.0
/** Tests that Pipeline supports pulling an element out of a tuple as a transform. */
@Test
@Category(ValidatesRunner.class)
public void testTupleProjectionTransform() throws Exception {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3, 4));

  TupleTag<Integer> tag = new TupleTag<>();
  PCollectionTuple tuple = PCollectionTuple.of(tag, input);

  PCollection<Integer> output = tuple.apply("ProjectTag", new TupleProjectionTransform<>(tag));

  PAssert.that(output).containsInAnyOrder(1, 2, 3, 4);
  pipeline.run();
}
 
Example 11
Source File: CsvConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a Javascript Udf. Udf processing is handled by {@link
 * JavascriptTextTransformer}.
 */
@Test
public void testLineToFailsafeJsonNoHeadersUdf() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(RECORD_STRING).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  CsvConverters.CsvPipelineOptions options =
      PipelineOptionsFactory.create().as(CsvConverters.CsvPipelineOptions.class);

  options.setDelimiter(",");
  options.setJavascriptTextTransformGcsPath(TRANSFORM_FILE_PATH);
  options.setJavascriptTextTransformFunctionName("transform");

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJsonNoHeadersUdf",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(options.getDelimiter())
              .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
              .setJsonSchemaPath(options.getJsonSchemaPath())
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<String, String> result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(JSON_STRING_RECORD)));
            return null;
          });

  PAssert.that(failsafe.get(PROCESSING_DEADLETTER_OUT)).empty();

  pipeline.run();
}
 
Example 12
Source File: CsvConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a Javascript Udf. Udf processing is handled by {@link
 * JavascriptTextTransformer}. Should output record to deadletter table tag.
 */
@Test
public void testLineToFailsafeJsonNoHeadersUdfDeadletter() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(BAD_JSON_STRING_RECORD).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  CsvConverters.CsvPipelineOptions options =
      PipelineOptionsFactory.create().as(CsvConverters.CsvPipelineOptions.class);

  options.setDelimiter(",");
  options.setJavascriptTextTransformGcsPath(SCRIPT_PARSE_EXCEPTION_FILE_PATH);
  options.setJavascriptTextTransformFunctionName("transform");

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJsonNoHeadersUdfBad",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(options.getDelimiter())
              .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
              .setJsonSchemaPath(null)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT)).empty();
  PAssert.that(failsafe.get(PROCESSING_DEADLETTER_OUT))
      .satisfies(
          collection -> {
            FailsafeElement result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(BAD_JSON_STRING_RECORD)));
            return null;
          });

  pipeline.run();
}
 
Example 13
Source File: CsvConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using the headers of the Csv file.
 */
@Test
public void testLineToFailsafeJsonHeaders() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply("Create lines", Create.of(RECORD_STRING).withCoder(StringUtf8Coder.of()));

  PCollection<String> header =
      pipeline.apply("Create headers", Create.of(HEADER_STRING).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines).and(CSV_HEADERS, header);

  CsvConverters.CsvPipelineOptions options =
      PipelineOptionsFactory.create().as(CsvConverters.CsvPipelineOptions.class);

  options.setDelimiter(",");
  options.setJavascriptTextTransformFunctionName(null);
  options.setJavascriptTextTransformGcsPath(null);
  options.setJsonSchemaPath(null);

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJsonHeaders",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(options.getDelimiter())
              .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
              .setJsonSchemaPath(options.getJsonSchemaPath())
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<String, String> result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(JSON_STRINGS_RECORD)));
            return null;
          });

  PAssert.that(failsafe.get(PROCESSING_DEADLETTER_OUT)).empty();

  pipeline.run();
}
 
Example 14
Source File: BigQueryToDatastore.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs a pipeline which reads data from BigQuery and writes it to Datastore.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {

  BigQueryToDatastoreOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigQueryToDatastoreOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  // Read from BigQuery and convert data to Datastore Entity format with 2 possible outcomes,
  // success or failure, based on the possibility to create valid Entity keys from BQ data
  TupleTag<Entity> successTag = new TupleTag<Entity>() {};
  TupleTag<String> failureTag = new TupleTag<String>("failures") {};
  PCollectionTuple entities =
      pipeline.apply(
          BigQueryToEntity.newBuilder()
              .setQuery(options.getReadQuery())
              .setUniqueNameColumn(options.getReadIdColumn())
              .setEntityKind(options.getDatastoreWriteEntityKind())
              .setNamespace(options.getDatastoreWriteNamespace())
              .setSuccessTag(successTag)
              .setFailureTag(failureTag)
              .build());

  // Write on GCS data that could not be converted to valid Datastore entities
  entities.apply(
      LogErrors.newBuilder()
          .setErrorWritePath(options.getInvalidOutputPath())
          .setErrorTag(failureTag)
          .build());

  // Write valid entities to Datastore
  TupleTag<String> errorTag = new TupleTag<String>("errors") {};
  entities
      .get(successTag)
      .apply(
          WriteEntities.newBuilder()
              .setProjectId(options.getDatastoreWriteProjectId())
              .setErrorTag(errorTag)
              .build())
      .apply(
          LogErrors.newBuilder()
              .setErrorWritePath(options.getErrorWritePath())
              .setErrorTag(errorTag)
              .build());

  pipeline.run();
}
 
Example 15
Source File: Join.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection lhs) {
  FieldsEqual.Impl resolvedPredicate = predicate.resolve(lhs.getSchema(), rhs.getSchema());
  PCollectionTuple tuple = PCollectionTuple.of(LHS_TAG, lhs).and(RHS_TAG, rhs);
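  // Each join type maps to a CoGroup configuration over the two tagged inputs:
  // withOptionalParticipation() marks the side whose rows may be absent (outer joins),
  // and withSideInput() broadcasts the right-hand side for the *_BROADCAST variants.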
  switch (joinType) {
    case INNER:
      return tuple.apply(
          CoGroup.join(LHS_TAG, CoGroup.By.fieldAccessDescriptor(resolvedPredicate.lhs))
              .join(RHS_TAG, CoGroup.By.fieldAccessDescriptor(resolvedPredicate.rhs))
              .crossProductJoin());
    case INNER_BROADCAST:
      return tuple.apply(
          CoGroup.join(LHS_TAG, CoGroup.By.fieldAccessDescriptor(resolvedPredicate.lhs))
              .join(
                  RHS_TAG,
                  CoGroup.By.fieldAccessDescriptor(resolvedPredicate.rhs).withSideInput())
              .crossProductJoin());
    case OUTER:
      return tuple.apply(
          CoGroup.join(
                  LHS_TAG,
                  CoGroup.By.fieldAccessDescriptor(resolvedPredicate.lhs)
                      .withOptionalParticipation())
              .join(
                  RHS_TAG,
                  CoGroup.By.fieldAccessDescriptor(resolvedPredicate.rhs)
                      .withOptionalParticipation())
              .crossProductJoin());
    case LEFT_OUTER:
      return tuple.apply(
          CoGroup.join(LHS_TAG, CoGroup.By.fieldAccessDescriptor(resolvedPredicate.lhs))
              .join(
                  RHS_TAG,
                  CoGroup.By.fieldAccessDescriptor(resolvedPredicate.rhs)
                      .withOptionalParticipation())
              .crossProductJoin());
    case LEFT_OUTER_BROADCAST:
      return tuple.apply(
          CoGroup.join(LHS_TAG, CoGroup.By.fieldAccessDescriptor(resolvedPredicate.lhs))
              .join(
                  RHS_TAG,
                  CoGroup.By.fieldAccessDescriptor(resolvedPredicate.rhs)
                      .withOptionalParticipation()
                      .withSideInput())
              .crossProductJoin());
    case RIGHT_OUTER:
      return tuple.apply(
          CoGroup.join(
                  LHS_TAG,
                  CoGroup.By.fieldAccessDescriptor(resolvedPredicate.lhs)
                      .withOptionalParticipation())
              .join(RHS_TAG, CoGroup.By.fieldAccessDescriptor(resolvedPredicate.rhs))
              .crossProductJoin());
    default:
      throw new RuntimeException("Unexpected join type");
  }
}