Java Code Examples for org.apache.beam.sdk.values.PCollectionTuple#of()

The following examples show how to use org.apache.beam.sdk.values.PCollectionTuple#of(). They are drawn from open-source projects; the source file, project, and license are noted above each example.
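Before the project examples, here is a minimal, self-contained sketch of the basic pattern (not taken from any of the projects below; the class and step names are made up for illustration, and running it requires a runner such as the DirectRunner on the classpath): of() starts a tuple from one tagged PCollection, and() adds further tagged collections, and get() retrieves a collection by its tag.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;

public class PCollectionTupleOfSketch {

  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create();

    PCollection<String> words = pipeline.apply("Words", Create.of("a", "b", "c"));
    PCollection<Integer> counts = pipeline.apply("Counts", Create.of(1, 2, 3));

    // Anonymous subclasses keep the element type available for coder inference.
    TupleTag<String> wordsTag = new TupleTag<String>() {};
    TupleTag<Integer> countsTag = new TupleTag<Integer>() {};

    // of() creates the tuple with one tagged PCollection; and() appends more.
    PCollectionTuple tuple = PCollectionTuple.of(wordsTag, words).and(countsTag, counts);

    // get() looks a PCollection back up by its tag, preserving the element type.
    PCollection<String> wordsAgain = tuple.get(wordsTag);

    pipeline.run().waitUntilFinish();
  }
}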
Example 1
Source File: CsvConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PBegin input) {

  if (hasHeaders()) {
    return input
        .apply("MatchFilePattern", FileIO.match().filepattern(inputFileSpec()))
        .apply("ReadMatches", FileIO.readMatches())
        .apply(
            "ReadCsvWithHeaders",
            ParDo.of(new GetCsvHeadersFn(headerTag(), lineTag(), csvFormat(), delimiter()))
                .withOutputTags(headerTag(), TupleTagList.of(lineTag())));
  }

  return PCollectionTuple.of(
      lineTag(), input.apply("ReadCsvWithoutHeaders", TextIO.read().from(inputFileSpec())));
}
 
Example 2
Source File: BeamSqlDslArrayTest.java    From beam with Apache License 2.0
@Test
public void testUnnestLiteral() {
  PCollection<Row> input =
      pipeline.apply(
          "boundedInput1",
          Create.empty(TypeDescriptor.of(Row.class)).withRowSchema(INPUT_SCHEMA));

  // Because we have a multi-part FROM the DSL considers it multi-input
  TupleTag<Row> mainTag = new TupleTag<Row>("main") {};
  PCollectionTuple inputTuple = PCollectionTuple.of(mainTag, input);

  Schema resultType = Schema.builder().addStringField("f_string").build();

  PCollection<Row> result =
      inputTuple.apply(
          "sqlQuery", SqlTransform.query("SELECT * FROM UNNEST (ARRAY ['a', 'b', 'c'])"));

  PAssert.that(result)
      .containsInAnyOrder(
          Row.withSchema(resultType).addValues("a").build(),
          Row.withSchema(resultType).addValues("b").build(),
          Row.withSchema(resultType).addValues("c").build());

  pipeline.run();
}
 
Example 3
Source File: BeamSqlDslArrayTest.java    From beam with Apache License 2.0
@Test
public void testUnnestNamedLiteral() {
  PCollection<Row> input =
      pipeline.apply(
          "boundedInput1",
          Create.empty(TypeDescriptor.of(Row.class)).withRowSchema(INPUT_SCHEMA));

  // Because we have a multi-part FROM the DSL considers it multi-input
  TupleTag<Row> mainTag = new TupleTag<Row>("main") {};
  PCollectionTuple inputTuple = PCollectionTuple.of(mainTag, input);

  Schema resultType = Schema.builder().addStringField("f_string").build();

  PCollection<Row> result =
      inputTuple.apply(
          "sqlQuery",
          SqlTransform.query("SELECT * FROM UNNEST (ARRAY ['a', 'b', 'c']) AS t(f_string)"));

  PAssert.that(result)
      .containsInAnyOrder(
          Row.withSchema(resultType).addValues("a").build(),
          Row.withSchema(resultType).addValues("b").build(),
          Row.withSchema(resultType).addValues("c").build());

  pipeline.run();
}
 
Example 4
Source File: CsvConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a JSON schema.
 */
@Test
public void testLineToFailsafeJsonNoHeadersJsonSchema() {

  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(RECORD_STRING).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJson",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(",")
              .setUdfFileSystemPath(null)
              .setUdfFunctionName(null)
              .setJsonSchemaPath(TEST_JSON_SCHEMA__PATH)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<String, String> result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(JSON_STRING_RECORD)));
            return null;
          });

  pipeline.run();
}
 
Example 5
Source File: BeamSqlDslArrayTest.java    From beam with Apache License 2.0
@Test
public void testUnnestCrossJoin() {
  Row row1 =
      Row.withSchema(INPUT_SCHEMA)
          .addValues(42)
          .addArray(Arrays.asList("111", "222", "333"))
          .build();

  Row row2 =
      Row.withSchema(INPUT_SCHEMA).addValues(13).addArray(Arrays.asList("444", "555")).build();

  PCollection<Row> input =
      pipeline.apply("boundedInput1", Create.of(row1, row2).withRowSchema(INPUT_SCHEMA));

  // Because we have a multi-part FROM the DSL considers it multi-input
  TupleTag<Row> mainTag = new TupleTag<Row>("main") {};
  PCollectionTuple inputTuple = PCollectionTuple.of(mainTag, input);

  Schema resultType = Schema.builder().addInt32Field("f_int").addStringField("f_string").build();

  PCollection<Row> result =
      inputTuple.apply(
          "sqlQuery",
          SqlTransform.query(
              "SELECT f_int, arrElems.f_string FROM main "
                  + " CROSS JOIN UNNEST (main.f_stringArr) AS arrElems(f_string)"));

  PAssert.that(result)
      .containsInAnyOrder(
          Row.withSchema(resultType).addValues(42, "111").build(),
          Row.withSchema(resultType).addValues(42, "222").build(),
          Row.withSchema(resultType).addValues(42, "333").build(),
          Row.withSchema(resultType).addValues(13, "444").build(),
          Row.withSchema(resultType).addValues(13, "555").build());

  pipeline.run();
}
 
Example 6
Source File: PipelineTest.java    From beam with Apache License 2.0
/** Tests that Pipeline supports pulling an element out of a tuple as a transform. */
@Test
@Category(ValidatesRunner.class)
public void testTupleProjectionTransform() throws Exception {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3, 4));

  TupleTag<Integer> tag = new TupleTag<>();
  PCollectionTuple tuple = PCollectionTuple.of(tag, input);

  PCollection<Integer> output = tuple.apply("ProjectTag", new TupleProjectionTransform<>(tag));

  PAssert.that(output).containsInAnyOrder(1, 2, 3, 4);
  pipeline.run();
}
 
Example 7
Source File: CsvConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a Javascript Udf. Udf processing is handled by {@link
 * JavascriptTextTransformer}.
 */
@Test
public void testLineToFailsafeJsonNoHeadersUdf() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(RECORD_STRING).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  CsvConverters.CsvPipelineOptions options =
      PipelineOptionsFactory.create().as(CsvConverters.CsvPipelineOptions.class);

  options.setDelimiter(",");
  options.setJavascriptTextTransformGcsPath(TRANSFORM_FILE_PATH);
  options.setJavascriptTextTransformFunctionName("transform");

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJsonNoHeadersUdf",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(options.getDelimiter())
              .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
              .setJsonSchemaPath(options.getJsonSchemaPath())
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<String, String> result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(JSON_STRING_RECORD)));
            return null;
          });

  PAssert.that(failsafe.get(PROCESSING_DEADLETTER_OUT)).empty();

  pipeline.run();
}
 
Example 8
Source File: CsvConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a Javascript Udf. Udf processing is handled by {@link
 * JavascriptTextTransformer}. Should output record to deadletter table tag.
 */
@Test
public void testLineToFailsafeJsonNoHeadersUdfDeadletter() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(BAD_JSON_STRING_RECORD).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  CsvConverters.CsvPipelineOptions options =
      PipelineOptionsFactory.create().as(CsvConverters.CsvPipelineOptions.class);

  options.setDelimiter(",");
  options.setJavascriptTextTransformGcsPath(SCRIPT_PARSE_EXCEPTION_FILE_PATH);
  options.setJavascriptTextTransformFunctionName("transform");

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJsonNoHeadersUdfBad",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(options.getDelimiter())
              .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
              .setJsonSchemaPath(options.getJsonSchemaPath())
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT)).empty();
  PAssert.that(failsafe.get(PROCESSING_DEADLETTER_OUT))
      .satisfies(
          collection -> {
            FailsafeElement result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(BAD_JSON_STRING_RECORD)));
            return null;
          });

  pipeline.run();
}
 
Example 9
Source File: PipelineTest.java    From beam with Apache License 2.0
@Override
public PCollectionTuple expand(PCollection<T> input) {
  return PCollectionTuple.of(tag, input);
}
 
Example 10
Source File: TestUtils.java    From beam with Apache License 2.0
public static <T> PCollectionTuple tuple(String tag, PCollection<T> pCollection) {
  return PCollectionTuple.of(new TupleTag<>(tag), pCollection);
}