Java Code Examples for org.apache.beam.sdk.coders.CoderRegistry#registerCoderForType()

The following examples show how to use org.apache.beam.sdk.coders.CoderRegistry#registerCoderForType(). Each example comes from the project and source file named above it; see the original source for full context.
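
All of the examples share the same core pattern: get the pipeline's CoderRegistry, then register a coder against the TypeDescriptor of the type it encodes, so that Beam can infer the coder automatically instead of requiring .setCoder() on every transform. Below is a minimal, self-contained sketch of that pattern; the class name is illustrative, and the FailsafeElementCoder import path is assumed from the DataflowTemplates project.

import com.google.cloud.teleport.coders.FailsafeElementCoder;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.CoderRegistry;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class CoderRegistrationSketch {

  public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    Pipeline pipeline = Pipeline.create(options);

    // Build a coder for FailsafeElement<String, String>.
    FailsafeElementCoder<String, String> coder =
        FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());

    // Register the coder against its encoded type descriptor; any PCollection
    // whose inferred element type matches will pick up this coder without an
    // explicit .setCoder() call.
    CoderRegistry coderRegistry = pipeline.getCoderRegistry();
    coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

    pipeline.run().waitUntilFinish();
  }
}
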
Example 1
Source File: CsvConvertersTest.java    From DataflowTemplates with Apache License 2.0
/** Tests that an exception is thrown if different headers are found. */
@Test(expected = RuntimeException.class)
public void testDifferentHeaders() {

  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> headers =
      pipeline.apply("CreateInput", Create.of(HEADER_STRING, "wrong,header,thing\n"));
  PCollection<String> lines = pipeline.apply("Create lines", Create.of(RECORD_STRING));

  PCollectionTuple readCsvHeadersOut =
      PCollectionTuple.of(CSV_HEADERS, headers).and(CSV_LINES, lines);

  PCollectionTuple test =
      readCsvHeadersOut.apply(
          "TestDifferentHeaders",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(",")
              .setUdfFileSystemPath(null)
              .setUdfFunctionName(null)
              .setJsonSchemaPath(null)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .setUdfOutputTag(PROCESSING_OUT)
              .build());

  pipeline.run();
}
 
Example 2
Source File: CsvConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests that {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a JSON schema.
 */
@Test
public void testLineToFailsafeJsonNoHeadersJsonSchema() {

  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(RECORD_STRING).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJson",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(",")
              .setUdfFileSystemPath(null)
              .setUdfFunctionName(null)
              .setJsonSchemaPath(TEST_JSON_SCHEMA__PATH)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<String, String> result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(JSON_STRING_RECORD)));
            return null;
          });

  pipeline.run();
}
 
Example 3
Source File: PubSubToElasticsearchTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests the {@link PubSubToElasticsearch} pipeline end-to-end with an empty message payload but
 * populated attributes.
 */
@Test
public void testPubSubToElasticsearchOnlyAttributesE2E() {

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(),
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER);
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.CODER.getEncodedTypeDescriptor(), PubSubToElasticsearch.CODER);

  PubSubToElasticsearch.PubSubToElasticsearchOptions options =
      TestPipeline.testingPipelineOptions()
          .as(PubSubToElasticsearch.PubSubToElasticsearchOptions.class);

  options.setDeadletterTable("test:dataset.table");

  PCollectionTuple pc =
      pipeline
          .apply(Create.of(goodTestMessages.get(goodTestMessages.size() - 1)))
          .apply(
              PubSubToElasticsearch.PubSubMessageToJsonDocument.newBuilder()
                  .setJavascriptTextTransformFunctionName(
                      options.getJavascriptTextTransformFunctionName())
                  .setJavascriptTextTransformGcsPath(options.getJavascriptTextTransformGcsPath())
                  .build());

  PAssert.that(pc.get(PubSubToElasticsearch.TRANSFORM_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> element = collection.iterator().next();
            assertThat(
                new Gson().fromJson(element.getPayload(), HashMap.class),
                is(equalTo(element.getOriginalPayload().getAttributeMap())));
            return null;
          });

  // Execute pipeline
  pipeline.run(options);
}
 
Example 4
Source File: ErrorConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests that {@link ErrorConverters.FailedPubsubMessageToTableRowFn} properly formats failed
 * {@link PubsubMessage} objects into {@link TableRow} objects to save to BigQuery.
 */
@Test
public void testFailedPubsubMessageToTableRowFn() {
  // Test input
  final String payload = "Super secret";
  final String errorMessage = "Failed to parse input JSON";
  final String stacktrace = "Error at com.google.cloud.teleport.PubsubToBigQuery";

  final PubsubMessage message =
      new PubsubMessage(payload.getBytes(), ImmutableMap.of("id", "123", "type", "custom_event"));

  final FailsafeElement<PubsubMessage, String> input =
      FailsafeElement.of(message, payload)
          .setErrorMessage(errorMessage)
          .setStacktrace(stacktrace);

  final Instant timestamp =
      new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build pipeline
  PCollection<TableRow> output =
      pipeline
          .apply(
              "CreateInput",
              Create.timestamped(TimestampedValue.of(input, timestamp)).withCoder(coder))
          .apply("FailedRecordToTableRow", ParDo.of(new FailedPubsubMessageToTableRowFn()));

  // Assert
  PAssert.that(output)
      .satisfies(
          collection -> {
            final TableRow result = collection.iterator().next();
            assertThat(result.get("timestamp"), is(equalTo("2022-02-22 22:22:22.222000")));
            assertThat(result.get("attributes"), is(notNullValue()));
            assertThat(result.get("payloadString"), is(equalTo(payload)));
            assertThat(result.get("payloadBytes"), is(notNullValue()));
            assertThat(result.get("errorMessage"), is(equalTo(errorMessage)));
            assertThat(result.get("stacktrace"), is(equalTo(stacktrace)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
 
Example 5
Source File: CsvToElasticsearchTest.java    From DataflowTemplates with Apache License 2.0
/** Tests the {@link CsvToElasticsearch} pipeline using the headers of the CSV to parse it. */
@Test
public void testCsvToElasticsearchHeadersE2E() {

  final String header = "id,state,price";
  final String record = "007,CA,26.23";
  final String stringJsonRecord = "{\"id\":\"007\",\"state\":\"CA\",\"price\":\"26.23\"}";

  final FailsafeElementCoder<String, String> coder =
      FailsafeElementCoder.of(
          NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of()));

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  CsvToElasticsearch.CsvToElasticsearchOptions options =
      PipelineOptionsFactory.create().as(CsvToElasticsearch.CsvToElasticsearchOptions.class);

  options.setContainsHeaders(true);
  options.setInputFileSpec(HEADER_CSV_FILE_PATH);

  // Build pipeline with headers.
  PCollectionTuple readCsvOut =
      pipeline
          .apply(
              "ReadCsv",
              CsvConverters.ReadCsv.newBuilder()
                  .setCsvFormat(options.getCsvFormat())
                  .setDelimiter(options.getDelimiter())
                  .setHasHeaders(options.getContainsHeaders())
                  .setInputFileSpec(options.getInputFileSpec())
                  .setHeaderTag(CsvToElasticsearch.CSV_HEADERS)
                  .setLineTag(CsvToElasticsearch.CSV_LINES)
                  .build())
          .apply(
              "ConvertLine",
              CsvConverters.LineToFailsafeJson.newBuilder()
                  .setDelimiter(options.getDelimiter())
                  .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                  .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                  .setJsonSchemaPath(options.getJsonSchemaPath())
                  .setHeaderTag(CsvToElasticsearch.CSV_HEADERS)
                  .setLineTag(CsvToElasticsearch.CSV_LINES)
                  .setUdfOutputTag(CsvToElasticsearch.PROCESSING_OUT)
                  .setUdfDeadletterTag(CsvToElasticsearch.PROCESSING_DEADLETTER_OUT)
                  .build());

  // Assert
  PAssert.that(readCsvOut.get(CsvToElasticsearch.PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement element = collection.iterator().next();
            assertThat(element.getOriginalPayload(), is(equalTo(record)));
            assertThat(element.getPayload(), is(equalTo(stringJsonRecord)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
 
Example 6
Source File: KafkaToBigQuery.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs the pipeline to completion with the specified options. This method does not wait until the
 * pipeline is finished before returning. Invoke {@code result.waitUntilFinish()} on the result
 * object to block until the pipeline is finished running if blocking programmatic execution is
 * required.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
public static PipelineResult run(Options options) {

  Pipeline pipeline = Pipeline.create(options);

  // Register the coder for the pipeline
  FailsafeElementCoder<KV<String, String>, String> coder =
      FailsafeElementCoder.of(
          KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  /*
   * Steps:
   *  1) Read messages in from Kafka
   *  2) Transform the Kafka Messages into TableRows
   *     - Transform message payload via UDF
   *     - Convert UDF result to TableRow objects
   *  3) Write successful records out to BigQuery
   *  4) Write failed records out to BigQuery
   */
  PCollectionTuple transformOut =
      pipeline
          /*
           * Step #1: Read messages in from Kafka
           */
          .apply(
              "ReadFromKafka",
              KafkaIO.<String, String>read()
                  .withBootstrapServers(options.getBootstrapServers())
                  .withTopic(options.getInputTopic())
                  .withKeyDeserializer(StringDeserializer.class)
                  .withValueDeserializer(StringDeserializer.class)
                  // NumSplits is hard-coded to 1 for single-partition use cases (e.g., Debezium
                  // Change Data Capture). Once Dataflow dynamic templates are available, this can
                  // be deprecated.
                  .withNumSplits(1)
                  .withoutMetadata())

          /*
           * Step #2: Transform the Kafka Messages into TableRows
           */
          .apply("ConvertMessageToTableRow", new MessageToTableRow(options));

  /*
   * Step #3: Write the successful records out to BigQuery
   */
  transformOut
      .get(TRANSFORM_OUT)
      .apply(
          "WriteSuccessfulRecords",
          BigQueryIO.writeTableRows()
              .withoutValidation()
              .withCreateDisposition(CreateDisposition.CREATE_NEVER)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND)
              .to(options.getOutputTableSpec()));

  /*
   * Step #4: Write failed records out to BigQuery
   */
  PCollectionList.of(transformOut.get(UDF_DEADLETTER_OUT))
      .and(transformOut.get(TRANSFORM_DEADLETTER_OUT))
      .apply("Flatten", Flatten.pCollections())
      .apply(
          "WriteFailedRecords",
          WriteKafkaMessageErrors.newBuilder()
              .setErrorRecordsTable(
                  ValueProviderUtils.maybeUseDefaultDeadletterTable(
                      options.getOutputDeadletterTable(),
                      options.getOutputTableSpec(),
                      DEFAULT_DEADLETTER_TABLE_SUFFIX))
              .setErrorRecordsTableSchema(ResourceUtils.getDeadletterTableSchemaJson())
              .build());
  return pipeline.run();
}
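
The Javadoc above notes that run() returns as soon as the pipeline is submitted. As a hedged sketch of how a main() method might drive it and block until completion (the argument parsing shown is the standard Beam pattern, assumed rather than copied from the template source):

public static void main(String[] args) {
  // Parse the template's Options interface from the command-line arguments.
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);

  // run() returns immediately; block on the result for synchronous execution.
  PipelineResult result = run(options);
  result.waitUntilFinish();
}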
 
Example 7
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests the {@link BigQueryConverters.FailsafeJsonToTableRow} transform with invalid JSON input.
 */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJsonToTableRowInvalidJSON() {
  // Test input
  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "JsonToTableRow",
              FailsafeJsonToTableRow.<PubsubMessage>newBuilder()
                  .setSuccessTag(TABLE_ROW_TAG)
                  .setFailureTag(FAILSAFE_ELM_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(TABLE_ROW_TAG)).empty();
  PAssert.that(output.get(FAILSAFE_ELM_TAG))
      .satisfies(
          collection -> {
            final FailsafeElement<PubsubMessage, String> result = collection.iterator().next();
            // Check the individual elements of the PubsubMessage since the message above won't be
            // serializable.
            assertThat(new String(result.getOriginalPayload().getPayload())).isEqualTo(payload);
            assertThat(result.getOriginalPayload().getAttributeMap()).isEqualTo(attributes);
            assertThat(result.getPayload()).isEqualTo(payload);
            assertThat(result.getErrorMessage()).isNotNull();
            assertThat(result.getStacktrace()).isNotNull();
            return null;
          });

  // Execute the test
  pipeline.run();
}
 
Example 8
Source File: CsvConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests that {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a JavaScript UDF. UDF processing is handled by {@link
 * JavascriptTextTransformer}. Should output the record to the deadletter table tag.
 */
@Test
public void testLineToFailsafeJsonNoHeadersUdfDeadletter() {
  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(BAD_JSON_STRING_RECORD).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  CsvConverters.CsvPipelineOptions options =
      PipelineOptionsFactory.create().as(CsvConverters.CsvPipelineOptions.class);

  options.setDelimiter(",");
  options.setJavascriptTextTransformGcsPath(SCRIPT_PARSE_EXCEPTION_FILE_PATH);
  options.setJavascriptTextTransformFunctionName("transform");

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJsonNoHeadersUdfBad",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(options.getDelimiter())
              .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
              .setJsonSchemaPath(null)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT)).empty();
  PAssert.that(failsafe.get(PROCESSING_DEADLETTER_OUT))
      .satisfies(
          collection -> {
            FailsafeElement result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(BAD_JSON_STRING_RECORD)));
            return null;
          });

  pipeline.run();
}
 
Example 9
Source File: ErrorConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests that {@link ErrorConverters.FailedStringToTableRowFn} properly formats failed String
 * objects into {@link TableRow} objects to save to BigQuery.
 */
@Test
public void testFailedStringMessageToTableRowFn() {
  // Test input
  final String message = "Super secret";
  final String errorMessage = "Failed to parse input JSON";
  final String stacktrace = "Error at com.google.cloud.teleport.TextToBigQueryStreaming";

  final FailsafeElement<String, String> input =
      FailsafeElement.of(message, message)
          .setErrorMessage(errorMessage)
          .setStacktrace(stacktrace);

  final Instant timestamp =
      new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<String, String> coder =
      FailsafeElementCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build pipeline
  PCollection<TableRow> output =
      pipeline
          .apply(
              "CreateInput",
              Create.timestamped(TimestampedValue.of(input, timestamp)).withCoder(coder))
          .apply("FailedRecordToTableRow", ParDo.of(new FailedStringToTableRowFn()));

  // Assert
  PAssert.that(output)
      .satisfies(
          collection -> {
            final TableRow result = collection.iterator().next();
            assertThat(result.get("timestamp")).isEqualTo("2022-02-22 22:22:22.222000");
            assertThat(result.get("attributes")).isNull();
            assertThat(result.get("payloadString")).isEqualTo(message);
            assertThat(result.get("payloadBytes")).isNotNull();
            assertThat(result.get("errorMessage")).isEqualTo(errorMessage);
            assertThat(result.get("stacktrace")).isEqualTo(stacktrace);
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
 
Example 10
Source File: JavascriptTextTransformerTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests the {@link FailsafeJavascriptUdf} when it's passed invalid JSON. In this case the UDF
 * should output the input {@link FailsafeElement} to the dead-letter enriched with error
 * information.
 */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJavaScriptUdfInvalidInput() {
  // Test input
  final String fileSystemPath = TRANSFORM_FILE_PATH;
  final String functionName = "transform";

  final String payload = "\"ticker\": \"GOOGL\", \"price\": 1006.94";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "InvokeUdf",
              FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
                  .setFileSystemPath(fileSystemPath)
                  .setFunctionName(functionName)
                  .setSuccessTag(SUCCESS_TAG)
                  .setFailureTag(FAILURE_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(SUCCESS_TAG)).empty();
  PAssert.that(output.get(FAILURE_TAG))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> result = collection.iterator().next();
            PubsubMessage resultMessage = result.getOriginalPayload();

            assertThat(new String(resultMessage.getPayload()), is(equalTo(payload)));
            assertThat(resultMessage.getAttributeMap(), is(equalTo(attributes)));
            assertThat(result.getPayload(), is(equalTo(payload)));
            assertThat(result.getErrorMessage(), is(notNullValue()));
            assertThat(result.getStacktrace(), is(notNullValue()));
            return null;
          });

  // Execute the test
  pipeline.run();
}
 
Example 11
Source File: JavascriptTextTransformerTest.java    From DataflowTemplates with Apache License 2.0
/** Tests the {@link FailsafeJavascriptUdf} when the input is valid. */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJavaScriptUdfValidInput() {
  // Test input
  final String fileSystemPath = TRANSFORM_FILE_PATH;
  final String functionName = "transform";

  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "InvokeUdf",
              FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
                  .setFileSystemPath(fileSystemPath)
                  .setFunctionName(functionName)
                  .setSuccessTag(SUCCESS_TAG)
                  .setFailureTag(FAILURE_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(SUCCESS_TAG))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> result = collection.iterator().next();
            PubsubMessage resultMessage = result.getOriginalPayload();
            String expectedPayload =
                "{\"ticker\":\"GOOGL\",\"price\":1006.94,\"someProp\":\"someValue\"}";

            assertThat(new String(resultMessage.getPayload()), is(equalTo(payload)));
            assertThat(resultMessage.getAttributeMap(), is(equalTo(attributes)));
            assertThat(result.getPayload(), is(equalTo(expectedPayload)));
            assertThat(result.getErrorMessage(), is(nullValue()));
            assertThat(result.getStacktrace(), is(nullValue()));
            return null;
          });

  PAssert.that(output.get(FAILURE_TAG)).empty();

  // Execute the test
  pipeline.run();
}
 
Example 12
Source File: PubSubCdcToBigQueryTest.java    From DataflowTemplates with Apache License 2.0
/** Tests the {@link PubSubCdcToBigQuery} pipeline end-to-end. */
@Test
public void testPubSubCdcToBigQueryApplyJavaScriptUDF() throws Exception {
  // Test input
  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
  final PubsubMessage message =
      new PubsubMessage(payload.getBytes(), ImmutableMap.of("id", "123", "type", "custom_event"));

  final Instant timestamp =
      new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();

  final FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Parameters
  String transformPath = TRANSFORM_FILE_PATH;
  String transformFunction = "transform";

  PubSubCdcToBigQuery.Options options =
      PipelineOptionsFactory.create().as(PubSubCdcToBigQuery.Options.class);

  options.setJavascriptTextTransformGcsPath(transformPath);
  options.setJavascriptTextTransformFunctionName(transformFunction);

  // Build pipeline
  PCollectionTuple transformOut =
      pipeline
          .apply(
              "CreateInput",
              Create.timestamped(TimestampedValue.of(message, timestamp))
                  .withCoder(PubsubMessageWithAttributesCoder.of()))
          .apply("ConvertMessageToTableRow", new PubsubMessageToTableRow(options));

  // Assert
  PAssert.that(transformOut.get(PubSubCdcToBigQuery.UDF_DEADLETTER_OUT)).empty();
  PAssert.that(transformOut.get(PubSubCdcToBigQuery.TRANSFORM_DEADLETTER_OUT)).empty();
  PAssert.that(transformOut.get(PubSubCdcToBigQuery.TRANSFORM_OUT))
      .satisfies(
          collection -> {
            TableRow result = collection.iterator().next();
            assertThat(result.get("ticker"), is(equalTo("GOOGL")));
            assertThat(result.get("price"), is(equalTo(1006.94)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
 
Example 13
Source File: PubsubToBigQueryTest.java    From DataflowTemplates with Apache License 2.0
/** Tests the {@link PubSubToBigQuery} pipeline end-to-end. */
@Test
public void testPubsubToBigQueryE2E() throws Exception {
  // Test input
  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
  final PubsubMessage message =
      new PubsubMessage(payload.getBytes(), ImmutableMap.of("id", "123", "type", "custom_event"));

  final Instant timestamp =
      new DateTime(2022, 2, 22, 22, 22, 22, 222, DateTimeZone.UTC).toInstant();

  final FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Parameters
  ValueProvider<String> transformPath = pipeline.newProvider(TRANSFORM_FILE_PATH);
  ValueProvider<String> transformFunction = pipeline.newProvider("transform");

  PubSubToBigQuery.Options options =
      PipelineOptionsFactory.create().as(PubSubToBigQuery.Options.class);

  options.setJavascriptTextTransformGcsPath(transformPath);
  options.setJavascriptTextTransformFunctionName(transformFunction);

  // Build pipeline
  PCollectionTuple transformOut =
      pipeline
          .apply(
              "CreateInput",
              Create.timestamped(TimestampedValue.of(message, timestamp))
                  .withCoder(PubsubMessageWithAttributesCoder.of()))
          .apply("ConvertMessageToTableRow", new PubsubMessageToTableRow(options));

  // Assert
  PAssert.that(transformOut.get(PubSubToBigQuery.UDF_DEADLETTER_OUT)).empty();
  PAssert.that(transformOut.get(PubSubToBigQuery.TRANSFORM_DEADLETTER_OUT)).empty();
  PAssert.that(transformOut.get(PubSubToBigQuery.TRANSFORM_OUT))
      .satisfies(
          collection -> {
            TableRow result = collection.iterator().next();
            assertThat(result.get("ticker"), is(equalTo("GOOGL")));
            assertThat(result.get("price"), is(equalTo(1006.94)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
 
Example 14
Source File: PubSubToMongoDBTest.java    From DataflowTemplates with Apache License 2.0
/** Tests the {@link PubSubToMongoDB} pipeline end-to-end with a bad UDF. */
@Test
public void testPubSubToMongoDBBadUdfE2E() {

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      PubSubToMongoDB.FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(),
      PubSubToMongoDB.FAILSAFE_ELEMENT_CODER);
  coderRegistry.registerCoderForType(
      PubSubToMongoDB.CODER.getEncodedTypeDescriptor(), PubSubToMongoDB.CODER);

  PubSubToMongoDB.Options options =
      TestPipeline.testingPipelineOptions().as(PubSubToMongoDB.Options.class);

  options.setDeadletterTable("test:dataset.table");
  options.setJavascriptTextTransformFunctionName("transformBad");
  options.setJavascriptTextTransformGcsPath(BAD_TRANSFORM_FILE_PATH);

  PCollectionTuple pc =
      pipeline
          .apply(Create.of(badTestMessages.get(0)))
          .apply(
              PubSubToMongoDB.PubSubMessageToJsonDocument.newBuilder()
                  .setJavascriptTextTransformFunctionName(
                      options.getJavascriptTextTransformFunctionName())
                  .setJavascriptTextTransformGcsPath(options.getJavascriptTextTransformGcsPath())
                  .build());

  PAssert.that(pc.get(PubSubToMongoDB.TRANSFORM_DEADLETTER_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> element = collection.iterator().next();
            assertThat(
                element.getOriginalPayload().getPayload(),
                is(equalTo(badTestMessages.get(0).getPayload())));
            return null;
          });

  PAssert.that(pc.get(PubSubToMongoDB.TRANSFORM_OUT)).empty();

  // Execute pipeline
  pipeline.run(options);
}
 
Example 15
Source File: JavascriptTextTransformerTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests the {@link FailsafeJavascriptUdf} when it's passed invalid JSON. In this case the UDF
 * should output the input {@link FailsafeElement} to the dead-letter enriched with error
 * information.
 */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJavaScriptUdfInvalidInput() {
  // Test input
  final ValueProvider<String> fileSystemPath = pipeline.newProvider(TRANSFORM_FILE_PATH);
  final ValueProvider<String> functionName = pipeline.newProvider("transform");

  final String payload = "\"ticker\": \"GOOGL\", \"price\": 1006.94";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "InvokeUdf",
              FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
                  .setFileSystemPath(fileSystemPath)
                  .setFunctionName(functionName)
                  .setSuccessTag(SUCCESS_TAG)
                  .setFailureTag(FAILURE_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(SUCCESS_TAG)).empty();
  PAssert.that(output.get(FAILURE_TAG))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> result = collection.iterator().next();
            PubsubMessage resultMessage = result.getOriginalPayload();

            assertThat(new String(resultMessage.getPayload()), is(equalTo(payload)));
            assertThat(resultMessage.getAttributeMap(), is(equalTo(attributes)));
            assertThat(result.getPayload(), is(equalTo(payload)));
            assertThat(result.getErrorMessage(), is(notNullValue()));
            assertThat(result.getStacktrace(), is(notNullValue()));
            return null;
          });

  // Execute the test
  pipeline.run();
}
 
Example 16
Source File: CsvToElasticsearch.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs the pipeline to completion with the specified options.
 *
 * @param options The execution options.
 * @return The pipeline result.
 */
private static PipelineResult run(CsvToElasticsearchOptions options) {
  // Create the pipeline
  Pipeline pipeline = Pipeline.create(options);

  // Register the coder for the pipeline
  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(), FAILSAFE_ELEMENT_CODER);

  // Throw an error if containsHeaders is true and a schema or UDF is also set.
  if (options.getContainsHeaders()) {
    checkArgument(
        options.getJavascriptTextTransformGcsPath() == null
            && options.getJsonSchemaPath() == null,
        "Cannot parse file containing headers with UDF or Json schema.");
  }

  // Throw an error if only one retry configuration parameter is set.
  if (options.getMaxRetryAttempts() != null || options.getMaxRetryDuration() != null) {
    checkArgument(
        options.getMaxRetryAttempts() != null && options.getMaxRetryDuration() != null,
        "To specify retry configuration both max attempts and max duration must be set.");
  }

  /*
   * Steps: 1) Read records from CSV(s) via {@link CsvConverters.ReadCsv}.
   *        2) Convert lines to JSON strings via {@link CsvConverters.LineToFailsafeJson}.
   *        3a) Write JSON strings as documents to Elasticsearch via {@link ElasticsearchIO}.
   *        3b) Write elements that failed processing to {@link org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO}.
   */
  PCollectionTuple convertedCsvLines =
      pipeline
          /*
           * Step 1: Read CSV file(s) from Cloud Storage using {@link CsvConverters.ReadCsv}.
           */
          .apply(
              "ReadCsv",
              CsvConverters.ReadCsv.newBuilder()
                  .setCsvFormat(options.getCsvFormat())
                  .setDelimiter(options.getDelimiter())
                  .setHasHeaders(options.getContainsHeaders())
                  .setInputFileSpec(options.getInputFileSpec())
                  .setHeaderTag(CSV_HEADERS)
                  .setLineTag(CSV_LINES)
                  .build())
          /*
           * Step 2: Convert lines to Elasticsearch document.
           */
          .apply(
              "ConvertLine",
              CsvConverters.LineToFailsafeJson.newBuilder()
                  .setDelimiter(options.getDelimiter())
                  .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                  .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                  .setJsonSchemaPath(options.getJsonSchemaPath())
                  .setHeaderTag(CSV_HEADERS)
                  .setLineTag(CSV_LINES)
                  .setUdfOutputTag(PROCESSING_OUT)
                  .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
                  .build());
  /*
   * Step 3a: Write elements that were successfully processed to Elasticsearch using {@link WriteToElasticsearch}.
   */
  convertedCsvLines
      .get(PROCESSING_OUT)
      .apply(
          "GetJsonDocuments",
          MapElements.into(TypeDescriptors.strings()).via(FailsafeElement::getPayload))
      .apply(
          "WriteToElasticsearch",
          WriteToElasticsearch.newBuilder()
              .setOptions(options.as(WriteToElasticsearchOptions.class))
              .build());

  /*
   * Step 3b: Write elements that failed processing to deadletter table via {@link BigQueryIO}.
   */
  convertedCsvLines
      .get(PROCESSING_DEADLETTER_OUT)
      .apply(
          "AddTimestamps",
          WithTimestamps.of((FailsafeElement<String, String> failures) -> new Instant()))
      .apply(
          "WriteFailedElementsToBigQuery",
          WriteStringMessageErrors.newBuilder()
              .setErrorRecordsTable(options.getDeadletterTable())
              .setErrorRecordsTableSchema(SchemaUtils.DEADLETTER_SCHEMA)
              .build());

  return pipeline.run();
}
 
Example 17
Source File: JavascriptTextTransformerTest.java    From DataflowTemplates with Apache License 2.0 4 votes vote down vote up
/** Tests the {@link FailsafeJavascriptUdf} when the input is valid. */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJavaScriptUdfValidInput() {
  // Test input
  final ValueProvider<String> fileSystemPath = pipeline.newProvider(TRANSFORM_FILE_PATH);
  final ValueProvider<String> functionName = pipeline.newProvider("transform");

  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "InvokeUdf",
              FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
                  .setFileSystemPath(fileSystemPath)
                  .setFunctionName(functionName)
                  .setSuccessTag(SUCCESS_TAG)
                  .setFailureTag(FAILURE_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(SUCCESS_TAG))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> result = collection.iterator().next();
            PubsubMessage resultMessage = result.getOriginalPayload();
            String expectedPayload =
                "{\"ticker\":\"GOOGL\",\"price\":1006.94,\"someProp\":\"someValue\"}";

            assertThat(new String(resultMessage.getPayload()), is(equalTo(payload)));
            assertThat(resultMessage.getAttributeMap(), is(equalTo(attributes)));
            assertThat(result.getPayload(), is(equalTo(expectedPayload)));
            assertThat(result.getErrorMessage(), is(nullValue()));
            assertThat(result.getStacktrace(), is(nullValue()));
            return null;
          });

  PAssert.that(output.get(FAILURE_TAG)).empty();

  // Execute the test
  pipeline.run();
}
 
Example 18
Source File: PubSubToElasticsearchTest.java    From DataflowTemplates with Apache License 2.0
/** Tests the {@link PubSubToElasticsearch} pipeline end-to-end with a bad UDF. */
@Test
public void testPubSubToElasticsearchBadUdfE2E() {

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(),
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER);
  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.CODER.getEncodedTypeDescriptor(), PubSubToElasticsearch.CODER);

  PubSubToElasticsearch.PubSubToElasticsearchOptions options =
      TestPipeline.testingPipelineOptions()
          .as(PubSubToElasticsearch.PubSubToElasticsearchOptions.class);

  options.setDeadletterTable("test:dataset.table");
  options.setJavascriptTextTransformFunctionName("transformBad");
  options.setJavascriptTextTransformGcsPath(BAD_TRANSFORM_FILE_PATH);

  PCollectionTuple pc =
      pipeline
          .apply(Create.of(badTestMessages.get(0)))
          .apply(
              PubSubToElasticsearch.PubSubMessageToJsonDocument.newBuilder()
                  .setJavascriptTextTransformFunctionName(
                      options.getJavascriptTextTransformFunctionName())
                  .setJavascriptTextTransformGcsPath(options.getJavascriptTextTransformGcsPath())
                  .build());

  PAssert.that(pc.get(PubSubToElasticsearch.TRANSFORM_DEADLETTER_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> element = collection.iterator().next();
            assertThat(
                element.getOriginalPayload().getPayload(),
                is(equalTo(badTestMessages.get(0).getPayload())));
            return null;
          });

  PAssert.that(pc.get(PubSubToElasticsearch.TRANSFORM_OUT)).empty();

  // Execute pipeline
  pipeline.run(options);
}
 
Example 19
Source File: CsvToElasticsearchTest.java    From DataflowTemplates with Apache License 2.0
/** Tests the {@link CsvToElasticsearch} pipeline using a UDF to parse the CSV. */
@Test
public void testCsvToElasticsearchUdfE2E() {

  final String record = "007,CA,26.23";
  final String stringifiedJsonRecord = "{\"id\":\"007\",\"state\":\"CA\",\"price\":26.23}";

  final FailsafeElementCoder<String, String> coder =
      FailsafeElementCoder.of(
          NullableCoder.of(StringUtf8Coder.of()), NullableCoder.of(StringUtf8Coder.of()));

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  CsvToElasticsearch.CsvToElasticsearchOptions options =
      PipelineOptionsFactory.create().as(CsvToElasticsearch.CsvToElasticsearchOptions.class);

  options.setJavascriptTextTransformGcsPath(TRANSFORM_FILE_PATH);
  options.setJavascriptTextTransformFunctionName("transform");
  options.setContainsHeaders(false);
  options.setInputFileSpec(NO_HEADER_CSV_FILE_PATH);

  // Build pipeline with no headers.
  PCollectionTuple readCsvOut =
      pipeline
          .apply(
              "ReadCsv",
              CsvConverters.ReadCsv.newBuilder()
                  .setCsvFormat(options.getCsvFormat())
                  .setDelimiter(options.getDelimiter())
                  .setHasHeaders(options.getContainsHeaders())
                  .setInputFileSpec(options.getInputFileSpec())
                  .setHeaderTag(CsvToElasticsearch.CSV_HEADERS)
                  .setLineTag(CsvToElasticsearch.CSV_LINES)
                  .build())
          .apply(
              "ConvertLine",
              CsvConverters.LineToFailsafeJson.newBuilder()
                  .setDelimiter(options.getDelimiter())
                  .setUdfFileSystemPath(options.getJavascriptTextTransformGcsPath())
                  .setUdfFunctionName(options.getJavascriptTextTransformFunctionName())
                  .setJsonSchemaPath(options.getJsonSchemaPath())
                  .setHeaderTag(CsvToElasticsearch.CSV_HEADERS)
                  .setLineTag(CsvToElasticsearch.CSV_LINES)
                  .setUdfOutputTag(CsvToElasticsearch.PROCESSING_OUT)
                  .setUdfDeadletterTag(CsvToElasticsearch.PROCESSING_DEADLETTER_OUT)
                  .build());

  // Assert
  PAssert.that(readCsvOut.get(CsvToElasticsearch.PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement element = collection.iterator().next();
            assertThat(element.getOriginalPayload(), is(equalTo(record)));
            assertThat(element.getPayload(), is(equalTo(stringifiedJsonRecord)));
            return null;
          });

  // Execute pipeline
  pipeline.run();
}
 
Example 20
Source File: PubSubToElasticsearchTest.java    From DataflowTemplates with Apache License 2.0
/** Tests the {@link PubSubToElasticsearch} pipeline end-to-end with no UDF supplied. */
@Test
public void testPubSubToElasticsearchNoUdfE2E() {

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();

  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER.getEncodedTypeDescriptor(),
      PubSubToElasticsearch.FAILSAFE_ELEMENT_CODER);

  coderRegistry.registerCoderForType(
      PubSubToElasticsearch.CODER.getEncodedTypeDescriptor(), PubSubToElasticsearch.CODER);

  PubSubToElasticsearch.PubSubToElasticsearchOptions options =
      TestPipeline.testingPipelineOptions()
          .as(PubSubToElasticsearch.PubSubToElasticsearchOptions.class);

  options.setDeadletterTable("test:dataset.table");
  options.setJavascriptTextTransformFunctionName(null);
  options.setJavascriptTextTransformGcsPath(null);

  PCollectionTuple pc =
      pipeline
          .apply(Create.of(goodTestMessages.get(0)))
          .apply(
              PubSubToElasticsearch.PubSubMessageToJsonDocument.newBuilder()
                  .setJavascriptTextTransformFunctionName(
                      options.getJavascriptTextTransformFunctionName())
                  .setJavascriptTextTransformGcsPath(options.getJavascriptTextTransformGcsPath())
                  .build());

  PAssert.that(pc.get(PubSubToElasticsearch.TRANSFORM_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<PubsubMessage, String> element = collection.iterator().next();
            assertThat(
                element.getOriginalPayload().getPayload(),
                is(equalTo(goodTestMessages.get(0).getPayload())));
            return null;
          });

  // Execute pipeline
  pipeline.run(options);
}