org.apache.beam.sdk.io.kafka.KafkaIO Java Examples

The following examples show how to use org.apache.beam.sdk.io.kafka.KafkaIO. Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
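
Most of the examples below follow the same builder pattern: configure a KafkaIO.Read (or KafkaIO.Write) with bootstrap servers, one or more topics, and key/value deserializers (or serializers), then apply it to a Beam pipeline. The minimal sketch below illustrates the read side of that pattern; the broker address, topic name, and class name are placeholders chosen for illustration and do not come from any of the projects listed on this page.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.kafka.KafkaIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Values;
import org.apache.kafka.common.serialization.StringDeserializer;

public class MinimalKafkaRead {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    p.apply("ReadFromKafka",
            KafkaIO.<String, String>read()
                .withBootstrapServers("localhost:9092")           // placeholder broker address
                .withTopic("my-topic")                            // placeholder topic name
                .withKeyDeserializer(StringDeserializer.class)
                .withValueDeserializer(StringDeserializer.class)
                .withoutMetadata())                               // drop KafkaRecord metadata, keep KV<key, value>
        .apply("ExtractValues", Values.<String>create());         // keep only the message values

    p.run().waitUntilFinish();
  }
}

KafkaIO.<K, V>write() mirrors the same pattern with withBootstrapServers(...), withTopic(...), withKeySerializer(...), and withValueSerializer(...), as Examples #5, #7, and #8 show.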
Example #1
Source File: ReadFromSource.java    From feast with Apache License 2.0
@Override
public PCollectionTuple expand(PBegin input) {
  return input
      .getPipeline()
      .apply(
          "ReadFromKafka",
          KafkaIO.readBytes()
              .withBootstrapServers(getSource().getKafkaSourceConfig().getBootstrapServers())
              .withTopic(getSource().getKafkaSourceConfig().getTopic())
              .withConsumerConfigUpdates(
                  ImmutableMap.of(
                      "group.id",
                      generateConsumerGroupId(input.getPipeline().getOptions().getJobName())))
              .withReadCommitted()
              .commitOffsetsInFinalize())
      .apply(
          "KafkaRecordToFeatureRow",
          ParDo.of(
                  KafkaRecordToFeatureRowDoFn.newBuilder()
                      .setSuccessTag(getSuccessTag())
                      .setFailureTag(getFailureTag())
                      .build())
              .withOutputTags(getSuccessTag(), TupleTagList.of(getFailureTag())));
}
 
Example #2
Source File: StreamWordCount.java    From beam-starter with Apache License 2.0
public static void main(String[] args) throws Exception {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
        .as(Options.class);
    options.setRunner(FlinkRunner.class);

    Pipeline p = Pipeline.create(options);

    KafkaIO.Read<byte[], String> kafkaIOReader = KafkaIO.read()
        .withBootstrapServers("192.168.99.100:32771")
        .withTopics(Arrays.asList("beam".split(",")))
        .updateConsumerProperties(ImmutableMap.of("auto.offset.reset", (Object)"earliest"))
        .withValueCoder(StringUtf8Coder.of());
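    // Note: updateConsumerProperties, withValueCoder, and TextIO.Write are APIs from older Beam SDKs;
    // newer releases use withConsumerConfigUpdates, withValueDeserializerAndCoder, and TextIO.write() instead.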

    p.apply(kafkaIOReader.withoutMetadata())
        .apply(Values.<String>create())
        .apply(Window.<String>into(
          FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))))
        .apply(new CountWords())
        .apply(MapElements.via(new FormatAsTextFn()))
        .apply("WriteCounts", TextIO.Write.to(options.getOutput()));

    p.run();
  }
 
Example #3
Source File: BeamKafkaTable.java    From beam with Apache License 2.0
KafkaIO.Read<byte[], byte[]> createKafkaRead() {
  KafkaIO.Read<byte[], byte[]> kafkaRead;
  if (topics != null) {
    kafkaRead =
        KafkaIO.<byte[], byte[]>read()
            .withBootstrapServers(bootstrapServers)
            .withTopics(topics)
            .withConsumerConfigUpdates(configUpdates)
            .withKeyDeserializerAndCoder(ByteArrayDeserializer.class, ByteArrayCoder.of())
            .withValueDeserializerAndCoder(ByteArrayDeserializer.class, ByteArrayCoder.of());
  } else if (topicPartitions != null) {
    kafkaRead =
        KafkaIO.<byte[], byte[]>read()
            .withBootstrapServers(bootstrapServers)
            .withTopicPartitions(topicPartitions)
            .withConsumerConfigUpdates(configUpdates)
            .withKeyDeserializerAndCoder(ByteArrayDeserializer.class, ByteArrayCoder.of())
            .withValueDeserializerAndCoder(ByteArrayDeserializer.class, ByteArrayCoder.of());
  } else {
    throw new InvalidTableException("One of topics and topicPartitions must be configured.");
  }
  return kafkaRead;
}
 
Example #4
Source File: NexmarkLauncher.java    From beam with Apache License 2.0
/** Return source of events from Kafka. */
private PCollection<Event> sourceEventsFromKafka(Pipeline p, final Instant now) {
  checkArgument((options.getBootstrapServers() != null), "Missing --bootstrapServers");
  NexmarkUtils.console("Reading events from Kafka Topic %s", options.getKafkaTopic());

  KafkaIO.Read<Long, byte[]> read =
      KafkaIO.<Long, byte[]>read()
          .withBootstrapServers(options.getBootstrapServers())
          .withTopic(options.getKafkaTopic())
          .withKeyDeserializer(LongDeserializer.class)
          .withValueDeserializer(ByteArrayDeserializer.class)
          .withStartReadTime(now)
          .withMaxNumRecords(
              options.getNumEvents() != null ? options.getNumEvents() : Long.MAX_VALUE);

  return p.apply(queryName + ".ReadKafkaEvents", read.withoutMetadata())
      .apply(queryName + ".KafkaToEvents", ParDo.of(BYTEARRAY_TO_EVENT));
}
 
Example #5
Source File: WriteFeatureSetSpecAck.java    From feast with Apache License 2.0
@Override
public PDone expand(PCollection<FeatureSetReference> input) {
  return input
      .apply("Prepare", new PrepareWrite(getSinksCount()))
      .apply("FeatureSetSpecToAckMessage", ParDo.of(new BuildAckMessage()))
      .apply(
          "ToKafka",
          KafkaIO.<String, byte[]>write()
              .withBootstrapServers(
                  getSpecsStreamingUpdateConfig().getAck().getBootstrapServers())
              .withTopic(getSpecsStreamingUpdateConfig().getAck().getTopic())
              .withKeySerializer(StringSerializer.class)
              .withValueSerializer(ByteArraySerializer.class));
}
 
Example #6
Source File: ReadFeatureSetSpecs.java    From feast with Apache License 2.0
@Override
public PCollection<KV<FeatureSetReference, FeatureSetSpec>> expand(PBegin input) {
  return input
      .apply(
          KafkaIO.readBytes()
              .withBootstrapServers(
                  getSpecsStreamingUpdateConfig().getSource().getBootstrapServers())
              .withTopic(getSpecsStreamingUpdateConfig().getSource().getTopic())
              .withConsumerConfigUpdates(
                  ImmutableMap.of(
                      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,
                      "earliest",
                      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG,
                      false)))
      .apply("ParseFeatureSetSpec", ParDo.of(new KafkaRecordToFeatureSetSpec()))
      .apply("OnlyRelevantSpecs", Filter.by(new FilterRelevantFunction(getSource(), getStores())))
      .apply(
          Window.<KV<String, FeatureSetSpec>>into(new GlobalWindows())
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .accumulatingFiredPanes()
              .withAllowedLateness(Duration.ZERO))
      .apply(
          Combine.perKey(
              (SerializableFunction<Iterable<FeatureSetSpec>, FeatureSetSpec>)
                  specs -> {
                    ArrayList<FeatureSetSpec> featureSetSpecs = Lists.newArrayList(specs);
                    featureSetSpecs.sort(
                        Comparator.comparing(FeatureSetSpec::getVersion).reversed());
                    return featureSetSpecs.get(0);
                  }))
      .apply("CreateFeatureSetReferenceKey", ParDo.of(new CreateFeatureSetReference()))
      .setCoder(
          KvCoder.of(
              AvroCoder.of(FeatureSetReference.class), ProtoCoder.of(FeatureSetSpec.class)));
}
 
Example #7
Source File: BeamKafkaTable.java    From beam with Apache License 2.0
private KafkaIO.Write<byte[], byte[]> createKafkaWrite() {
  return KafkaIO.<byte[], byte[]>write()
      .withBootstrapServers(bootstrapServers)
      .withTopic(topics.get(0))
      .withKeySerializer(ByteArraySerializer.class)
      .withValueSerializer(ByteArraySerializer.class);
}
 
Example #8
Source File: NexmarkLauncher.java    From beam with Apache License 2.0
/** Send {@code events} to Kafka. */
private void sinkEventsToKafka(PCollection<Event> events) {
  checkArgument((options.getBootstrapServers() != null), "Missing --bootstrapServers");
  NexmarkUtils.console("Writing events to Kafka Topic %s", options.getKafkaTopic());

  PCollection<byte[]> eventToBytes = events.apply("Event to bytes", ParDo.of(EVENT_TO_BYTEARRAY));
  eventToBytes.apply(
      KafkaIO.<Void, byte[]>write()
          .withBootstrapServers(options.getBootstrapServers())
          .withTopic(options.getKafkaTopic())
          .withValueSerializer(ByteArraySerializer.class)
          .values());
}
 
Example #9
Source File: NexmarkLauncher.java    From beam with Apache License 2.0
/** Send {@code formattedResults} to Kafka. */
private void sinkResultsToKafka(PCollection<String> formattedResults) {
  checkArgument((options.getBootstrapServers() != null), "Missing --bootstrapServers");
  NexmarkUtils.console("Writing results to Kafka Topic %s", options.getKafkaResultsTopic());

  formattedResults.apply(
      queryName + ".WriteKafkaResults",
      KafkaIO.<Void, String>write()
          .withBootstrapServers(options.getBootstrapServers())
          .withTopic(options.getKafkaResultsTopic())
          .withValueSerializer(StringSerializer.class)
          .values());
}
 
Example #10
Source File: SyntheticDataPublisher.java    From beam with Apache License 2.0
private static void writeToKafka(PCollection<KV<byte[], byte[]>> collection) {
  collection
      .apply("Map to Kafka messages", MapElements.via(new MapKVToString()))
      .apply(
          "Write to Kafka",
          KafkaIO.<Void, String>write()
              .withBootstrapServers(options.getKafkaBootstrapServerAddress())
              .withTopic(options.getKafkaTopic())
              .withValueSerializer(StringSerializer.class)
              .values());
}
 
Example #11
Source File: KafkaOutputPTransformRuntime.java    From components with Apache License 2.0
@Override
public PDone expand(PCollection<IndexedRecord> objectPCollection) {
    final boolean useAvro =
            properties.getDatasetProperties().valueFormat.getValue() == KafkaDatasetProperties.ValueFormat.AVRO;
    final String kafkaDatasetStringSchema = properties.getDatasetProperties().avroSchema.getValue();
    final boolean useCustomAvroSchema = properties.getDatasetProperties().isHierarchy.getValue();
    final IndexedRecordHelper indexedRecordHelper =
            new IndexedRecordHelper(kafkaDatasetStringSchema, useCustomAvroSchema);

    KafkaIO.Write<byte[], byte[]> kafkaWrite = KafkaIO
            .<byte[], byte[]> write()
            .withBootstrapServers(properties.getDatasetProperties().getDatastoreProperties().brokers.getValue())
            .withTopic(properties.getDatasetProperties().topic.getValue())
            .withKeySerializer(ByteArraySerializer.class)
            .withValueSerializer(ByteArraySerializer.class)
            .updateProducerProperties(KafkaConnection.createOutputMaps(properties));

    switch (properties.partitionType.getValue()) {
    case COLUMN: {
        PCollection pc1 = objectPCollection.apply(WithKeys.of(new ProduceKey(properties.keyColumn.getValue())));
        if (useAvro) {
            // TODO: for now, use the incoming Avro schema directly and do not check the configured schema; improve this later.
            return ((PCollection<KV<byte[], byte[]>>) pc1
                    .apply(ParDo.of(new AvroKVToByteArrayDoFn(indexedRecordHelper)))).apply(kafkaWrite);
        } else { // csv
            return ((PCollection<KV<byte[], byte[]>>) pc1
                    .apply(MapElements.via(new FormatCsvKV(properties.getDatasetProperties().getFieldDelimiter()))))
                            .apply(kafkaWrite);
        }
    }
    case ROUND_ROBIN: {
        if (useAvro) {
            // TODO: for now, use the incoming Avro schema directly and do not check the configured schema; improve this later.
            return (PDone) objectPCollection.apply(ParDo.of(new AvroToByteArrayDoFn(indexedRecordHelper))).apply(
                    kafkaWrite.values());
        } else { // csv
            return (PDone) objectPCollection
                    .apply(MapElements.via(new FormatCsv(properties.getDatasetProperties().getFieldDelimiter())))
                    .apply(kafkaWrite.values());
        }
    }
    default:
        throw new RuntimeException("To be implemented: " + properties.partitionType.getValue());
    }
}
 
Example #12
Source File: BeamKafkaOutputTransform.java    From hop with Apache License 2.0
@Override public PDone expand( PCollection<HopRow> input ) {

    try {
      // Only initialize once on this node/vm
      //
      BeamHop.init( transformPluginClasses, xpPluginClasses );

      // Inflate the metadata on the node where this is running...
      //
      IRowMeta rowMeta = JsonRowMeta.fromJson( rowMetaJson );

      int keyIndex = rowMeta.indexOfValue( keyField );
      if (keyIndex<0) {
        throw new HopException( "Unable to find key field "+keyField+" in input row: "+rowMeta.toString() );
      }
      int messageIndex = rowMeta.indexOfValue( messageField );
      if (messageIndex<0) {
        throw new HopException( "Unable to find message field "+messageField+" in input row: "+rowMeta.toString() );
      }

      // First convert the input stream of HopRows to KV<String,String> for the keys and messages
      //
      HopRowToKVStringStringFn hopRowToKVStringStringFn = new HopRowToKVStringStringFn( transformName, keyIndex, messageIndex, rowMetaJson, transformPluginClasses, xpPluginClasses );

      // Then write to Kafka topic
      //
      KafkaIO.Write<String, String> stringsToKafka = KafkaIO.<String, String>write()
        .withBootstrapServers( bootstrapServers )
        .withTopic( topic )
        .withKeySerializer( StringSerializer.class )
        .withValueSerializer( StringSerializer.class );
      // TODO: add features like compression
      //

      PCollection<KV<String, String>> kvpCollection = input.apply( ParDo.of( hopRowToKVStringStringFn ) );
      return kvpCollection.apply( stringsToKafka );
    } catch ( Exception e ) {
      numErrors.inc();
      LOG.error( "Error in Beam Kafka output transform", e );
      throw new RuntimeException( "Error in Beam Kafka output transform", e );
    }
  }
 
Example #13
Source File: KafkaInputPTransformRuntime.java    From components with Apache License 2.0
@Override
public PCollection<IndexedRecord> expand(PBegin pBegin) {

    KafkaIO.Read<byte[], byte[]> kafkaRead = KafkaIO
            .<byte[], byte[]> read()
            .withBootstrapServers(properties.getDatasetProperties().getDatastoreProperties().brokers.getValue())
            .withTopics(Arrays.asList(new String[] { properties.getDatasetProperties().topic.getValue() }))
            .updateConsumerProperties(KafkaConnection.createInputMaps(properties))
            .withKeyDeserializer(ByteArrayDeserializer.class)
            .withValueDeserializer(ByteArrayDeserializer.class);

    if (properties.useMaxReadTime.getValue()) {
        kafkaRead = kafkaRead.withMaxReadTime(new Duration(properties.maxReadTime.getValue()));
    }
    if (properties.useMaxNumRecords.getValue()) {
        kafkaRead = kafkaRead.withMaxNumRecords(properties.maxNumRecords.getValue());
    }
    // only consider the value of the Kafka record, no matter which format is selected
    PCollection<byte[]> kafkaRecords = pBegin
            .apply(kafkaRead) //
            .apply(ParDo.of(new ExtractRecord())) //
            .apply(Values.<byte[]> create());
    switch (properties.getDatasetProperties().valueFormat.getValue()) {
    case AVRO: {
        Schema schema = null;
        if (properties.getDatasetProperties().isHierarchy.getValue()) {
            // use the configured Avro schema directly; arguably this should be resolved at design time
            schema = new Schema.Parser().parse(properties.getDatasetProperties().avroSchema.getValue());
        } else {
            // use the component's schema directly, since the data is already Avro
            schema = properties.getDatasetProperties().main.schema.getValue();
        }
        }
        return kafkaRecords.apply(ParDo.of(new ConvertToAvro(schema.toString())));
    }
    case CSV: {
        // FIXME(bchen): KafkaAvroRegistry does not have a way to record adaptation; it infers the schema from
        // the data rather than using the defined schema
        return kafkaRecords
                .apply(ParDo.of(new ExtractCsvSplit(properties.getDatasetProperties().getFieldDelimiter())))
                .apply(ConvertToIndexedRecord.<String[]>of());
    }
    default:
        throw new RuntimeException(
                "To be implemented: " + properties.getDatasetProperties().valueFormat.getValue());
    }

}
 
Example #14
Source File: KafkaToBigQuery.java    From java-docs-samples with Apache License 2.0
public static void main(final String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);

  var pipeline = Pipeline.create(options);
  pipeline
      .apply("Read messages from Kafka",
          KafkaIO.<String, String>read()
              .withBootstrapServers(options.getBootstrapServer())
              .withTopic(options.getInputTopic())
              .withKeyDeserializer(StringDeserializer.class)
              .withValueDeserializer(StringDeserializer.class)
              .withoutMetadata())
      .apply("Get message contents", Values.<String>create())
      .apply("Log messages", MapElements.into(TypeDescriptor.of(String.class))
          .via(message -> {
            LOG.info("Received: {}", message);
            return message;
          }))
      .apply("Parse JSON", MapElements.into(TypeDescriptor.of(PageRating.class))
          .via(message -> GSON.fromJson(message, PageRating.class)))

      .apply("Add processing time", WithTimestamps.of((pageRating) -> new Instant(pageRating.processingTime)))
      .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))

      .apply("Convert to BigQuery TableRow", MapElements.into(TypeDescriptor.of(TableRow.class))
          .via(pageRating -> new TableRow()
              .set("processing_time", pageRating.processingTime.toString())
              .set("url", pageRating.url)
              .set("rating", pageRating.rating)))
      .apply("Write to BigQuery", BigQueryIO.writeTableRows()
          .to(options.getOutputTable())
          .withSchema(new TableSchema().setFields(Arrays.asList(
              new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"),
              new TableFieldSchema().setName("url").setType("STRING"),
              new TableFieldSchema().setName("rating").setType("STRING"))))
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));

  // For a Dataflow Flex Template, do NOT waitUntilFinish().
  pipeline.run();
}
 
Example #15
Source File: SparkRunnerDebuggerTest.java    From beam with Apache License 2.0
@Test
public void debugStreamingPipeline() {
  TestSparkPipelineOptions options =
      PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
  options.setForceStreaming(true);
  options.setRunner(SparkRunnerDebugger.class);

  Pipeline pipeline = Pipeline.create(options);

  KafkaIO.Read<String, String> read =
      KafkaIO.<String, String>read()
          .withBootstrapServers("mykafka:9092")
          .withTopics(Collections.singletonList("my_input_topic"))
          .withKeyDeserializer(StringDeserializer.class)
          .withValueDeserializer(StringDeserializer.class);

  KafkaIO.Write<String, String> write =
      KafkaIO.<String, String>write()
          .withBootstrapServers("myotherkafka:9092")
          .withTopic("my_output_topic")
          .withKeySerializer(StringSerializer.class)
          .withValueSerializer(StringSerializer.class);

  KvCoder<String, String> stringKvCoder = KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());

  pipeline
      .apply(read.withoutMetadata())
      .setCoder(stringKvCoder)
      .apply(Window.into(FixedWindows.of(Duration.standardSeconds(5))))
      .apply(ParDo.of(new SparkRunnerDebuggerTest.FormatKVFn()))
      .apply(Distinct.create())
      .apply(WithKeys.of(new SparkRunnerDebuggerTest.ArbitraryKeyFunction()))
      .apply(write);

  final String expectedPipeline =
      "KafkaUtils.createDirectStream(...)\n"
          + "_.map(new org.apache.beam.sdk.transforms.windowing.FixedWindows())\n"
          + "_.mapPartitions(new org.apache.beam.runners.spark."
          + "SparkRunnerDebuggerTest$FormatKVFn())\n"
          + "_.mapPartitions(new org.apache.beam.sdk.transforms.Contextful())\n"
          + "_.groupByKey()\n"
          + "_.map(new org.apache.beam.sdk.transforms.Combine$IterableCombineFn())\n"
          + "_.mapPartitions(new org.apache.beam.sdk.transforms.Distinct$3())\n"
          + "_.mapPartitions(new org.apache.beam.sdk.transforms.Contextful())\n"
          + "_.<org.apache.beam.sdk.io.kafka.AutoValue_KafkaIO_Write>";

  SparkRunnerDebugger.DebugSparkPipelineResult result =
      (SparkRunnerDebugger.DebugSparkPipelineResult) pipeline.run();

  assertThat(
      "Debug pipeline did not equal expected",
      result.getDebugString(),
      Matchers.equalTo(expectedPipeline));
}
 
Example #16
Source File: ResumeFromCheckpointStreamingTest.java    From beam with Apache License 2.0
private SparkPipelineResult run(Optional<Instant> stopWatermarkOption, int expectedAssertions) {
  KafkaIO.Read<String, Instant> read =
      KafkaIO.<String, Instant>read()
          .withBootstrapServers(EMBEDDED_KAFKA_CLUSTER.getBrokerList())
          .withTopics(Collections.singletonList(TOPIC))
          .withKeyDeserializer(StringDeserializer.class)
          .withValueDeserializer(InstantDeserializer.class)
          .withConsumerConfigUpdates(ImmutableMap.of("auto.offset.reset", "earliest"))
          .withTimestampFn(KV::getValue)
          .withWatermarkFn(
              kv -> {
                // at EOF move WM to infinity.
                String key = kv.getKey();
                Instant instant = kv.getValue();
                return "EOF".equals(key) ? BoundedWindow.TIMESTAMP_MAX_VALUE : instant;
              });

  TestSparkPipelineOptions options =
      PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
  options.setSparkMaster("local[*]");
  options.setCheckpointDurationMillis(options.getBatchIntervalMillis());
  options.setExpectedAssertions(expectedAssertions);
  options.setRunner(TestSparkRunner.class);
  options.setEnableSparkMetricSinks(false);
  options.setForceStreaming(true);
  options.setCheckpointDir(temporaryFolder.getRoot().getPath());
  // timeout is per execution so it can be injected by the caller.
  if (stopWatermarkOption.isPresent()) {
    options.setStopPipelineWatermark(stopWatermarkOption.get().getMillis());
  }

  Pipeline p = Pipeline.create(options);

  PCollection<String> expectedCol =
      p.apply(Create.of(ImmutableList.of("side1", "side2")).withCoder(StringUtf8Coder.of()));
  PCollectionView<List<String>> view = expectedCol.apply(View.asList());

  PCollection<KV<String, Instant>> kafkaStream = p.apply(read.withoutMetadata());

  PCollection<Iterable<String>> grouped =
      kafkaStream
          .apply(Keys.create())
          .apply("EOFShallNotPassFn", ParDo.of(new EOFShallNotPassFn(view)).withSideInputs(view))
          .apply(
              Window.<String>into(FixedWindows.of(Duration.millis(500)))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .accumulatingFiredPanes()
                  .withAllowedLateness(Duration.ZERO))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create());

  grouped.apply(new PAssertWithoutFlatten<>("k1", "k2", "k3", "k4", "k5"));

  return (SparkPipelineResult) p.run();
}
 
Example #17
Source File: KafkaCSVTestTable.java    From beam with Apache License 2.0
@Override
KafkaIO.Read<byte[], byte[]> createKafkaRead() {
  return super.createKafkaRead().withConsumerFactoryFn(this::mkMockConsumer);
}
 
Example #18
Source File: KafkaToGCS.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs a pipeline which reads messages from Kafka and writes them to GCS.
 *
 * @param options arguments to the pipeline
 */
public static PipelineResult run(KafkaToGCSOptions options) {

  String outputFileFormat = options.getOutputFileFormat().toUpperCase();
  LOG.info("Requested File Format is " + outputFileFormat);

  final String errorMessage =
      "Invalid output format:"
          + outputFileFormat
          + ". Supported output formats:"
          + FileFormatFactory.EXPECTED_FILE_FORMAT;

  // Call the function to check File Format passed by user is valid.
  if (!WriteToGCSUtility.isValidFileFormat(outputFileFormat)) {
    LOG.info(errorMessage);
    throw new IllegalArgumentException(errorMessage);
  }

  List<String> topicsList = new ArrayList<>(Arrays.asList(options.getInputTopics().split(",")));

  checkArgument(
      topicsList.size() > 0 && topicsList.get(0).length() > 0,
      "inputTopics cannot be an empty string. ");

  // Create the pipeline
  Pipeline pipeline = Pipeline.create(options);

  /**
   * Steps: 1) Read messages in from Kafka. 2) Window the messages into minute intervals specified
   * by the executor. 3) Write To GCS in user defined format.
   */
  PCollection<KV<String, String>> records =
      pipeline
          /*
           * Step #1: Read messages in from Kafka using {@link KafkaIO} and create a PCollection
           * of KV<String, String>.
           */
          .apply(
              "Read From Kafka",
              KafkaIO.<String, String>read()
                  .withConsumerConfigUpdates(
                      ImmutableMap.of(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"))
                  .withBootstrapServers(options.getBootstrapServers())
                  .withTopics(topicsList)
                  .withKeyDeserializerAndCoder(
                      StringDeserializer.class, NullableCoder.of(StringUtf8Coder.of()))
                  .withValueDeserializerAndCoder(
                      StringDeserializer.class, NullableCoder.of(StringUtf8Coder.of()))
                  .withoutMetadata())
          /* Step #2: Window the messages into minute intervals specified by the executor. */
          .apply(
              "Creating " + options.getWindowDuration() + " Window",
              Window.into(
                  FixedWindows.of(DurationUtils.parseDuration(options.getWindowDuration()))));

  /* Step #3: Write To GCS in user defined format using the {@link FileFormatFactory}. */
  records.apply("Write To GCS", FileFormatFactory.newBuilder().setOptions(options).build());

  return pipeline.run();
}
 
Example #19
Source File: BeamKafkaInputTransform.java    From kettle-beam with Apache License 2.0
@Override public PCollection<KettleRow> expand( PBegin input ) {
  try {
    // Only initialize once on this node/vm
    //
    BeamKettle.init( stepPluginClasses, xpPluginClasses );

    // What's the list of topics?
    //
    List<String> topicList = new ArrayList<>();
    for ( String topic : topics.split( "," ) ) {
      topicList.add( Const.trim( topic ) );
    }

    // TODO: add custom configuration options to this map:
    Map<String, Object> consumerConfigUpdates = new HashMap<>(  );
    consumerConfigUpdates.put( "group.id", groupId );
    for (ConfigOption configOption : configOptions) {
      Object value;
      String optionValue = configOption.getValue();
      switch(configOption.getType()) {
        case String: value = optionValue; break;
        case Short: value = Short.valueOf( optionValue ); break;
        case Int: value = Integer.valueOf( optionValue ); break;
        case Long: value = Long.valueOf( optionValue ); break;
        case Double: value = Double.valueOf( optionValue ); break;
        case Boolean: value = Boolean.valueOf( optionValue ); break;
        default:
          throw new RuntimeException( "Config option parameter "+configOption.getParameter()+" uses unsupported type "+configOption.getType().name() );
      }
      consumerConfigUpdates.put(configOption.getParameter(), value);
    }

    KafkaIO.Read<String, String> io = KafkaIO.<String, String>read()
      .withBootstrapServers( bootstrapServers )
      .withConsumerConfigUpdates( consumerConfigUpdates )
      .withTopics( topicList )
      .withKeyDeserializer( StringDeserializer.class )
      .withValueDeserializer( StringDeserializer.class );

    if (usingProcessingTime) {
      io = io.withProcessingTime();
    }
    if (usingLogAppendTime) {
      io = io.withLogAppendTime();
    }
    if (usingCreateTime) {
      io = io.withCreateTime( Duration.ZERO ); // TODO Configure this
    }

    if (restrictedToCommitted) {
      io = io.withReadCommitted();
    }
    if (allowingCommitOnConsumedOffset) {
      io = io.commitOffsetsInFinalize();
    }

    // Read keys and values from Kafka
    //
    PCollection<KV<String, String>> kafkaConsumerOutput = input.apply( io.withoutMetadata() );

    // Now convert this into Kettle rows with a single String value in them
    //
    PCollection<KettleRow> output = kafkaConsumerOutput.apply(
      ParDo.of(new KVStringStringToKettleRowFn( stepname, rowMetaJson, stepPluginClasses, xpPluginClasses ))
    );

    return output;

  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error in Kafka input transform", e );
    throw new RuntimeException( "Error in Kafka input transform", e );
  }
}
 
Example #20
Source File: BeamKafkaOutputTransform.java    From kettle-beam with Apache License 2.0
@Override public PDone expand( PCollection<KettleRow> input ) {

    try {
      // Only initialize once on this node/vm
      //
      BeamKettle.init( stepPluginClasses, xpPluginClasses );

      // Inflate the metadata on the node where this is running...
      //
      RowMetaInterface rowMeta = JsonRowMeta.fromJson( rowMetaJson );

      int keyIndex = rowMeta.indexOfValue( keyField );
      if (keyIndex<0) {
        throw new KettleException( "Unable to find key field "+keyField+" in input row: "+rowMeta.toString() );
      }
      int messageIndex = rowMeta.indexOfValue( messageField );
      if (messageIndex<0) {
        throw new KettleException( "Unable to find message field "+messageField+" in input row: "+rowMeta.toString() );
      }

      // First convert the input stream of KettleRows to KV<String,String> for the keys and messages
      //
      KettleRowToKVStringStringFn kettleRowToKVStringStringFn = new KettleRowToKVStringStringFn( stepname, keyIndex, messageIndex, rowMetaJson, stepPluginClasses, xpPluginClasses );

      // Then write to Kafka topic
      //
      KafkaIO.Write<String, String> stringsToKafka = KafkaIO.<String, String>write()
        .withBootstrapServers( bootstrapServers )
        .withTopic( topic )
        .withKeySerializer( StringSerializer.class )
        .withValueSerializer( StringSerializer.class );
      // TODO: add features like compression
      //

      PCollection<KV<String, String>> kvpCollection = input.apply( ParDo.of( kettleRowToKVStringStringFn ) );
      return kvpCollection.apply( stringsToKafka );
    } catch ( Exception e ) {
      numErrors.inc();
      LOG.error( "Error in Beam Kafka output transform", e );
      throw new RuntimeException( "Error in Beam Kafka output transform", e );
    }
  }
 
Example #21
Source File: BeamKafkaInputTransform.java    From hop with Apache License 2.0
@Override public PCollection<HopRow> expand( PBegin input ) {
  try {
    // Only initialize once on this node/vm
    //
    BeamHop.init( transformPluginClasses, xpPluginClasses );

    // What's the list of topics?
    //
    List<String> topicList = new ArrayList<>();
    for ( String topic : topics.split( "," ) ) {
      topicList.add( Const.trim( topic ) );
    }

    // TODO: add custom configuration options to this map:
    Map<String, Object> consumerConfigUpdates = new HashMap<>(  );
    consumerConfigUpdates.put( "group.id", groupId );
    for (ConfigOption configOption : configOptions) {
      Object value;
      String optionValue = configOption.getValue();
      switch(configOption.getType()) {
        case String: value = optionValue; break;
        case Short: value = Short.valueOf( optionValue ); break;
        case Int: value = Integer.valueOf( optionValue ); break;
        case Long: value = Long.valueOf( optionValue ); break;
        case Double: value = Double.valueOf( optionValue ); break;
        case Boolean: value = Boolean.valueOf( optionValue ); break;
        default:
          throw new RuntimeException( "Config option parameter "+configOption.getParameter()+" uses unsupported type "+configOption.getType().name() );
      }
      consumerConfigUpdates.put(configOption.getParameter(), value);
    }

    KafkaIO.Read<String, String> io = KafkaIO.<String, String>read()
      .withBootstrapServers( bootstrapServers )
      .withConsumerConfigUpdates( consumerConfigUpdates )
      .withTopics( topicList )
      .withKeyDeserializer( StringDeserializer.class )
      .withValueDeserializer( StringDeserializer.class );

    if (usingProcessingTime) {
      io = io.withProcessingTime();
    }
    if (usingLogAppendTime) {
      io = io.withLogAppendTime();
    }
    if (usingCreateTime) {
      io = io.withCreateTime( Duration.ZERO ); // TODO Configure this
    }

    if (restrictedToCommitted) {
      io = io.withReadCommitted();
    }
    if (allowingCommitOnConsumedOffset) {
      io = io.commitOffsetsInFinalize();
    }

    // Read keys and values from Kafka
    //
    PCollection<KV<String, String>> kafkaConsumerOutput = input.apply( io.withoutMetadata() );

    // Now convert this into Hop rows with a single String value in them
    //
    PCollection<HopRow> output = kafkaConsumerOutput.apply(
      ParDo.of(new KVStringStringToHopRowFn( transformName, rowMetaJson, transformPluginClasses, xpPluginClasses ))
    );

    return output;

  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error in Kafka input transform", e );
    throw new RuntimeException( "Error in Kafka input transform", e );
  }
}