com.google.cloud.dataflow.sdk.Pipeline Java Examples

The following examples show how to use com.google.cloud.dataflow.sdk.Pipeline. Each snippet is taken from an open-source project; the source file and license are noted above each example.
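
Most of the examples below follow the same skeleton: build PipelineOptions from the command line, create a Pipeline from those options, apply one or more transforms, and call run(). The minimal sketch below illustrates that skeleton using Dataflow SDK 1.x classes; the class name MinimalPipelineExample and the sample elements are made up for illustration and are not taken from any of the projects listed here.

// Minimal sketch of the common Pipeline skeleton used throughout these examples.
// Illustrative only: MinimalPipelineExample and the sample values are hypothetical.
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.values.PCollection;

public class MinimalPipelineExample {
  public static void main(String[] args) {
    // Parse and validate pipeline options from command-line arguments.
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();

    // Create the pipeline; the configured runner decides where it executes.
    Pipeline p = Pipeline.create(options);

    // Apply a source transform; real pipelines chain further transforms here.
    PCollection<String> words =
        p.apply(Create.of("hello", "world")).setCoder(StringUtf8Coder.of());

    // Execute the pipeline.
    p.run();
  }
}
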
Example #1
Source File: FXTimeSeriesPipelineSRGTests.java    From data-timeseries-java with Apache License 2.0
@org.junit.Test
public void testCompleteWindowData() {

  Pipeline pipeline = setup();

  List<KV<String, TSProto>> pipelineData = GenerateSampleData.getTestData();
  List<KV<String, TSProto>> testData = new ArrayList<KV<String, TSProto>>(pipelineData);
  WorkPacketConfig packetConfig = GenerateSampleData.generateWorkPacketConfig(2);

  PCollection<KV<String, TSProto>> completeWindowData =
      generateCompleteWindowData(pipeline, pipelineData, packetConfig);

  testData.add(KV.of(GenerateSampleData.TS3, TSProto.newBuilder().setKey(GenerateSampleData.TS3)
          .setIsLive(false).setTime(1451577839999L).build()));
  testData.add(KV.of(GenerateSampleData.TS4, TSProto.newBuilder().setKey(GenerateSampleData.TS4)
          .setIsLive(false).setTime(1451577839999L).build()));
  
  DataflowAssert.that(completeWindowData).containsInAnyOrder(testData);
  pipeline.run();
}
 
Example #2
Source File: FXTimeSeriesPipelineSRGTests.java    From data-timeseries-java with Apache License 2.0
public PCollection<KV<String, TSProto>> setupDataInput(Pipeline pipeline,
    List<KV<String, TSProto>> data) {


  // Assign event-time timestamps to each element based on its TSProto time value.
  PCollection<KV<String, TSProto>> tsData =
      pipeline.apply("ReadData", Create.of(data))
          .apply(ParDo.of(new DoFn<KV<String, TSProto>, KV<String, TSProto>>() {

            @Override
            public void processElement(ProcessContext c) throws Exception {
              c.outputWithTimestamp(c.element(),
                  new DateTime(c.element().getValue().getTime()).toInstant());

            }

          })).setName("Assign TimeStamps");
  return tsData;

}
 
Example #3
Source File: TimeSeriesCoders.java    From data-timeseries-java with Apache License 2.0
public static void registerCoders(Pipeline pipeline) {

		LOG.debug("Register TSProto coder");
		pipeline.getCoderRegistry().registerCoder(TSProto.class, ProtoCoder.of(TSProto.class));

		LOG.debug("Register TSAggValueProto coder");
		pipeline.getCoderRegistry().registerCoder(TSAggValueProto.class, ProtoCoder.of(TSAggValueProto.class));
		LOG.debug("Register WorkPacketConfig coder");
		pipeline.getCoderRegistry().registerCoder(WorkPacketConfig.class, ProtoCoder.of(WorkPacketConfig.class));
		LOG.debug("Register WorkPacketKey coder");
		pipeline.getCoderRegistry().registerCoder(WorkPacketKey.class, ProtoCoder.of(WorkPacketKey.class));
		LOG.debug("Register WorkDataPoint coder");
		pipeline.getCoderRegistry().registerCoder(WorkDataPoint.class, ProtoCoder.of(WorkDataPoint.class));
		LOG.debug("Register WorkPartition coder");
		pipeline.getCoderRegistry().registerCoder(WorkPartition.class, ProtoCoder.of(WorkPartition.class));
		LOG.debug("Register Correlation coder");
		pipeline.getCoderRegistry().registerCoder(Correlation.class, ProtoCoder.of(Correlation.class));

	}
 
Example #4
Source File: ReadSourceITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String resultPath) {

		Pipeline p = FlinkTestPipeline.createForBatch();

		PCollection<String> result = p
				.apply(Read.from(new ReadSource(1, 10)))
				.apply(ParDo.of(new DoFn<Integer, String>() {
					@Override
					public void processElement(ProcessContext c) throws Exception {
						c.output(c.element().toString());
					}
				}));

		result.apply(TextIO.Write.to(resultPath));
		p.run();
	}
 
Example #5
Source File: RemoveDuplicatesITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	List<String> strings = Arrays.asList("k1", "k5", "k5", "k2", "k1", "k2", "k3");

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> input =
			p.apply(Create.of(strings))
					.setCoder(StringUtf8Coder.of());

	PCollection<String> output =
			input.apply(RemoveDuplicates.<String>create());

	output.apply(TextIO.Write.to(resultPath));
	p.run();
}
 
Example #6
Source File: TaskRunner.java    From dockerflow with Apache License 2.0
/** Run a Docker workflow on Dataflow. */
public static void run(Workflow w, Map<String, WorkflowArgs> a, DataflowPipelineOptions o)
    throws IOException {
  LOG.info("Running workflow graph");
  if (w.getArgs().getProjectId() == null) {
    throw new IllegalArgumentException("Project id is required");
  }

  Pipeline p = DataflowFactory.dataflow(w, a, o);

  LOG.info("Created Dataflow pipeline");
  LOG.debug(w.toString());

  PipelineResult r = p.run();

  LOG.info("Dataflow pipeline completed");
  LOG.info("Result state: " + r.getState());
}
 
Example #7
Source File: SideInputITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {


	Pipeline p = FlinkTestPipeline.createForBatch();


	final PCollectionView<String> sidesInput = p
			.apply(Create.of(expected))
			.apply(View.<String>asSingleton());

	p.apply(Create.of("bli"))
			.apply(ParDo.of(new DoFn<String, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					String s = c.sideInput(sidesInput);
					c.output(s);
				}
			}).withSideInputs(sidesInput)).apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example #8
Source File: TfIdfITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

	PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf = pipeline
			.apply(Create.of(
					KV.of(new URI("x"), "a b c d"),
					KV.of(new URI("y"), "a b c"),
					KV.of(new URI("z"), "a m n")))
			.apply(new TfIdf.ComputeTfIdf());

	PCollection<String> words = wordToUriAndTfIdf
			.apply(Keys.<String>create())
			.apply(RemoveDuplicates.<String>create());

	words.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}
 
Example #9
Source File: WordCountJoin2ITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	/* Create two PCollections and join them */
	PCollection<KV<String,Long>> occurences1 = p.apply(Create.of(WORDS_1))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> occurences2 = p.apply(Create.of(WORDS_2))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	/* CoGroup the two collections */
	PCollection<KV<String, CoGbkResult>> mergedOccurences = KeyedPCollectionTuple
			.of(tag1, occurences1)
			.and(tag2, occurences2)
			.apply(CoGroupByKey.<String>create());

	/* Format output */
	mergedOccurences.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	p.run();
}
 
Example #10
Source File: RemoveDuplicatesEmptyITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	List<String> strings = Collections.emptyList();

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> input =
			p.apply(Create.of(strings))
					.setCoder(StringUtf8Coder.of());

	PCollection<String> output =
			input.apply(RemoveDuplicates.<String>create());

	output.apply(TextIO.Write.to(resultPath));
	p.run();
}
 
Example #11
Source File: ExportedServiceAccountKeyRemover.java    From policyscanner with Apache License 2.0
private PCollection<String> constructPipeline(Pipeline pipeline, String org) {
  // Read projects from the CRM API.
  PCollection<GCPProject> projects =
      pipeline.apply(Read.from(new LiveProjectSource(org)));
  // List the service accounts of the projects.
  PCollection<GCPServiceAccount> serviceAccounts =
      projects.apply(ParDo.named("List Service Accounts").of(new ListServiceAccounts()));
  // List the keys of the service accounts.
  PCollection<GCPServiceAccountKey> serviceAccountKeys =
      serviceAccounts.apply(ParDo.named("List Service Account Keys")
          .of(new ListServiceAccountKeys()));
  // Construct an alert message for all the discrepancies found.
  return serviceAccountKeys.apply(ParDo
      .named("Remove user-managed keys")
      .of(new ExportedServiceAccountKeyMessenger()));
}
 
Example #12
Source File: UnboundedSourceITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String resultPath) {

		Pipeline p = FlinkTestPipeline.createForStreaming();

		PCollection<String> result = p
			.apply(Read.from(new RangeReadSource(1, 10)))
			.apply(Window.<Integer>into(new GlobalWindows())
				.triggering(AfterPane.elementCountAtLeast(10))
				.discardingFiredPanes())
			.apply(ParDo.of(new DoFn<Integer, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					c.output(c.element().toString());
				}
			}));

		result.apply(TextIO.Write.to(resultPath));

		try {
			p.run();
			fail();
		} catch(Exception e) {
			assertEquals("The source terminates as expected.", e.getCause().getCause().getMessage());
		}
	}
 
Example #13
Source File: FilterRides.java    From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
      .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
      .timestampLabel("ts")
      .withCoder(TableRowJsonCoder.of()))

   .apply("filter lower Manhattan", ParDo.of(new FilterLowerManhattan()))

   .apply(PubsubIO.Write.named("WriteToPubsub")
      .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
      .withCoder(TableRowJsonCoder.of()));
  p.run();
}
 
Example #14
Source File: LoadBooks.java    From cloud-bigtable-examples with Apache License 2.0
public static void main(String[] args) {
  // CloudBigtableOptions is one way to retrieve the options.  It's not required.
  // https://github.com/GoogleCloudPlatform/cloud-bigtable-examples/blob/master/java/dataflow-connector-examples/src/main/java/com/google/cloud/bigtable/dataflow/example/HelloWorldWrite.java
  BigtableCsvOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtableCsvOptions.class);
  CloudBigtableTableConfiguration config =
      CloudBigtableTableConfiguration.fromCBTOptions(options);

  Pipeline p = Pipeline.create(options);

  CloudBigtableIO.initializeForWrite(p);

  PCollection<KV<String, Integer>> ngrams =
      applyPipelineToParseBooks(p.apply(TextIO.Read.from(options.getInputFile())));
  PCollection<Mutation> mutations = ngrams.apply(ParDo.of(ENCODE_NGRAM));
  mutations.apply(CloudBigtableIO.writeToTable(config));

  // Run the pipeline.
  p.run();
}
 
Example #15
Source File: LoadBooksTest.java    From cloud-bigtable-examples with Apache License 2.0
@Test
public void parseBooks_returnsNgramsCounts() {
  // Arrange
  Pipeline p = TestPipeline.create();
  PCollection<String> input = p.apply(Create.of(testFile));

  // Act
  PCollection<KV<String, Integer>> output = LoadBooks.applyPipelineToParseBooks(input);

  // Assert
  DataflowAssert.that(output)
      .containsInAnyOrder(
          KV.of("despatch when art", 10),
          KV.of("despatch when came", 10),
          KV.of("despatch when published", 12),
          KV.of("despatch where was", 10),
          KV.of("despatch which made", 45),
          // There are two entries for "despatch which addressed".
          // Each entry has a different part of speech for "addressed".
          KV.of("despatch which addressed", 12 + 46),
          KV.of("despatch which admitted", 13),
          KV.of("despatch which allow", 14),
          KV.of("despatch which announced", 50),
          KV.of("despatch which answer", 32));
}
 
Example #16
Source File: CountRides.java    From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
      .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
      .timestampLabel("ts")
      .withCoder(TableRowJsonCoder.of()))

   .apply("window 1s", Window.into(FixedWindows.of(Duration.standardSeconds(1))))
   .apply("mark rides", MapElements.via(new MarkRides()))
   .apply("count similar", Count.perKey())
   .apply("format rides", MapElements.via(new TransformRides()))

   .apply(PubsubIO.Write.named("WriteToPubsub")
      .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
      .withCoder(TableRowJsonCoder.of()));

  p.run();
}
 
Example #17
Source File: CoinbaseSource.java    From cloud-bigtable-examples with Apache License 2.0
public static void main(String[] args) {
  CloudBigtableOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CloudBigtableOptions.class);

  CloudBigtableScanConfiguration config =
      new CloudBigtableScanConfiguration.Builder()
          .withProjectId(options.getBigtableProjectId())
          .withInstanceId(options.getBigtableInstanceId())
          .withTableId(options.getBigtableTableId())
          .build();

  options.setStreaming(true);
  options.setRunner(DataflowPipelineRunner.class);

  Pipeline p = Pipeline.create(options);
  CloudBigtableIO.initializeForWrite(p);

  p.apply(Read.from(new CoinbaseSource()))
      .apply(ParDo.named("DeserializeCoinbase").of(new DeserializeCoinbase()))
      .apply(ParDo.of(new HBaseBigtableWriter()))
      .apply(CloudBigtableIO.writeToTable(config));

  p.run();
}
 
Example #18
Source File: JoinExamples.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws Exception {
	Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
	options.setStreaming(true);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	PTransform<? super PBegin, PCollection<String>> readSourceA =
			Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("FirstStream");
	PTransform<? super PBegin, PCollection<String>> readSourceB =
			Read.from(new UnboundedSocketSource<>("localhost", 9998, '\n', 3)).named("SecondStream");

	WindowFn<Object, ?> windowFn = FixedWindows.of(Duration.standardSeconds(options.getWindowSize()));

	Pipeline p = Pipeline.create(options);

	// the following two 'applys' create multiple inputs to our pipeline, one for each
	// of our two input sources.
	PCollection<String> streamA = p.apply(readSourceA)
			.apply(Window.<String>into(windowFn)
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());
	PCollection<String> streamB = p.apply(readSourceB)
			.apply(Window.<String>into(windowFn)
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	PCollection<String> formattedResults = joinEvents(streamA, streamB);
	formattedResults.apply(TextIO.Write.to("./outputJoin.txt"));
	p.run();
}
 
Example #19
Source File: WordCount.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) {

		Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
				.as(Options.class);
		options.setRunner(FlinkPipelineRunner.class);

		Pipeline p = Pipeline.create(options);

		p.apply(TextIO.Read.named("ReadLines").from(options.getInput()))
				.apply(new CountWords())
				.apply(MapElements.via(new FormatAsTextFn()))
				.apply(TextIO.Write.named("WriteCounts").to(options.getOutput()));

		p.run();
	}
 
Example #20
Source File: KafkaWindowedWordCountExample.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) {
	PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
	KafkaStreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
	options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
	options.setStreaming(true);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	System.out.println(options.getKafkaTopic() +" "+ options.getZookeeper() +" "+ options.getBroker() +" "+ options.getGroup() );
	Pipeline pipeline = Pipeline.create(options);

	Properties p = new Properties();
	p.setProperty("zookeeper.connect", options.getZookeeper());
	p.setProperty("bootstrap.servers", options.getBroker());
	p.setProperty("group.id", options.getGroup());

	// this is the Flink consumer that reads the input to
	// the program from a kafka topic.
	FlinkKafkaConsumer08<String> kafkaConsumer = new FlinkKafkaConsumer08<>(
			options.getKafkaTopic(),
			new SimpleStringSchema(), p);

	PCollection<String> words = pipeline
			.apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Window.<String>into(FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	PCollection<KV<String, Long>> wordCounts =
			words.apply(Count.<String>perElement());

	wordCounts.apply(ParDo.of(new FormatAsStringFn()))
			.apply(TextIO.Write.to("./outputKafka.txt"));

	pipeline.run();
}
 
Example #21
Source File: TFIDF.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws Exception {
	Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);

	options.setRunner(FlinkPipelineRunner.class);

	Pipeline pipeline = Pipeline.create(options);
	pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

	pipeline
			.apply(new ReadDocuments(listInputDocuments(options)))
			.apply(new ComputeTfIdf())
			.apply(new WriteTfIdf(options.getOutput()));

	pipeline.run();
}
 
Example #22
Source File: WindowedWordCount.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws IOException {
	StreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(StreamingWordCountOptions.class);
	options.setStreaming(true);
	options.setWindowSize(10L);
	options.setSlide(5L);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	LOG.info("Windpwed WordCount with Sliding Windows of " + options.getWindowSize() +
			" sec. and a slide of " + options.getSlide());

	Pipeline pipeline = Pipeline.create(options);

	PCollection<String> words = pipeline
			.apply(Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("StreamingWordCount"))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Window.<String>into(SlidingWindows.of(Duration.standardSeconds(options.getWindowSize()))
					.every(Duration.standardSeconds(options.getSlide())))
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	PCollection<KV<String, Long>> wordCounts =
			words.apply(Count.<String>perElement());

	wordCounts.apply(ParDo.of(new FormatAsStringFn()))
			.apply(TextIO.Write.to("./outputWordCount.txt"));

	pipeline.run();
}
 
Example #23
Source File: AutoComplete.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws IOException {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);
  options.setCheckpointingInterval(1000L);
  options.setNumberOfExecutionRetries(5);
  options.setExecutionRetryDelay(3000L);
  options.setRunner(FlinkPipelineRunner.class);

  PTransform<? super PBegin, PCollection<String>> readSource =
          Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("WordStream");
  WindowFn<Object, ?> windowFn = FixedWindows.of(Duration.standardSeconds(options.getWindowSize()));

  // Create the pipeline.
  Pipeline p = Pipeline.create(options);
  PCollection<KV<String, List<CompletionCandidate>>> toWrite = p
    .apply(readSource)
    .apply(ParDo.of(new ExtractWordsFn()))
    .apply(Window.<String>into(windowFn)
            .triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
          .discardingFiredPanes())
    .apply(ComputeTopCompletions.top(10, options.getRecursive()));

  toWrite
    .apply(ParDo.named("FormatForPerTaskFile").of(new FormatForPerTaskLocalFile()))
    .apply(TextIO.Write.to("./outputAutoComplete.txt"));

  p.run();
}
 
Example #24
Source File: FlinkPipelineExecutionEnvironment.java    From flink-dataflow with Apache License 2.0
/**
 * Depending on if the job is a Streaming or a Batch one, this method creates
 * the necessary execution environment and pipeline translator, and translates
 * the {@link com.google.cloud.dataflow.sdk.values.PCollection} program into
 * a {@link org.apache.flink.api.java.DataSet} or {@link org.apache.flink.streaming.api.datastream.DataStream}
 * one.
 * */
public void translate(Pipeline pipeline) {
	checkInitializationState();
	if(this.flinkBatchEnv == null && this.flinkStreamEnv == null) {
		createPipelineExecutionEnvironment();
	}
	if (this.flinkPipelineTranslator == null) {
		createPipelineTranslator();
	}
	this.flinkPipelineTranslator.translate(pipeline);
}
 
Example #25
Source File: FlinkPipelineRunner.java    From flink-dataflow with Apache License 2.0
@Override
public FlinkRunnerResult run(Pipeline pipeline) {
	LOG.info("Executing pipeline using FlinkPipelineRunner.");

	LOG.info("Translating pipeline to Flink program.");

	this.flinkJobEnv.translate(pipeline);

	LOG.info("Starting execution of Flink program.");
	
	JobExecutionResult result;
	try {
		result = this.flinkJobEnv.executePipeline();
	} catch (Exception e) {
		LOG.error("Pipeline execution failed", e);
		throw new RuntimeException("Pipeline execution failed", e);
	}

	LOG.info("Execution finished in {} msecs", result.getNetRuntime());

	Map<String, Object> accumulators = result.getAllAccumulatorResults();
	if (accumulators != null && !accumulators.isEmpty()) {
		LOG.info("Final aggregator values:");

		for (Map.Entry<String, Object> entry : result.getAllAccumulatorResults().entrySet()) {
			LOG.info("{} : {}", entry.getKey(), entry.getValue());
		}
	}

	return new FlinkRunnerResult(accumulators, result.getNetRuntime());
}
 
Example #26
Source File: WriteSinkITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String resultPath) {
	Pipeline p = FlinkTestPipeline.createForBatch();

	p.apply(Create.of(EXPECTED_RESULT)).setCoder(StringUtf8Coder.of())
		.apply("CustomSink", Write.to(new MyCustomSink(resultPath)));

	p.run();
}
 
Example #27
Source File: AvroITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String tmpPath, String resultPath) {
	Pipeline p = FlinkTestPipeline.createForBatch();

	p
		.apply(Create.of(
				new User("Joe", 3, "red"),
				new User("Mary", 4, "blue"),
				new User("Mark", 1, "green"),
				new User("Julia", 5, "purple"))
			.withCoder(AvroCoder.of(User.class)))

		.apply(AvroIO.Write.to(tmpPath)
			.withSchema(User.class));

	p.run();

	p = FlinkTestPipeline.createForBatch();

	p
		.apply(AvroIO.Read.from(tmpPath).withSchema(User.class).withoutValidation())

			.apply(ParDo.of(new DoFn<User, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					User u = c.element();
					String result = u.getName() + " " + u.getFavoriteColor() + " " + u.getFavoriteNumber();
					c.output(result);
				}
			}))

		.apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example #28
Source File: MaybeEmptyTestITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	p.apply(Create.of((Void) null)).setCoder(VoidCoder.of())
			.apply(ParDo.of(
					new DoFn<Void, String>() {
						@Override
						public void processElement(DoFn<Void, String>.ProcessContext c) {
							c.output(expected);
						}
					})).apply(TextIO.Write.to(resultPath));
	p.run();
}
 
Example #29
Source File: WordCountITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> input = p.apply(Create.of(WORDS)).setCoder(StringUtf8Coder.of());

	input
			.apply(new WordCount.CountWords())
			.apply(MapElements.via(new WordCount.FormatAsTextFn()))
			.apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example #30
Source File: JoinExamples.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws Exception {
	Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
	Pipeline p = Pipeline.create(options);
	// the following two 'applys' create multiple inputs to our pipeline, one for each
	// of our two input sources.
	PCollection<TableRow> eventsTable = p.apply(BigQueryIO.Read.from(GDELT_EVENTS_TABLE));
	PCollection<TableRow> countryCodes = p.apply(BigQueryIO.Read.from(COUNTRY_CODES));
	PCollection<String> formattedResults = joinEvents(eventsTable, countryCodes);
	formattedResults.apply(TextIO.Write.to(options.getOutput()));
	p.run();
}