com.google.cloud.dataflow.sdk.values.PCollection Java Examples

The following examples show how to use com.google.cloud.dataflow.sdk.values.PCollection. Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
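
Before the project-specific examples, here is a minimal, self-contained sketch of the basic PCollection lifecycle. It is not taken from any of the projects below and only assumes the pre-Beam Dataflow SDK (com.google.cloud.dataflow.sdk) on the classpath; the class name and element values are illustrative.

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Count;
import com.google.cloud.dataflow.sdk.transforms.Create;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;

public class MinimalPCollectionExample {
  public static void main(String[] args) {
    // Every PCollection belongs to exactly one Pipeline.
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Create.of turns an in-memory collection into a PCollection.
    PCollection<String> words = p.apply(Create.of("a", "b", "a"));

    // Each apply() returns a new, immutable PCollection.
    PCollection<KV<String, Long>> counts = words.apply(Count.<String>perElement());

    // Nothing is computed until the pipeline is run.
    p.run();
  }
}
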
Example #1
Source File: FXTimeSeriesPipelineSRGTests.java    From data-timeseries-java with Apache License 2.0
@org.junit.Test
public void testCompleteWindowData() {

  Pipeline pipeline = setup();

  List<KV<String, TSProto>> pipelineData = GenerateSampleData.getTestData();
  List<KV<String, TSProto>> testData = new ArrayList<KV<String, TSProto>>(pipelineData);
  WorkPacketConfig packetConfig = GenerateSampleData.generateWorkPacketConfig(2);

  PCollection<KV<String, TSProto>> completeWindowData =
      generateCompleteWindowData(pipeline, pipelineData, packetConfig);

  testData.add(KV.of(GenerateSampleData.TS3, TSProto.newBuilder().setKey(GenerateSampleData.TS3)
          .setIsLive(false).setTime(1451577839999L).build()));
  testData.add(KV.of(GenerateSampleData.TS4, TSProto.newBuilder().setKey(GenerateSampleData.TS4)
          .setIsLive(false).setTime(1451577839999L).build()));
  
  DataflowAssert.that(completeWindowData).containsInAnyOrder(testData);
  pipeline.run();
}
 
Example #2
Source File: UnboundedSourceITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String resultPath) {

		Pipeline p = FlinkTestPipeline.createForStreaming();

		PCollection<String> result = p
			.apply(Read.from(new RangeReadSource(1, 10)))
			.apply(Window.<Integer>into(new GlobalWindows())
				.triggering(AfterPane.elementCountAtLeast(10))
				.discardingFiredPanes())
			.apply(ParDo.of(new DoFn<Integer, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					c.output(c.element().toString());
				}
			}));

		result.apply(TextIO.Write.to(resultPath));

		try {
			p.run();
			fail();
		} catch(Exception e) {
			assertEquals("The source terminates as expected.", e.getCause().getCause().getMessage());
		}
	}
 
Example #3
Source File: ExportedServiceAccountKeyRemover.java    From policyscanner with Apache License 2.0
private PCollection<String> constructPipeline(Pipeline pipeline, String org) {
  // Read projects from the CRM API.
  PCollection<GCPProject> projects =
      pipeline.apply(Read.from(new LiveProjectSource(org)));
  // List the service accounts of the projects.
  PCollection<GCPServiceAccount> serviceAccounts =
      projects.apply(ParDo.named("List Service Accounts").of(new ListServiceAccounts()));
  // List the keys of the service accounts.
  PCollection<GCPServiceAccountKey> serviceAccountKeys =
      serviceAccounts.apply(ParDo.named("List Service Account Keys")
          .of(new ListServiceAccountKeys()));
  // Construct an alert message for all the discrepancies found.
  return serviceAccountKeys.apply(ParDo
      .named("Remove user-managed keys")
      .of(new ExportedServiceAccountKeyMessenger()));
}
 
Example #4
Source File: LoadBooks.java    From cloud-bigtable-examples with Apache License 2.0
public static void main(String[] args) {
  // CloudBigtableOptions is one way to retrieve the options.  It's not required.
  // https://github.com/GoogleCloudPlatform/cloud-bigtable-examples/blob/master/java/dataflow-connector-examples/src/main/java/com/google/cloud/bigtable/dataflow/example/HelloWorldWrite.java
  BigtableCsvOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtableCsvOptions.class);
  CloudBigtableTableConfiguration config =
      CloudBigtableTableConfiguration.fromCBTOptions(options);

  Pipeline p = Pipeline.create(options);

  CloudBigtableIO.initializeForWrite(p);

  PCollection<KV<String, Integer>> ngrams =
      applyPipelineToParseBooks(p.apply(TextIO.Read.from(options.getInputFile())));
  PCollection<Mutation> mutations = ngrams.apply(ParDo.of(ENCODE_NGRAM));
  mutations.apply(CloudBigtableIO.writeToTable(config));

  // Run the pipeline.
  p.run();
}
 
Example #5
Source File: WordCountJoin2ITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	/* Create two PCollections and join them */
	PCollection<KV<String,Long>> occurences1 = p.apply(Create.of(WORDS_1))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> occurences2 = p.apply(Create.of(WORDS_2))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	/* CoGroup the two collections */
	PCollection<KV<String, CoGbkResult>> mergedOccurences = KeyedPCollectionTuple
			.of(tag1, occurences1)
			.and(tag2, occurences2)
			.apply(CoGroupByKey.<String>create());

	/* Format output */
	mergedOccurences.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	p.run();
}
 
Example #6
Source File: RemoveDuplicatesEmptyITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	List<String> strings = Collections.emptyList();

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> input =
			p.apply(Create.of(strings))
					.setCoder(StringUtf8Coder.of());

	PCollection<String> output =
			input.apply(RemoveDuplicates.<String>create());

	output.apply(TextIO.Write.to(resultPath));
	p.run();
}
 
Example #7
Source File: LoadBooksTest.java    From cloud-bigtable-examples with Apache License 2.0
@Test
public void parseBooks_returnsNgramsCounts() {
  // Arrange
  Pipeline p = TestPipeline.create();
  PCollection<String> input = p.apply(Create.of(testFile));

  // Act
  PCollection<KV<String, Integer>> output = LoadBooks.applyPipelineToParseBooks(input);

  // Assert
  DataflowAssert.that(output)
      .containsInAnyOrder(
          KV.of("despatch when art", 10),
          KV.of("despatch when came", 10),
          KV.of("despatch when published", 12),
          KV.of("despatch where was", 10),
          KV.of("despatch which made", 45),
          // There are two entries for "despatch which addressed".
          // Each entry has a different part of speech for "addressed".
          KV.of("despatch which addressed", 12 + 46),
          KV.of("despatch which admitted", 13),
          KV.of("despatch which allow", 14),
          KV.of("despatch which announced", 50),
          KV.of("despatch which answer", 32));
}
 
Example #8
Source File: TfIdfITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

	PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf = pipeline
			.apply(Create.of(
					KV.of(new URI("x"), "a b c d"),
					KV.of(new URI("y"), "a b c"),
					KV.of(new URI("z"), "a m n")))
			.apply(new TfIdf.ComputeTfIdf());

	PCollection<String> words = wordToUriAndTfIdf
			.apply(Keys.<String>create())
			.apply(RemoveDuplicates.<String>create());

	words.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}
 
Example #9
Source File: RemoveDuplicatesITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	List<String> strings = Arrays.asList("k1", "k5", "k5", "k2", "k1", "k2", "k3");

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> input =
			p.apply(Create.of(strings))
					.setCoder(StringUtf8Coder.of());

	PCollection<String> output =
			input.apply(RemoveDuplicates.<String>create());

	output.apply(TextIO.Write.to(resultPath));
	p.run();
}
 
Example #10
Source File: ReadSourceITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String resultPath) {

		Pipeline p = FlinkTestPipeline.createForBatch();

		PCollection<String> result = p
				.apply(Read.from(new ReadSource(1, 10)))
				.apply(ParDo.of(new DoFn<Integer, String>() {
					@Override
					public void processElement(ProcessContext c) throws Exception {
						c.output(c.element().toString());
					}
				}));

		result.apply(TextIO.Write.to(resultPath));
		p.run();
	}
 
Example #11
Source File: FlinkStreamingTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(ParDo.Bound<IN, OUT> transform, FlinkStreamingTranslationContext context) {
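	// Wrap the transform's DoFn in a Flink FlatMapFunction and register the
	// resulting stream as the output DataStream for this ParDo's PCollection.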
	PCollection<OUT> output = context.getOutput(transform);

	final WindowingStrategy<OUT, ? extends BoundedWindow> windowingStrategy =
			(WindowingStrategy<OUT, ? extends BoundedWindow>)
					context.getOutput(transform).getWindowingStrategy();

	WindowedValue.WindowedValueCoder<OUT> outputStreamCoder = WindowedValue.getFullCoder(output.getCoder(),
			windowingStrategy.getWindowFn().windowCoder());
	CoderTypeInformation<WindowedValue<OUT>> outputWindowedValueCoder =
			new CoderTypeInformation<>(outputStreamCoder);

	FlinkParDoBoundWrapper<IN, OUT> doFnWrapper = new FlinkParDoBoundWrapper<>(
			context.getPipelineOptions(), windowingStrategy, transform.getFn());
	DataStream<WindowedValue<IN>> inputDataStream = context.getInputDataStream(context.getInput(transform));
	SingleOutputStreamOperator<WindowedValue<OUT>> outDataStream = inputDataStream.flatMap(doFnWrapper)
			.returns(outputWindowedValueCoder);

	context.setOutputDataStream(context.getOutput(transform), outDataStream);
}
 
Example #12
Source File: FlinkStreamingTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(Read.Unbounded<T> transform, FlinkStreamingTranslationContext context) {
	PCollection<T> output = context.getOutput(transform);

	DataStream<WindowedValue<T>> source;
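	// A Flink-native source is added to the execution environment directly;
	// any other unbounded source goes through the generic UnboundedSourceWrapper.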
	if (transform.getSource().getClass().equals(UnboundedFlinkSource.class)) {
		UnboundedFlinkSource flinkSource = (UnboundedFlinkSource) transform.getSource();
		source = context.getExecutionEnvironment()
				.addSource(flinkSource.getFlinkSource())
				.flatMap(new FlatMapFunction<String, WindowedValue<String>>() {
					@Override
					public void flatMap(String s, Collector<WindowedValue<String>> collector) throws Exception {
						collector.collect(WindowedValue.<String>of(s, Instant.now(), GlobalWindow.INSTANCE, PaneInfo.NO_FIRING));
					}
				}).assignTimestampsAndWatermarks(new IngestionTimeExtractor());
	} else {
		source = context.getExecutionEnvironment()
				.addSource(new UnboundedSourceWrapper<>(context.getPipelineOptions(), transform));
	}
	context.setOutputDataStream(output, source);
}
 
Example #13
Source File: FXTimeSeriesPipelineSRGTests.java    From data-timeseries-java with Apache License 2.0
public PCollection<KV<String, TSProto>> setupDataInput(Pipeline pipeline,
    List<KV<String, TSProto>> data) {


  // Create the input PCollection and assign each element an event timestamp taken from its TSProto time value.
  PCollection<KV<String, TSProto>> tsData =
      pipeline.apply("ReadData", Create.of(data))
          .apply(ParDo.of(new DoFn<KV<String, TSProto>, KV<String, TSProto>>() {

            @Override
            public void processElement(ProcessContext c) throws Exception {
              c.outputWithTimestamp(c.element(),
                  new DateTime(c.element().getValue().getTime()).toInstant());

            }

          })).setName("Assign TimeStamps");
  return tsData;

}
 
Example #14
Source File: TFIDF.java    From flink-dataflow with Apache License 2.0
@Override
public PDone apply(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
	return wordToUriAndTfIdf
			.apply(ParDo.named("Format").of(new DoFn<KV<String, KV<URI, Double>>, String>() {
				private static final long serialVersionUID = 0;

				@Override
				public void processElement(ProcessContext c) {
					c.output(String.format("%s,\t%s,\t%f",
							c.element().getKey(),
							c.element().getValue().getKey(),
							c.element().getValue().getValue()));
				}
			}))
			.apply(TextIO.Write
					.to(output)
					.withSuffix(".csv"));
}
 
Example #15
Source File: FlinkStreamingTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(Create.Values<OUT> transform, FlinkStreamingTranslationContext context) {
	PCollection<OUT> output = context.getOutput(transform);
	Iterable<OUT> elements = transform.getElements();

	// we need to serialize the elements to byte arrays, since they might contain
	// elements that are not serializable by Java serialization. We deserialize them
	// in the FlatMap function using the Coder.

	List<byte[]> serializedElements = Lists.newArrayList();
	Coder<OUT> elementCoder = context.getOutput(transform).getCoder();
	for (OUT element: elements) {
		ByteArrayOutputStream bao = new ByteArrayOutputStream();
		try {
			elementCoder.encode(element, bao, Coder.Context.OUTER);
			serializedElements.add(bao.toByteArray());
		} catch (IOException e) {
			throw new RuntimeException("Could not serialize Create elements using Coder: " + e);
		}
	}


	DataStream<Integer> initDataSet = context.getExecutionEnvironment().fromElements(1);

	FlinkStreamingCreateFunction<Integer, OUT> createFunction =
			new FlinkStreamingCreateFunction<>(serializedElements, elementCoder);

	WindowedValue.ValueOnlyWindowedValueCoder<OUT> windowCoder = WindowedValue.getValueOnlyCoder(elementCoder);
	TypeInformation<WindowedValue<OUT>> outputType = new CoderTypeInformation<>(windowCoder);

	DataStream<WindowedValue<OUT>> outputDataStream = initDataSet.flatMap(createFunction)
			.returns(outputType);

	context.setOutputDataStream(context.getOutput(transform), outputDataStream);
}
 
Example #16
Source File: FlinkStreamingTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(Flatten.FlattenPCollectionList<T> transform, FlinkStreamingTranslationContext context) {
	List<PCollection<T>> allInputs = context.getInput(transform).getAll();
	DataStream<T> result = null;
	for (PCollection<T> collection : allInputs) {
		DataStream<T> current = context.getInputDataStream(collection);
		result = (result == null) ? current : result.union(current);
	}
	context.setOutputDataStream(context.getOutput(transform), result);
}
 
Example #17
Source File: FlinkBatchTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(Flatten.FlattenPCollectionList<T> transform, FlinkBatchTranslationContext context) {
	List<PCollection<T>> allInputs = context.getInput(transform).getAll();
	DataSet<T> result = null;
	for(PCollection<T> collection : allInputs) {
		DataSet<T> current = context.getInputDataSet(collection);
		if (result == null) {
			result = current;
		} else {
			result = result.union(current);
		}
	}
	context.setOutputDataSet(context.getOutput(transform), result);
}
 
Example #18
Source File: FlinkBatchTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(CoGroupByKey<K> transform, FlinkBatchTranslationContext context) {
	KeyedPCollectionTuple<K> input = context.getInput(transform);

	CoGbkResultSchema schema = input.getCoGbkResultSchema();
	List<KeyedPCollectionTuple.TaggedKeyedPCollection<K, ?>> keyedCollections = input.getKeyedCollections();

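	// Co-group the two tagged input collections on the "key" field of each KV.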
	KeyedPCollectionTuple.TaggedKeyedPCollection<K, ?> taggedCollection1 = keyedCollections.get(0);
	KeyedPCollectionTuple.TaggedKeyedPCollection<K, ?> taggedCollection2 = keyedCollections.get(1);

	TupleTag<?> tupleTag1 = taggedCollection1.getTupleTag();
	TupleTag<?> tupleTag2 = taggedCollection2.getTupleTag();

	PCollection<? extends KV<K, ?>> collection1 = taggedCollection1.getCollection();
	PCollection<? extends KV<K, ?>> collection2 = taggedCollection2.getCollection();

	DataSet<KV<K,V1>> inputDataSet1 = context.getInputDataSet(collection1);
	DataSet<KV<K,V2>> inputDataSet2 = context.getInputDataSet(collection2);

	TypeInformation<KV<K,CoGbkResult>> typeInfo = context.getOutputTypeInfo();

	FlinkCoGroupKeyedListAggregator<K,V1,V2> aggregator = new FlinkCoGroupKeyedListAggregator<>(schema, tupleTag1, tupleTag2);

	Keys.ExpressionKeys<KV<K,V1>> keySelector1 = new Keys.ExpressionKeys<>(new String[]{"key"}, inputDataSet1.getType());
	Keys.ExpressionKeys<KV<K,V2>> keySelector2 = new Keys.ExpressionKeys<>(new String[]{"key"}, inputDataSet2.getType());

	DataSet<KV<K, CoGbkResult>> out = new CoGroupOperator<>(inputDataSet1, inputDataSet2,
															keySelector1, keySelector2,
			                                                aggregator, typeInfo, null, transform.getName());
	context.setOutputDataSet(context.getOutput(transform), out);
}
 
Example #19
Source File: JoinExamples.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws Exception {
	Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
	Pipeline p = Pipeline.create(options);
	// the following two 'applys' create multiple inputs to our pipeline, one for each
	// of our two input sources.
	PCollection<TableRow> eventsTable = p.apply(BigQueryIO.Read.from(GDELT_EVENTS_TABLE));
	PCollection<TableRow> countryCodes = p.apply(BigQueryIO.Read.from(COUNTRY_CODES));
	PCollection<String> formattedResults = joinEvents(eventsTable, countryCodes);
	formattedResults.apply(TextIO.Write.to(options.getOutput()));
	p.run();
}
 
Example #20
Source File: WordCountJoin3ITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	/* Create three PCollections and join them */
	PCollection<KV<String,Long>> occurences1 = p.apply(Create.of(WORDS_1))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> occurences2 = p.apply(Create.of(WORDS_2))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> occurences3 = p.apply(Create.of(WORDS_3))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	/* CoGroup the three collections */
	PCollection<KV<String, CoGbkResult>> mergedOccurences = KeyedPCollectionTuple
			.of(tag1, occurences1)
			.and(tag2, occurences2)
			.and(tag3, occurences3)
			.apply(CoGroupByKey.<String>create());

	/* Format output */
	mergedOccurences.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	p.run();
}
 
Example #21
Source File: WordCountITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> input = p.apply(Create.of(WORDS)).setCoder(StringUtf8Coder.of());

	input
			.apply(new WordCount.CountWords())
			.apply(MapElements.via(new WordCount.FormatAsTextFn()))
			.apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example #22
Source File: FlinkBatchTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(Read.Bounded<T> transform, FlinkBatchTranslationContext context) {
	String name = transform.getName();
	BoundedSource<T> source = transform.getSource();
	PCollection<T> output = context.getOutput(transform);
	Coder<T> coder = output.getCoder();

	TypeInformation<T> typeInformation = context.getTypeInfo(output);

	DataSource<T> dataSource = new DataSource<>(context.getExecutionEnvironment(),
			new SourceInputFormat<>(source, context.getPipelineOptions()), typeInformation, name);

	context.setOutputDataSet(output, dataSource);
}
 
Example #23
Source File: AutoComplete.java    From flink-dataflow with Apache License 2.0
@Override
public PCollectionList<KV<String, List<CompletionCandidate>>> apply(
      PCollection<CompletionCandidate> input) {
    if (minPrefix > 10) {
      // Base case, partitioning to return the output in the expected format.
      return input
        .apply(new ComputeTopFlat(candidatesPerPrefix, minPrefix))
        .apply(Partition.of(2, new KeySizePartitionFn()));
    } else {
      // If a candidate is in the top N for prefix a...b, it must also be in the top
      // N for a...bX for every X, which is typically a much smaller set to consider.
      // First, compute the top candidate for prefixes of size at least minPrefix + 1.
      PCollectionList<KV<String, List<CompletionCandidate>>> larger = input
        .apply(new ComputeTopRecursive(candidatesPerPrefix, minPrefix + 1));
      // Consider the top candidates for each prefix of length minPrefix + 1...
      PCollection<KV<String, List<CompletionCandidate>>> small =
        PCollectionList
        .of(larger.get(1).apply(ParDo.of(new FlattenTops())))
        // ...together with those (previously excluded) candidates of length
        // exactly minPrefix...
        .and(input.apply(Filter.by(new SerializableFunction<CompletionCandidate, Boolean>() {
                private static final long serialVersionUID = 0;

                @Override
                public Boolean apply(CompletionCandidate c) {
                  return c.getValue().length() == minPrefix;
                }
              })))
        .apply("FlattenSmall", Flatten.<CompletionCandidate>pCollections())
        // ...set the key to be the minPrefix-length prefix...
        .apply(ParDo.of(new AllPrefixes(minPrefix, minPrefix)))
        // ...and (re)apply the Top operator to all of them together.
        .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix));

      PCollection<KV<String, List<CompletionCandidate>>> flattenLarger = larger
          .apply("FlattenLarge", Flatten.<KV<String, List<CompletionCandidate>>>pCollections());

      return PCollectionList.of(flattenLarger).and(small);
    }
}
 
Example #24
Source File: AutoComplete.java    From flink-dataflow with Apache License 2.0
@Override
public PCollection<KV<String, List<CompletionCandidate>>> apply(
    PCollection<CompletionCandidate> input) {
  return input
    // For each completion candidate, map it to all prefixes.
    .apply(ParDo.of(new AllPrefixes(minPrefix)))

    // Find and return the top candidates for each prefix.
    .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix)
         .withHotKeyFanout(new HotKeyFanout()));
}
 
Example #25
Source File: AutoComplete.java    From flink-dataflow with Apache License 2.0
@Override
public PCollection<KV<String, List<CompletionCandidate>>> apply(PCollection<String> input) {
  PCollection<CompletionCandidate> candidates = input
    // First count how often each token appears.
    .apply(new Count.PerElement<String>())

    // Map the KV outputs of Count into our own CompletionCandidate class.
    .apply(ParDo.named("CreateCompletionCandidates").of(
        new DoFn<KV<String, Long>, CompletionCandidate>() {
          private static final long serialVersionUID = 0;

          @Override
          public void processElement(ProcessContext c) {
            CompletionCandidate cand = new CompletionCandidate(c.element().getKey(), c.element().getValue());
            c.output(cand);
          }
        }));

  // Compute the top via either a flat or recursive algorithm.
  if (recursive) {
    return candidates
      .apply(new ComputeTopRecursive(candidatesPerPrefix, 1))
      .apply(Flatten.<KV<String, List<CompletionCandidate>>>pCollections());
  } else {
    return candidates
      .apply(new ComputeTopFlat(candidatesPerPrefix, 1));
  }
}
 
Example #26
Source File: WindowedWordCount.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws IOException {
	StreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(StreamingWordCountOptions.class);
	options.setStreaming(true);
	options.setWindowSize(10L);
	options.setSlide(5L);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	LOG.info("Windpwed WordCount with Sliding Windows of " + options.getWindowSize() +
			" sec. and a slide of " + options.getSlide());

	Pipeline pipeline = Pipeline.create(options);

	PCollection<String> words = pipeline
			.apply(Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("StreamingWordCount"))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Window.<String>into(SlidingWindows.of(Duration.standardSeconds(options.getWindowSize()))
					.every(Duration.standardSeconds(options.getSlide())))
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	PCollection<KV<String, Long>> wordCounts =
			words.apply(Count.<String>perElement());

	wordCounts.apply(ParDo.of(new FormatAsStringFn()))
			.apply(TextIO.Write.to("./outputWordCount.txt"));

	pipeline.run();
}
 
Example #27
Source File: JoinExamples.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws Exception {
	Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
	options.setStreaming(true);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	PTransform<? super PBegin, PCollection<String>> readSourceA =
			Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("FirstStream");
	PTransform<? super PBegin, PCollection<String>> readSourceB =
			Read.from(new UnboundedSocketSource<>("localhost", 9998, '\n', 3)).named("SecondStream");

	WindowFn<Object, ?> windowFn = FixedWindows.of(Duration.standardSeconds(options.getWindowSize()));

	Pipeline p = Pipeline.create(options);

	// the following two 'applys' create multiple inputs to our pipeline, one for each
	// of our two input sources.
	PCollection<String> streamA = p.apply(readSourceA)
			.apply(Window.<String>into(windowFn)
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());
	PCollection<String> streamB = p.apply(readSourceB)
			.apply(Window.<String>into(windowFn)
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	PCollection<String> formattedResults = joinEvents(streamA, streamB);
	formattedResults.apply(TextIO.Write.to("./outputJoin.txt"));
	p.run();
}
 
Example #28
Source File: KafkaWindowedWordCountExample.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) {
	PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
	KafkaStreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
	options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
	options.setStreaming(true);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	System.out.println(options.getKafkaTopic() +" "+ options.getZookeeper() +" "+ options.getBroker() +" "+ options.getGroup() );
	Pipeline pipeline = Pipeline.create(options);

	Properties p = new Properties();
	p.setProperty("zookeeper.connect", options.getZookeeper());
	p.setProperty("bootstrap.servers", options.getBroker());
	p.setProperty("group.id", options.getGroup());

	// this is the Flink consumer that reads the input to
	// the program from a kafka topic.
	FlinkKafkaConsumer08<String> kafkaConsumer = new FlinkKafkaConsumer08<>(
			options.getKafkaTopic(),
			new SimpleStringSchema(), p);

	PCollection<String> words = pipeline
			.apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Window.<String>into(FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	PCollection<KV<String, Long>> wordCounts =
			words.apply(Count.<String>perElement());

	wordCounts.apply(ParDo.of(new FormatAsStringFn()))
			.apply(TextIO.Write.to("./outputKafka.txt"));

	pipeline.run();
}
 
Example #29
Source File: WordCount.java    From flink-dataflow with Apache License 2.0
@Override
public PCollection<KV<String, Long>> apply(PCollection<String> lines) {

	// Convert lines of text into individual words.
	PCollection<String> words = lines.apply(
			ParDo.of(new ExtractWordsFn()));

	// Count the number of times each word occurs.
	PCollection<KV<String, Long>> wordCounts =
			words.apply(Count.<String>perElement());

	return wordCounts;
}
 
Example #30
Source File: TFIDF.java    From flink-dataflow with Apache License 2.0
@Override
public PCollection<KV<URI, String>> apply(PInput input) {
	Pipeline pipeline = input.getPipeline();

	// Create one TextIO.Read transform for each document
	// and add its output to a PCollectionList
	PCollectionList<KV<URI, String>> urisToLines =
			PCollectionList.empty(pipeline);

	// TextIO.Read supports:
	//  - file: URIs and paths locally
	//  - gs: URIs on the service
	for (final URI uri : uris) {
		String uriString;
		if (uri.getScheme().equals("file")) {
			uriString = new File(uri).getPath();
		} else {
			uriString = uri.toString();
		}

		PCollection<KV<URI, String>> oneUriToLines = pipeline
				.apply(TextIO.Read.from(uriString)
						.named("TextIO.Read(" + uriString + ")"))
				.apply("WithKeys(" + uriString + ")", WithKeys.<URI, String>of(uri));

		urisToLines = urisToLines.and(oneUriToLines);
	}

	return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
}