com.google.cloud.dataflow.sdk.io.TextIO Java Examples

The following examples show how to use com.google.cloud.dataflow.sdk.io.TextIO. Each example is taken from an open-source project; the source file and project it comes from are noted above the code.
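For orientation, here is a minimal, self-contained sketch (not drawn from any of the projects below) of how TextIO.Read and TextIO.Write are typically wired into a Dataflow 1.x pipeline; the bucket paths and transform names are placeholders.

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;

public class TextIOSketch {
  public static void main(String[] args) {
    // Build pipeline options from the command line; any Dataflow 1.x runner works here.
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    Pipeline p = Pipeline.create(options);

    // Read every line of the input (local path or gs:// URI) and write it back out unchanged.
    // "gs://my-bucket/..." is a placeholder, not a path used by the examples below.
    p.apply(TextIO.Read.named("ReadLines").from("gs://my-bucket/input/*.txt"))
     .apply(TextIO.Write.named("WriteLines").to("gs://my-bucket/output/result"));

    p.run();
  }
}
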
Example #1
Source File: SideInputITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {


	Pipeline p = FlinkTestPipeline.createForBatch();


	final PCollectionView<String> sidesInput = p
			.apply(Create.of(expected))
			.apply(View.<String>asSingleton());

	p.apply(Create.of("bli"))
			.apply(ParDo.of(new DoFn<String, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					String s = c.sideInput(sidesInput);
					c.output(s);
				}
			}).withSideInputs(sidesInput)).apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example #2
Source File: LoadBooks.java    From cloud-bigtable-examples with Apache License 2.0
public static void main(String[] args) {
  // CloudBigtableOptions is one way to retrieve the options.  It's not required.
  // https://github.com/GoogleCloudPlatform/cloud-bigtable-examples/blob/master/java/dataflow-connector-examples/src/main/java/com/google/cloud/bigtable/dataflow/example/HelloWorldWrite.java
  BigtableCsvOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtableCsvOptions.class);
  CloudBigtableTableConfiguration config =
      CloudBigtableTableConfiguration.fromCBTOptions(options);

  Pipeline p = Pipeline.create(options);

  CloudBigtableIO.initializeForWrite(p);

  PCollection<KV<String, Integer>> ngrams =
      applyPipelineToParseBooks(p.apply(TextIO.Read.from(options.getInputFile())));
  PCollection<Mutation> mutations = ngrams.apply(ParDo.of(ENCODE_NGRAM));
  mutations.apply(CloudBigtableIO.writeToTable(config));

  // Run the pipeline.
  p.run();
}
 
Example #3
Source File: UnboundedSourceITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String resultPath) {

		Pipeline p = FlinkTestPipeline.createForStreaming();

		PCollection<String> result = p
			.apply(Read.from(new RangeReadSource(1, 10)))
			.apply(Window.<Integer>into(new GlobalWindows())
				.triggering(AfterPane.elementCountAtLeast(10))
				.discardingFiredPanes())
			.apply(ParDo.of(new DoFn<Integer, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					c.output(c.element().toString());
				}
			}));

		result.apply(TextIO.Write.to(resultPath));

		try {
			p.run();
			fail();
		} catch(Exception e) {
			assertEquals("The source terminates as expected.", e.getCause().getCause().getMessage());
		}
	}
 
Example #4
Source File: RemoveDuplicatesEmptyITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	List<String> strings = Collections.emptyList();

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> input =
			p.apply(Create.of(strings))
					.setCoder(StringUtf8Coder.of());

	PCollection<String> output =
			input.apply(RemoveDuplicates.<String>create());

	output.apply(TextIO.Write.to(resultPath));
	p.run();
}
 
Example #5
Source File: WordCountJoin2ITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	/* Create two PCollections and join them */
	PCollection<KV<String,Long>> occurences1 = p.apply(Create.of(WORDS_1))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> occurences2 = p.apply(Create.of(WORDS_2))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	/* CoGroup the two collections */
	PCollection<KV<String, CoGbkResult>> mergedOccurences = KeyedPCollectionTuple
			.of(tag1, occurences1)
			.and(tag2, occurences2)
			.apply(CoGroupByKey.<String>create());

	/* Format output */
	mergedOccurences.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	p.run();
}
 
Example #6
Source File: TfIdfITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

	PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf = pipeline
			.apply(Create.of(
					KV.of(new URI("x"), "a b c d"),
					KV.of(new URI("y"), "a b c"),
					KV.of(new URI("z"), "a m n")))
			.apply(new TfIdf.ComputeTfIdf());

	PCollection<String> words = wordToUriAndTfIdf
			.apply(Keys.<String>create())
			.apply(RemoveDuplicates.<String>create());

	words.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}
 
Example #7
Source File: UserManagedKeysApp.java    From policyscanner with Apache License 2.0
@Override
public void doGet(HttpServletRequest req, HttpServletResponse resp)
    throws IOException {
  PrintWriter out = resp.getWriter();

  Preconditions.checkNotNull(Constants.ORG_ID);
  Preconditions.checkNotNull(Constants.OUTPUT_PREFIX);
  Preconditions.checkNotNull(Constants.DATAFLOW_STAGING);

  PipelineOptions options;
  if (CloudUtil.willExecuteOnCloud()) {
    options = getCloudExecutionOptions(Constants.DATAFLOW_STAGING);
  } else {
    options = getLocalExecutionOptions();
  }

  new ExportedServiceAccountKeyRemover(options, Constants.ORG_ID)
      .attachSink(TextIO.Write.named("Write output messages").to(Constants.OUTPUT_PREFIX))
      .run();
  out.println("Test passed! The output was written to GCS");
}
 
Example #8
Source File: RemoveDuplicatesITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	List<String> strings = Arrays.asList("k1", "k5", "k5", "k2", "k1", "k2", "k3");

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> input =
			p.apply(Create.of(strings))
					.setCoder(StringUtf8Coder.of());

	PCollection<String> output =
			input.apply(RemoveDuplicates.<String>create());

	output.apply(TextIO.Write.to(resultPath));
	p.run();
}
 
Example #9
Source File: ReadSourceITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String resultPath) {

		Pipeline p = FlinkTestPipeline.createForBatch();

		PCollection<String> result = p
				.apply(Read.from(new ReadSource(1, 10)))
				.apply(ParDo.of(new DoFn<Integer, String>() {
					@Override
					public void processElement(ProcessContext c) throws Exception {
						c.output(c.element().toString());
					}
				}));

		result.apply(TextIO.Write.to(resultPath));
		p.run();
	}
 
Example #10
Source File: FlinkBatchTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(TextIO.Write.Bound<T> transform, FlinkBatchTranslationContext context) {
	PValue input = context.getInput(transform);
	DataSet<T> inputDataSet = context.getInputDataSet(input);

	String filenamePrefix = transform.getFilenamePrefix();
	String filenameSuffix = transform.getFilenameSuffix();
	boolean needsValidation = transform.needsValidation();
	int numShards = transform.getNumShards();
	String shardNameTemplate = transform.getShardNameTemplate();

	// TODO: Implement these. We need Flink support for this.
	LOG.warn("Translation of TextIO.Write.needsValidation not yet supported. Is: {}.", needsValidation);
	LOG.warn("Translation of TextIO.Write.filenameSuffix not yet supported. Is: {}.", filenameSuffix);
	LOG.warn("Translation of TextIO.Write.shardNameTemplate not yet supported. Is: {}.", shardNameTemplate);

	//inputDataSet.print();
	DataSink<T> dataSink = inputDataSet.writeAsText(filenamePrefix);

	if (numShards > 0) {
		dataSink.setParallelism(numShards);
	}
}
 
Example #11
Source File: LiveStateCheckerRunner.java    From policyscanner with Apache License 2.0
/**
 * Main function for the runner.
 * @param args The args this program was called with.
 * @throws IOException Thrown if there's an error reading from one of the APIs.
 */
public static void main(String[] args) throws IOException {
  Preconditions.checkNotNull(Constants.ORG_NAME);
  Preconditions.checkNotNull(Constants.POLICY_BUCKET);
  Preconditions.checkNotNull(Constants.OUTPUT_PREFIX);
  Preconditions.checkNotNull(Constants.DATAFLOW_STAGING);
  GCSFilesSource source = null;
  try {
    source = new GCSFilesSource(Constants.POLICY_BUCKET, Constants.ORG_NAME);
  } catch (GeneralSecurityException e) {
    throw new IOException("SecurityException: Cannot create GCSFileSource");
  }
  PipelineOptions options;
  if (CloudUtil.willExecuteOnCloud()) {
    options = getCloudExecutionOptions(Constants.DATAFLOW_STAGING);
  } else {
    options = getLocalExecutionOptions();
  }
  new OnDemandLiveStateChecker(options, source)
      .attachSink(TextIO.Write.named("Write messages to GCS").to(Constants.OUTPUT_PREFIX))
      .run();
}
 
Example #12
Source File: FlinkBatchTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(TextIO.Read.Bound<String> transform, FlinkBatchTranslationContext context) {
	String path = transform.getFilepattern();
	String name = transform.getName();

	TextIO.CompressionType compressionType = transform.getCompressionType();
	boolean needsValidation = transform.needsValidation();

	// TODO: Implement these. We need Flink support for this.
	LOG.warn("Translation of TextIO.CompressionType not yet supported. Is: {}.", compressionType);
	LOG.warn("Translation of TextIO.Read.needsValidation not yet supported. Is: {}.", needsValidation);

	PValue output = context.getOutput(transform);

	TypeInformation<String> typeInformation = context.getTypeInfo(output);
	DataSource<String> source = new DataSource<>(context.getExecutionEnvironment(), new TextInputFormat(new Path(path)), typeInformation, name);

	context.setOutputDataSet(output, source);
}
 
Example #13
Source File: TFIDF.java    From flink-dataflow with Apache License 2.0
@Override
public PDone apply(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
	return wordToUriAndTfIdf
			.apply(ParDo.named("Format").of(new DoFn<KV<String, KV<URI, Double>>, String>() {
				private static final long serialVersionUID = 0;

				@Override
				public void processElement(ProcessContext c) {
					c.output(String.format("%s,\t%s,\t%f",
							c.element().getKey(),
							c.element().getValue().getKey(),
							c.element().getValue().getValue()));
				}
			}))
			.apply(TextIO.Write
					.to(output)
					.withSuffix(".csv"));
}
 
Example #14
Source File: MaybeEmptyTestITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	p.apply(Create.of((Void) null)).setCoder(VoidCoder.of())
			.apply(ParDo.of(
					new DoFn<Void, String>() {
						@Override
						public void processElement(DoFn<Void, String>.ProcessContext c) {
							c.output(expected);
						}
					})).apply(TextIO.Write.to(resultPath));
	p.run();
}
 
Example #15
Source File: WordCountJoin3ITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	/* Create three PCollections and join them */
	PCollection<KV<String,Long>> occurences1 = p.apply(Create.of(WORDS_1))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> occurences2 = p.apply(Create.of(WORDS_2))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> occurences3 = p.apply(Create.of(WORDS_3))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	/* CoGroup the three collections */
	PCollection<KV<String, CoGbkResult>> mergedOccurences = KeyedPCollectionTuple
			.of(tag1, occurences1)
			.and(tag2, occurences2)
			.and(tag3, occurences3)
			.apply(CoGroupByKey.<String>create());

	/* Format output */
	mergedOccurences.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	p.run();
}
 
Example #16
Source File: JoinExamples.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws Exception {
	Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
	Pipeline p = Pipeline.create(options);
	// the following two 'applys' create multiple inputs to our pipeline, one for each
	// of our two input sources.
	PCollection<TableRow> eventsTable = p.apply(BigQueryIO.Read.from(GDELT_EVENTS_TABLE));
	PCollection<TableRow> countryCodes = p.apply(BigQueryIO.Read.from(COUNTRY_CODES));
	PCollection<String> formattedResults = joinEvents(eventsTable, countryCodes);
	formattedResults.apply(TextIO.Write.to(options.getOutput()));
	p.run();
}
 
Example #17
Source File: WordCountITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> input = p.apply(Create.of(WORDS)).setCoder(StringUtf8Coder.of());

	input
			.apply(new WordCount.CountWords())
			.apply(MapElements.via(new WordCount.FormatAsTextFn()))
			.apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example #18
Source File: TFIDF.java    From flink-dataflow with Apache License 2.0
@Override
public PCollection<KV<URI, String>> apply(PInput input) {
	Pipeline pipeline = input.getPipeline();

	// Create one TextIO.Read transform for each document
	// and add its output to a PCollectionList
	PCollectionList<KV<URI, String>> urisToLines =
			PCollectionList.empty(pipeline);

	// TextIO.Read supports:
	//  - file: URIs and paths locally
	//  - gs: URIs on the service
	for (final URI uri : uris) {
		String uriString;
		if (uri.getScheme().equals("file")) {
			uriString = new File(uri).getPath();
		} else {
			uriString = uri.toString();
		}

		PCollection<KV<URI, String>> oneUriToLines = pipeline
				.apply(TextIO.Read.from(uriString)
						.named("TextIO.Read(" + uriString + ")"))
				.apply("WithKeys(" + uriString + ")", WithKeys.<URI, String>of(uri));

		urisToLines = urisToLines.and(oneUriToLines);
	}

	return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
}
 
Example #19
Source File: WordCount.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) {

		Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
				.as(Options.class);
		options.setRunner(FlinkPipelineRunner.class);

		Pipeline p = Pipeline.create(options);

		p.apply(TextIO.Read.named("ReadLines").from(options.getInput()))
				.apply(new CountWords())
				.apply(MapElements.via(new FormatAsTextFn()))
				.apply(TextIO.Write.named("WriteCounts").to(options.getOutput()));

		p.run();
	}
 
Example #20
Source File: AvroITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String tmpPath, String resultPath) {
	Pipeline p = FlinkTestPipeline.createForBatch();

	p
		.apply(Create.of(
				new User("Joe", 3, "red"),
				new User("Mary", 4, "blue"),
				new User("Mark", 1, "green"),
				new User("Julia", 5, "purple"))
			.withCoder(AvroCoder.of(User.class)))

		.apply(AvroIO.Write.to(tmpPath)
			.withSchema(User.class));

	p.run();

	p = FlinkTestPipeline.createForBatch();

	p
		.apply(AvroIO.Read.from(tmpPath).withSchema(User.class).withoutValidation())

			.apply(ParDo.of(new DoFn<User, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					User u = c.element();
					String result = u.getName() + " " + u.getFavoriteColor() + " " + u.getFavoriteNumber();
					c.output(result);
				}
			}))

		.apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example #21
Source File: KafkaWindowedWordCountExample.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) {
	PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
	KafkaStreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
	options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
	options.setStreaming(true);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	System.out.println(options.getKafkaTopic() + " " + options.getZookeeper() + " " + options.getBroker() + " " + options.getGroup());
	Pipeline pipeline = Pipeline.create(options);

	Properties p = new Properties();
	p.setProperty("zookeeper.connect", options.getZookeeper());
	p.setProperty("bootstrap.servers", options.getBroker());
	p.setProperty("group.id", options.getGroup());

	// this is the Flink consumer that reads the input to
	// the program from a kafka topic.
	FlinkKafkaConsumer08<String> kafkaConsumer = new FlinkKafkaConsumer08<>(
			options.getKafkaTopic(),
			new SimpleStringSchema(), p);

	PCollection<String> words = pipeline
			.apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Window.<String>into(FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	PCollection<KV<String, Long>> wordCounts =
			words.apply(Count.<String>perElement());

	wordCounts.apply(ParDo.of(new FormatAsStringFn()))
			.apply(TextIO.Write.to("./outputKafka.txt"));

	pipeline.run();
}
 
Example #22
Source File: JoinExamples.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws Exception {
	Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
	options.setStreaming(true);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	PTransform<? super PBegin, PCollection<String>> readSourceA =
			Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("FirstStream");
	PTransform<? super PBegin, PCollection<String>> readSourceB =
			Read.from(new UnboundedSocketSource<>("localhost", 9998, '\n', 3)).named("SecondStream");

	WindowFn<Object, ?> windowFn = FixedWindows.of(Duration.standardSeconds(options.getWindowSize()));

	Pipeline p = Pipeline.create(options);

	// the following two 'applys' create multiple inputs to our pipeline, one for each
	// of our two input sources.
	PCollection<String> streamA = p.apply(readSourceA)
			.apply(Window.<String>into(windowFn)
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());
	PCollection<String> streamB = p.apply(readSourceB)
			.apply(Window.<String>into(windowFn)
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	PCollection<String> formattedResults = joinEvents(streamA, streamB);
	formattedResults.apply(TextIO.Write.to("./outputJoin.txt"));
	p.run();
}
 
Example #23
Source File: FlinkStreamingTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(TextIO.Write.Bound<T> transform, FlinkStreamingTranslationContext context) {
	PValue input = context.getInput(transform);
	DataStream<WindowedValue<T>> inputDataStream = context.getInputDataStream(input);

	String filenamePrefix = transform.getFilenamePrefix();
	String filenameSuffix = transform.getFilenameSuffix();
	boolean needsValidation = transform.needsValidation();
	int numShards = transform.getNumShards();
	String shardNameTemplate = transform.getShardNameTemplate();

	// TODO: Implement these. We need Flink support for this.
	LOG.warn("Translation of TextIO.Write.needsValidation not yet supported. Is: {}.", needsValidation);
	LOG.warn("Translation of TextIO.Write.filenameSuffix not yet supported. Is: {}.", filenameSuffix);
	LOG.warn("Translation of TextIO.Write.shardNameTemplate not yet supported. Is: {}.", shardNameTemplate);

	DataStream<String> dataSink = inputDataStream.flatMap(new FlatMapFunction<WindowedValue<T>, String>() {
		@Override
		public void flatMap(WindowedValue<T> value, Collector<String> out) throws Exception {
			out.collect(value.getValue().toString());
		}
	});
	DataStreamSink<String> output = dataSink.writeAsText(filenamePrefix, FileSystem.WriteMode.OVERWRITE);

	if (numShards > 0) {
		output.setParallelism(numShards);
	}
}
 
Example #24
Source File: FlattenizeITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> p1 = p.apply(Create.of(words));
	PCollection<String> p2 = p.apply(Create.of(words2));

	PCollectionList<String> list = PCollectionList.of(p1).and(p2);

	list.apply(Flatten.<String>pCollections()).apply(TextIO.Write.to(resultPath));

	PCollection<String> p3 = p.apply(Create.of(words3));

	PCollectionList<String> list2 = list.and(p3);

	list2.apply(Flatten.<String>pCollections()).apply(TextIO.Write.to(resultPath2));

	p.run();
}
 
Example #25
Source File: ParDoMultiOutputITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> words = p.apply(Create.of("Hello", "Whatupmyman", "hey", "SPECIALthere", "MAAA", "MAAFOOO"));

	// Select words whose length is below a cut off,
	// plus the lengths of words that are above the cut off.
	// Also select words starting with "MARKER".
	final int wordLengthCutOff = 3;
	// Create tags to use for the main and side outputs.
	final TupleTag<String> wordsBelowCutOffTag = new TupleTag<String>(){};
	final TupleTag<Integer> wordLengthsAboveCutOffTag = new TupleTag<Integer>(){};
	final TupleTag<String> markedWordsTag = new TupleTag<String>(){};

	PCollectionTuple results =
			words.apply(ParDo
					.withOutputTags(wordsBelowCutOffTag, TupleTagList.of(wordLengthsAboveCutOffTag)
							.and(markedWordsTag))
					.of(new DoFn<String, String>() {
						final TupleTag<String> specialWordsTag = new TupleTag<String>() {
						};

						public void processElement(ProcessContext c) {
							String word = c.element();
							if (word.length() <= wordLengthCutOff) {
								c.output(word);
							} else {
								c.sideOutput(wordLengthsAboveCutOffTag, word.length());
							}
							if (word.startsWith("MAA")) {
								c.sideOutput(markedWordsTag, word);
							}

							if (word.startsWith("SPECIAL")) {
								c.sideOutput(specialWordsTag, word);
							}
						}
					}));

	// Extract the PCollection results, by tag.
	PCollection<String> wordsBelowCutOff = results.get(wordsBelowCutOffTag);
	PCollection<Integer> wordLengthsAboveCutOff = results.get
			(wordLengthsAboveCutOffTag);
	PCollection<String> markedWords = results.get(markedWordsTag);

	markedWords.apply(TextIO.Write.to(resultPath));

	p.run();
}
 
Example #26
Source File: DesiredStateEnforcerApp.java    From policyscanner with Apache License 2.0
/**
 * Handler for the GET request to this app.
 * @param req The request object.
 * @param resp The response object.
 * @throws IOException Thrown if there's an error reading from one of the APIs.
 */
@Override
public void doGet(HttpServletRequest req, HttpServletResponse resp)
    throws IOException {
  PrintWriter out = resp.getWriter();

  Preconditions.checkNotNull(Constants.ORG_NAME);
  Preconditions.checkNotNull(Constants.ORG_ID);
  Preconditions.checkNotNull(Constants.POLICY_BUCKET);
  Preconditions.checkNotNull(Constants.OUTPUT_PREFIX);
  Preconditions.checkNotNull(Constants.DATAFLOW_STAGING);

  GCSFilesSource source = null;
  try {
    source = new GCSFilesSource(Constants.POLICY_BUCKET, Constants.ORG_NAME);
  } catch (GeneralSecurityException e) {
    throw new IOException("SecurityException: Cannot create GCSFileSource");
  }
  PipelineOptions options;
  if (CloudUtil.willExecuteOnCloud()) {
    options = getCloudExecutionOptions(Constants.DATAFLOW_STAGING);
  } else {
    options = getLocalExecutionOptions();
  }
  String datetimestamp = new SimpleDateFormat(Constants.SINK_TIMESTAMP_FORMAT).format(new Date());
  DesiredStateEnforcer enforcer = null;
  try {
    enforcer = new DesiredStateEnforcer(options, source, Constants.ORG_ID)
        .attachSink(TextIO.Write
            .named("Write messages to GCS")
            .to(MessageFormat.format(Constants.SINK_NAME_FORMAT,
                new Object[]{
                    Constants.OUTPUT_PREFIX,
                    datetimestamp,
                    Constants.OUTPUT_LABEL_ENFORCER
                    })))
        .run();
    if (enforcer.getTotalEnforcedStates() < 1) {
      out.println("Finished running Enforcer! No states needed to be enforced.");
    } else {
      out.println("Finished running Enforcer! The output was written to GCS");
    }
  } catch (AggregatorRetrievalException aggRetrievalException) {
    // TODO(carise): do something better than this
    aggRetrievalException.printStackTrace();
  }
}
 
Example #27
Source File: JoinExamplesITCase.java    From flink-dataflow with Apache License 2.0
@Override
protected void testProgram() throws Exception {

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<TableRow> input1 = p.apply(Create.of(EVENT_ARRAY));
	PCollection<TableRow> input2 = p.apply(Create.of(CC_ARRAY));

	PCollection<String> output = JoinExamples.joinEvents(input1, input2);

	output.apply(TextIO.Write.to(resultPath));

	p.run();
}