com.google.cloud.dataflow.sdk.io.Read Java Examples

The following examples show how to use com.google.cloud.dataflow.sdk.io.Read, the Dataflow SDK PTransform for producing a PCollection from a BoundedSource or UnboundedSource. Each example notes the project and source file it was taken from.
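Before the project examples, here is a minimal sketch of the basic pattern: Read.from(...) wraps a Source in a PTransform, and applying that transform to a Pipeline yields a PCollection of the source's element type. MyStringSource and the output path below are hypothetical placeholders, not taken from any of the projects that follow.

// Minimal sketch, not taken from any project below. MyStringSource stands in
// for any BoundedSource<String> implementation; the output path is a placeholder.
PipelineOptions options = PipelineOptionsFactory.create();
Pipeline pipeline = Pipeline.create(options);

// Applying Read.from(aBoundedSource) to the pipeline produces a
// PCollection of the source's element type.
PCollection<String> lines =
    pipeline.apply("Read strings", Read.from(new MyStringSource()));

// Write the elements out and run the pipeline.
lines.apply(TextIO.Write.to("/tmp/read-example"));
pipeline.run();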
Example #1
Source File: ExportedServiceAccountKeyRemover.java    From policyscanner with Apache License 2.0
private PCollection<String> constructPipeline(Pipeline pipeline, String org) {
  // Read projects from the CRM API.
  PCollection<GCPProject> projects =
      pipeline.apply(Read.from(new LiveProjectSource(org)));
  // List the service accounts of the projects.
  PCollection<GCPServiceAccount> serviceAccounts =
      projects.apply(ParDo.named("List Service Accounts").of(new ListServiceAccounts()));
  // List the keys of the service accounts.
  PCollection<GCPServiceAccountKey> serviceAccountKeys =
      serviceAccounts.apply(ParDo.named("List Service Account Keys")
          .of(new ListServiceAccountKeys()));
  // Remove the exported user-managed keys and construct an alert message for each one found.
  return serviceAccountKeys.apply(ParDo
      .named("Remove user-managed keys")
      .of(new ExportedServiceAccountKeyMessenger()));
}
 
Example #2
Source File: FlinkStreamingTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(Read.Unbounded<T> transform, FlinkStreamingTranslationContext context) {
	PCollection<T> output = context.getOutput(transform);

	DataStream<WindowedValue<T>> source;
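	// A native Flink source is added directly and its elements wrapped into WindowedValues;
	// any other unbounded source goes through the generic UnboundedSourceWrapper.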
	if (transform.getSource().getClass().equals(UnboundedFlinkSource.class)) {
		UnboundedFlinkSource flinkSource = (UnboundedFlinkSource) transform.getSource();
		source = context.getExecutionEnvironment()
				.addSource(flinkSource.getFlinkSource())
				.flatMap(new FlatMapFunction<String, WindowedValue<String>>() {
					@Override
					public void flatMap(String s, Collector<WindowedValue<String>> collector) throws Exception {
						collector.collect(WindowedValue.<String>of(s, Instant.now(), GlobalWindow.INSTANCE, PaneInfo.NO_FIRING));
					}
				}).assignTimestampsAndWatermarks(new IngestionTimeExtractor());
	} else {
		source = context.getExecutionEnvironment()
				.addSource(new UnboundedSourceWrapper<>(context.getPipelineOptions(), transform));
	}
	context.setOutputDataStream(output, source);
}
 
Example #3
Source File: FlinkBatchTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(TextIO.Read.Bound<String> transform, FlinkBatchTranslationContext context) {
	String path = transform.getFilepattern();
	String name = transform.getName();

	TextIO.CompressionType compressionType = transform.getCompressionType();
	boolean needsValidation = transform.needsValidation();

	// TODO: Implement these. We need Flink support for this.
	LOG.warn("Translation of TextIO.CompressionType not yet supported. Is: {}.", compressionType);
	LOG.warn("Translation of TextIO.Read.needsValidation not yet supported. Is: {}.", needsValidation);

	PValue output = context.getOutput(transform);

	TypeInformation<String> typeInformation = context.getTypeInfo(output);
	DataSource<String> source = new DataSource<>(context.getExecutionEnvironment(), new TextInputFormat(new Path(path)), typeInformation, name);

	context.setOutputDataSet(output, source);
}
 
Example #4
Source File: ReadSourceITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String resultPath) {

	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> result = p
			.apply(Read.from(new ReadSource(1, 10)))
			.apply(ParDo.of(new DoFn<Integer, String>() {
				@Override
				public void processElement(ProcessContext c) throws Exception {
					c.output(c.element().toString());
				}
			}));

	result.apply(TextIO.Write.to(resultPath));
	p.run();
}
 
Example #5
Source File: UnboundedSourceITCase.java    From flink-dataflow with Apache License 2.0
private static void runProgram(String resultPath) {

	Pipeline p = FlinkTestPipeline.createForStreaming();

	PCollection<String> result = p
		.apply(Read.from(new RangeReadSource(1, 10)))
		.apply(Window.<Integer>into(new GlobalWindows())
			.triggering(AfterPane.elementCountAtLeast(10))
			.discardingFiredPanes())
		.apply(ParDo.of(new DoFn<Integer, String>() {
			@Override
			public void processElement(ProcessContext c) throws Exception {
				c.output(c.element().toString());
			}
		}));

	result.apply(TextIO.Write.to(resultPath));

	try {
		p.run();
		fail();
	} catch(Exception e) {
		assertEquals("The source terminates as expected.", e.getCause().getCause().getMessage());
	}
}
 
Example #6
Source File: CoinbaseSource.java    From cloud-bigtable-examples with Apache License 2.0
public static void main(String[] args) {
  CloudBigtableOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CloudBigtableOptions.class);

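  // Scan configuration identifying the Bigtable project, instance, and table to write to.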
  CloudBigtableScanConfiguration config =
      new CloudBigtableScanConfiguration.Builder()
          .withProjectId(options.getBigtableProjectId())
          .withInstanceId(options.getBigtableInstanceId())
          .withTableId(options.getBigtableTableId())
          .build();

  options.setStreaming(true);
  options.setRunner(DataflowPipelineRunner.class);

  Pipeline p = Pipeline.create(options);
  CloudBigtableIO.initializeForWrite(p);

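  // Read from the unbounded Coinbase source, deserialize each message, and write it to Bigtable.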
  p.apply(Read.from(new CoinbaseSource()))
      .apply(ParDo.named("DeserializeCoinbase").of(new DeserializeCoinbase()))
      .apply(ParDo.of(new HBaseBigtableWriter()))
      .apply(CloudBigtableIO.writeToTable(config));

  p.run();
}
 
Example #7
Source File: DesiredStateEnforcer.java    From policyscanner with Apache License 2.0
private PCollection<String> constructPipeline(Pipeline pipeline, String org,
    BoundedSource<KV<List<String>, String>> knownGoodSource) {
  // Read files from GCS.
  PCollection<KV<List<String>, String>> knownGoodFiles =
      pipeline.apply("Read known-good data", Read.from(knownGoodSource));
  // Convert files to GCPResourceState objects.
  PCollection<KV<GCPResource, GCPResourceState>> knownGoodStates =
      knownGoodFiles.apply(ParDo.named("Convert file data to Java Objects")
          .of(new FileToState()));
  // Tag the state objects to indicate they're from a checked-in repo and not live.
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedKnownGoodStates =
      knownGoodStates.apply(ParDo.named("Mark states as being known-good")
          .of(new TagStateWithSource(StateSource.DESIRED)));

  // Read projects from the CRM API.
  PCollection<GCPProject> allProjects =
      pipeline.apply("Read live projects", Read.from(new LiveProjectSource(org)));
  // Extract project states.
  PCollection<KV<GCPResource, GCPResourceState>> liveStates =
      allProjects
          .apply(ParDo.named("Extract project policies").of(new ExtractState()));
  // Tag the states to indicate they're live and not from a checked-in source.
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedLiveStates =
      liveStates.apply(ParDo.named("Mark states as being live")
          .of(new TagStateWithSource(StateSource.LIVE)));

  // Join the two known-good and the live halves.
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> knownGoodStatesView =
      taggedKnownGoodStates.apply(View.<GCPResource, KV<StateSource, GCPResourceState>>asMap());
  PCollection<KV<GCPResource, Map<StateSource, GCPResourceState>>> mismatchedStates =
      taggedLiveStates.apply(ParDo.named("Find states that don't match")
          .withSideInputs(knownGoodStatesView)
          .of(new FilterOutMatchingState(knownGoodStatesView)));

  // Construct an alert message for all the discrepancies found and fix the discrepancies.
  return mismatchedStates
      .apply(ParDo.named("Fix discrepancies").of(discrepancyAutoFixMessenger));
}
 
Example #8
Source File: OnDemandLiveStateChecker.java    From policyscanner with Apache License 2.0
private PCollection<String> constructPipeline(Pipeline pipeline,
    BoundedSource<KV<List<String>, String>> knownGoodSource) {
  // Read files from GCS.
  PCollection<KV<List<String>, String>> knownGoodFiles =
      pipeline.apply("Read known-good data", Read.from(knownGoodSource));
  // Convert files to GCPResourceState objects.
  PCollection<KV<GCPResource, GCPResourceState>> knownGoodStates =
      knownGoodFiles.apply(ParDo.named("Convert file data to Java objects")
          .of(new FileToState()));
  // Tag the state objects to indicate they're from a checked-in repo and not live.
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedKnownGoodStates =
      knownGoodStates.apply(ParDo.named("Mark states as being known-good")
          .of(new TagStateWithSource(StateSource.DESIRED)));

  // Extract a list of checked-in projects from GCS.
  PCollection<List<String>> allFilePaths = knownGoodFiles
      .apply("Extract just the file paths", ParDo.of(new FilePathFromPair()));
  // Read the live version of the states of the checked-in projects.
  PCollection<KV<GCPResource, GCPResourceState>> liveStates =
      allFilePaths.apply(ParDo.named("Get live resource and states from file path")
          .of(new FilePathToLiveState()));
  // Tag the states to indicate they're live and not from a checked-in source.
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedLiveStates =
      liveStates.apply(ParDo.named("Mark states as being live")
          .of(new TagStateWithSource(StateSource.LIVE)));

  // Join the two known-good and the live halves.
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> liveStatesView =
      taggedLiveStates.apply(View.<GCPResource, KV<StateSource, GCPResourceState>>asMap());
  PCollection<KV<GCPResource, Map<StateSource, GCPResourceState>>> mismatchedStates =
      taggedKnownGoodStates.apply(ParDo.named("Find states that don't match")
          .withSideInputs(liveStatesView)
          .of(new FilterOutMatchingState(liveStatesView)));
  // Construct an alert message for all the discrepancies found.
  return mismatchedStates.apply(ParDo
      .named("Generate notification messages")
      .of(new StateDiscrepancyMessenger()));
}
 
Example #9
Source File: KafkaWindowedWordCountExample.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) {
	PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
	KafkaStreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
	options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
	options.setStreaming(true);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	System.out.println(options.getKafkaTopic() + " " + options.getZookeeper() + " " + options.getBroker() + " " + options.getGroup());
	Pipeline pipeline = Pipeline.create(options);

	Properties p = new Properties();
	p.setProperty("zookeeper.connect", options.getZookeeper());
	p.setProperty("bootstrap.servers", options.getBroker());
	p.setProperty("group.id", options.getGroup());

	// This is the Flink consumer that reads the input to
	// the program from a Kafka topic.
	FlinkKafkaConsumer08<String> kafkaConsumer = new FlinkKafkaConsumer08<>(
			options.getKafkaTopic(),
			new SimpleStringSchema(), p);

	PCollection<String> words = pipeline
			.apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Window.<String>into(FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	PCollection<KV<String, Long>> wordCounts =
			words.apply(Count.<String>perElement());

	wordCounts.apply(ParDo.of(new FormatAsStringFn()))
			.apply(TextIO.Write.to("./outputKafka.txt"));

	pipeline.run();
}
 
Example #10
Source File: JoinExamples.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws Exception {
	Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
	options.setStreaming(true);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	PTransform<? super PBegin, PCollection<String>> readSourceA =
			Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("FirstStream");
	PTransform<? super PBegin, PCollection<String>> readSourceB =
			Read.from(new UnboundedSocketSource<>("localhost", 9998, '\n', 3)).named("SecondStream");

	WindowFn<Object, ?> windowFn = FixedWindows.of(Duration.standardSeconds(options.getWindowSize()));

	Pipeline p = Pipeline.create(options);

	// the following two 'applys' create multiple inputs to our pipeline, one for each
	// of our two input sources.
	PCollection<String> streamA = p.apply(readSourceA)
			.apply(Window.<String>into(windowFn)
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());
	PCollection<String> streamB = p.apply(readSourceB)
			.apply(Window.<String>into(windowFn)
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	PCollection<String> formattedResults = joinEvents(streamA, streamB);
	formattedResults.apply(TextIO.Write.to("./outputJoin.txt"));
	p.run();
}
 
Example #11
Source File: FlinkBatchTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(Read.Bounded<T> transform, FlinkBatchTranslationContext context) {
	String name = transform.getName();
	BoundedSource<T> source = transform.getSource();
	PCollection<T> output = context.getOutput(transform);
	Coder<T> coder = output.getCoder();

	TypeInformation<T> typeInformation = context.getTypeInfo(output);

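	// Wrap the BoundedSource in a SourceInputFormat so Flink can read it as a batch DataSource.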
	DataSource<T> dataSource = new DataSource<>(context.getExecutionEnvironment(),
			new SourceInputFormat<>(source, context.getPipelineOptions()), typeInformation, name);

	context.setOutputDataSet(output, dataSource);
}
 
Example #12
Source File: FlinkBatchTransformTranslators.java    From flink-dataflow with Apache License 2.0
@Override
public void translateNode(AvroIO.Read.Bound<T> transform, FlinkBatchTranslationContext context) {
	String path = transform.getFilepattern();
	String name = transform.getName();
//	Schema schema = transform.getSchema();
	PValue output = context.getOutput(transform);

	TypeInformation<T> typeInformation = context.getTypeInfo(output);

	// This is super hacky, but unfortunately we cannot get the type otherwise
	Class<T> extractedAvroType;
	try {
		Field typeField = transform.getClass().getDeclaredField("type");
		typeField.setAccessible(true);
		@SuppressWarnings("unchecked")
		Class<T> avroType = (Class<T>) typeField.get(transform);
		extractedAvroType = avroType;
	} catch (NoSuchFieldException | IllegalAccessException e) {
		// we know that the field is there and it is accessible
		throw new RuntimeException("Could not access type from AvroIO.Bound", e);
	}

	DataSource<T> source = new DataSource<>(context.getExecutionEnvironment(),
			new AvroInputFormat<>(new Path(path), extractedAvroType),
			typeInformation, name);

	context.setOutputDataSet(output, source);
}
 
Example #13
Source File: UnboundedSourceWrapper.java    From flink-dataflow with Apache License 2.0
public UnboundedSourceWrapper(PipelineOptions pipelineOptions, Read.Unbounded<T> transform) {
	this.name = transform.getName();
	this.pipelineOptions = pipelineOptions;
	this.source = transform.getSource();
}