com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory Java Examples

The following examples show how to use com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory. Each example is drawn from an open-source project; the source file, project, and license are noted above the code.
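Before diving in, here is a minimal sketch of the construction patterns the examples rely on: create() for plain defaults, as(Class) for a typed view with defaults, and fromArgs(args).withValidation().as(Class) for command-line parsing. The MyOptions interface is hypothetical, invented for illustration; the annotations come from the same com.google.cloud.dataflow.sdk.options package.

import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;

public class PipelineOptionsFactoryDemo {

  // Options are declared as an interface; the factory generates the
  // implementation and maps "--input=..." flags onto the getters/setters.
  public interface MyOptions extends PipelineOptions {
    @Description("Path of the file to read from")
    @Default.String("gs://some-bucket/input.txt")
    String getInput();
    void setInput(String value);
  }

  public static void main(String[] args) {
    // Plain defaults, no command-line parsing (common in tests).
    PipelineOptions defaults = PipelineOptionsFactory.create();

    // Typed view with defaults only.
    MyOptions typed = PipelineOptionsFactory.as(MyOptions.class);

    // Registering the interface lets --help describe it; fromArgs parses
    // the flags and withValidation enforces any required options.
    PipelineOptionsFactory.register(MyOptions.class);
    MyOptions parsed =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(MyOptions.class);
  }
}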
Example #1
Source File: CoinbaseSource.java    From cloud-bigtable-examples with Apache License 2.0
public static void main(String[] args) {
  CloudBigtableOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CloudBigtableOptions.class);

  CloudBigtableScanConfiguration config =
      new CloudBigtableScanConfiguration.Builder()
          .withProjectId(options.getBigtableProjectId())
          .withInstanceId(options.getBigtableInstanceId())
          .withTableId(options.getBigtableTableId())
          .build();

  options.setStreaming(true);
  options.setRunner(DataflowPipelineRunner.class);

  Pipeline p = Pipeline.create(options);
  CloudBigtableIO.initializeForWrite(p);

  p.apply(Read.from(new CoinbaseSource()))
      .apply(ParDo.named("DeserializeCoinbase").of(new DeserializeCoinbase()))
      .apply(ParDo.of(new HBaseBigtableWriter()))
      .apply(CloudBigtableIO.writeToTable(config));

  p.run();
}
 
Example #2
Source File: LiveProjectSourceTest.java    From policyscanner with Apache License 2.0
@Test
public void testBundleSplitIsJustSource() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  List<LiveProjectSource> bundles = source.splitIntoBundles(0, null);
  assertEquals(bundles.size(), 1);
  assertEquals(bundles.get(0), source);

  bundles = source.splitIntoBundles(0, options);
  assertEquals(bundles.size(), 1);
  assertEquals(bundles.get(0), source);

  bundles = source.splitIntoBundles(1, options);
  assertEquals(bundles.size(), 1);
  assertEquals(bundles.get(0), source);

  bundles = source.splitIntoBundles(100000, options);
  assertEquals(bundles.size(), 1);
  assertEquals(bundles.get(0), source);

  bundles = source.splitIntoBundles(10, null);
  assertEquals(bundles.size(), 1);
  assertEquals(bundles.get(0), source);
}
 
Example #3
Source File: GCSFilesSourceTest.java    From policyscanner with Apache License 2.0
@Test
public void testReaderAdvance() {
  String objectName = REPOSITORY + this.source.getDirDelimiter() + "sampleProject";
  PipelineOptions options = PipelineOptionsFactory.create();
  BoundedReader<KV<List<String>, String>> reader;

  try {
    setUpGetFilesPage(objectName, 0);
    reader = this.source.createReader(options);
    assertFalse(reader.start());

    setUpGetFilesPage(objectName, 1);
    reader = this.source.createReader(options);
    assertTrue(reader.start());
    assertFalse(reader.advance());

    setUpGetFilesPage(objectName, 2);
    reader = this.source.createReader(options);
    assertTrue(reader.start());
    assertTrue(reader.advance());
    assertFalse(reader.advance());
  } catch (IOException e) {
    fail();
  }
}
 
Example #4
Source File: GCSFilesSourceTest.java    From policyscanner with Apache License 2.0
@Test
public void testReaderGetCurrent() {
  String projectName = "sampleProject";
  String objectName = REPOSITORY + this.source.getDirDelimiter() + projectName;
  String fileContent = "sample file content";
  ByteArrayOutputStream[] out = new ByteArrayOutputStream[1];
  PipelineOptions options = PipelineOptionsFactory.create();

  setUpGetFilesPage(objectName);
  setUpGetFileContent(fileContent, out);

  try {
    BoundedReader<KV<List<String>, String>> reader = this.source.createReader(options);
    reader.start();
    KV<List<String>, String> value = reader.getCurrent();
    assertEquals(value.getKey().size(), 2);
    assertEquals(value.getKey().get(0), REPOSITORY);
    assertEquals(value.getKey().get(1), projectName);
    assertEquals(value.getValue(), fileContent);
  } catch (IOException e) {
    fail();
  }
}
 
Example #5
Source File: LiveProjectSourceTest.java    From policyscanner with Apache License 2.0
@Test
public void testAdvanceWithoutStart() {
  PipelineOptions options = PipelineOptionsFactory.create();
  LiveProjectReader reader;

  this.listProjectsResponse.setProjects(new ArrayList<Project>(0));
  this.listProjectsResponse.setNextPageToken(null);
  try {
    reader = (LiveProjectReader) this.source.createReader(options);
    assertFalse(reader.advance());
    assertNull(reader.getNextPageToken());
    assertTrue(reader.getProjects().isEmpty());
    // getCurrent() on an empty source must throw NoSuchElementException.
    reader.getCurrent();
    fail("Expected NoSuchElementException from getCurrent()");
  } catch (IOException e) {
    fail("IOException in reader.start");
  } catch (NoSuchElementException ignored) {
    // test passed.
  }
}
 
Example #6
Source File: GCSFilesSourceTest.java    From policyscanner with Apache License 2.0
@Test
public void testBundleSplitIsJustSource() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  List<GCSFilesSource> bundles = source.splitIntoBundles(0, null);
  assertEquals(bundles.size(), 1);
  assertEquals(bundles.get(0), source);

  bundles = source.splitIntoBundles(0, options);
  assertEquals(bundles.size(), 1);
  assertEquals(bundles.get(0), source);

  bundles = source.splitIntoBundles(1, options);
  assertEquals(bundles.size(), 1);
  assertEquals(bundles.get(0), source);

  bundles = source.splitIntoBundles(100000, options);
  assertEquals(bundles.size(), 1);
  assertEquals(bundles.get(0), source);

  bundles = source.splitIntoBundles(10, null);
  assertEquals(bundles.size(), 1);
  assertEquals(bundles.get(0), source);
}
 
Example #7
Source File: DataflowFactory.java    From dockerflow with Apache License 2.0
/**
 * Create Dataflow pipeline options from the standard command-line options, "--project=",
 * "--runner=" and "--stagingLocation=".
 *
 * @param args the raw command-line arguments
 * @return the configured Dataflow pipeline options
 * @throws IOException if the options cannot be constructed from the arguments
 */
public static DataflowPipelineOptions pipelineOptions(String[] args) throws IOException {
  LOG.info("Set up Dataflow options");
  DataflowPipelineOptions o = PipelineOptionsFactory.as(DataflowPipelineOptions.class);

  Map<String, String> m = StringUtils.parseArgs(args);
  o.setProject(m.get(PROJECT));
  if (m.containsKey(STAGING)) {
    o.setStagingLocation(m.get(STAGING));
  } else if (m.containsKey(STAGING_LOCATION)) {
    o.setStagingLocation(m.get(STAGING_LOCATION));
  } else if (m.containsKey(WORKSPACE)) {
    o.setStagingLocation(m.get(WORKSPACE) + "/staging");
  }
  o.setRunner(runner(m.get(RUNNER)));
  o.setMaxNumWorkers(m.get(MAX_WORKERS) == null ? 1 : Integer.parseInt(m.get(MAX_WORKERS)));
  if (m.containsKey(MACHINE_TYPE)) {
    o.setWorkerMachineType(m.get(MACHINE_TYPE));
  } else {
    o.setWorkerMachineType(DEFAULT_MACHINE_TYPE);
  }
  return o;
}
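Note that the example above parses flags by hand via StringUtils.parseArgs and sets each option programmatically. For comparison, a sketch of the equivalent using the factory's own parser, assuming the flags follow the standard "--project=", "--stagingLocation=" and "--runner=" spelling:

DataflowPipelineOptions o =
    PipelineOptionsFactory.fromArgs(args).withValidation().as(DataflowPipelineOptions.class);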
 
Example #8
Source File: FilterRides.java    From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
      .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
      .timestampLabel("ts")
      .withCoder(TableRowJsonCoder.of()))

   .apply("filter lower Manhattan", ParDo.of(new FilterLowerManhattan()))

   .apply(PubsubIO.Write.named("WriteToPubsub")
      .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
      .withCoder(TableRowJsonCoder.of()));
  p.run();
}
 
Example #9
Source File: LoadBooks.java    From cloud-bigtable-examples with Apache License 2.0
public static void main(String[] args) {
  // CloudBigtableOptions is one way to retrieve the options.  It's not required.
  // https://github.com/GoogleCloudPlatform/cloud-bigtable-examples/blob/master/java/dataflow-connector-examples/src/main/java/com/google/cloud/bigtable/dataflow/example/HelloWorldWrite.java
  BigtableCsvOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtableCsvOptions.class);
  CloudBigtableTableConfiguration config =
      CloudBigtableTableConfiguration.fromCBTOptions(options);

  Pipeline p = Pipeline.create(options);

  CloudBigtableIO.initializeForWrite(p);

  PCollection<KV<String, Integer>> ngrams =
      applyPipelineToParseBooks(p.apply(TextIO.Read.from(options.getInputFile())));
  PCollection<Mutation> mutations = ngrams.apply(ParDo.of(ENCODE_NGRAM));
  mutations.apply(CloudBigtableIO.writeToTable(config));

  // Run the pipeline.
  p.run();
}
 
Example #10
Source File: CountRides.java    From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
      .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
      .timestampLabel("ts")
      .withCoder(TableRowJsonCoder.of()))

   .apply("window 1s", Window.into(FixedWindows.of(Duration.standardSeconds(1))))
   .apply("mark rides", MapElements.via(new MarkRides()))
   .apply("count similar", Count.perKey())
   .apply("format rides", MapElements.via(new TransformRides()))

   .apply(PubsubIO.Write.named("WriteToPubsub")
      .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
      .withCoder(TableRowJsonCoder.of()));

  p.run();
}
 
Example #11
Source File: AutoComplete.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws IOException {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);
  options.setCheckpointingInterval(1000L);
  options.setNumberOfExecutionRetries(5);
  options.setExecutionRetryDelay(3000L);
  options.setRunner(FlinkPipelineRunner.class);

  PTransform<? super PBegin, PCollection<String>> readSource =
          Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("WordStream");
  WindowFn<Object, ?> windowFn = FixedWindows.of(Duration.standardSeconds(options.getWindowSize()));

  // Create the pipeline.
  Pipeline p = Pipeline.create(options);
  PCollection<KV<String, List<CompletionCandidate>>> toWrite = p
    .apply(readSource)
    .apply(ParDo.of(new ExtractWordsFn()))
    .apply(Window.<String>into(windowFn)
            .triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
          .discardingFiredPanes())
    .apply(ComputeTopCompletions.top(10, options.getRecursive()));

  toWrite
    .apply(ParDo.named("FormatForPerTaskFile").of(new FormatForPerTaskLocalFile()))
    .apply(TextIO.Write.to("./outputAutoComplete.txt"));

  p.run();
}
 
Example #12
Source File: WindowedWordCount.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws IOException {
	StreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(StreamingWordCountOptions.class);
	options.setStreaming(true);
	options.setWindowSize(10L);
	options.setSlide(5L);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	LOG.info("Windpwed WordCount with Sliding Windows of " + options.getWindowSize() +
			" sec. and a slide of " + options.getSlide());

	Pipeline pipeline = Pipeline.create(options);

	PCollection<String> words = pipeline
			.apply(Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("StreamingWordCount"))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Window.<String>into(SlidingWindows.of(Duration.standardSeconds(options.getWindowSize()))
					.every(Duration.standardSeconds(options.getSlide())))
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	PCollection<KV<String, Long>> wordCounts =
			words.apply(Count.<String>perElement());

	wordCounts.apply(ParDo.of(new FormatAsStringFn()))
			.apply(TextIO.Write.to("./outputWordCount.txt"));

	pipeline.run();
}
 
Example #13
Source File: JoinExamples.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws Exception {
	Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
	options.setStreaming(true);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	PTransform<? super PBegin, PCollection<String>> readSourceA =
			Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("FirstStream");
	PTransform<? super PBegin, PCollection<String>> readSourceB =
			Read.from(new UnboundedSocketSource<>("localhost", 9998, '\n', 3)).named("SecondStream");

	WindowFn<Object, ?> windowFn = FixedWindows.of(Duration.standardSeconds(options.getWindowSize()));

	Pipeline p = Pipeline.create(options);

	// the following two 'applys' create multiple inputs to our pipeline, one for each
	// of our two input sources.
	PCollection<String> streamA = p.apply(readSourceA)
			.apply(Window.<String>into(windowFn)
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());
	PCollection<String> streamB = p.apply(readSourceB)
			.apply(Window.<String>into(windowFn)
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	PCollection<String> formattedResults = joinEvents(streamA, streamB);
	formattedResults.apply(TextIO.Write.to("./outputJoin.txt"));
	p.run();
}
 
Example #14
Source File: KafkaWindowedWordCountExample.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) {
	PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
	KafkaStreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
	options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
	options.setStreaming(true);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	System.out.println(options.getKafkaTopic() + " " + options.getZookeeper() + " " + options.getBroker() + " " + options.getGroup());
	Pipeline pipeline = Pipeline.create(options);

	Properties p = new Properties();
	p.setProperty("zookeeper.connect", options.getZookeeper());
	p.setProperty("bootstrap.servers", options.getBroker());
	p.setProperty("group.id", options.getGroup());

	// this is the Flink consumer that reads the input to
	// the program from a kafka topic.
	FlinkKafkaConsumer08<String> kafkaConsumer = new FlinkKafkaConsumer08<>(
			options.getKafkaTopic(),
			new SimpleStringSchema(), p);

	PCollection<String> words = pipeline
			.apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Window.<String>into(FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	PCollection<KV<String, Long>> wordCounts =
			words.apply(Count.<String>perElement());

	wordCounts.apply(ParDo.of(new FormatAsStringFn()))
			.apply(TextIO.Write.to("./outputKafka.txt"));

	pipeline.run();
}
 
Example #15
Source File: WordCount.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) {

		Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
				.as(Options.class);
		options.setRunner(FlinkPipelineRunner.class);

		Pipeline p = Pipeline.create(options);

		p.apply(TextIO.Read.named("ReadLines").from(options.getInput()))
				.apply(new CountWords())
				.apply(MapElements.via(new FormatAsTextFn()))
				.apply(TextIO.Write.named("WriteCounts").to(options.getOutput()));

		p.run();
	}
 
Example #16
Source File: TFIDF.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws Exception {
	Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);

	options.setRunner(FlinkPipelineRunner.class);

	Pipeline pipeline = Pipeline.create(options);
	pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

	pipeline
			.apply(new ReadDocuments(listInputDocuments(options)))
			.apply(new ComputeTfIdf())
			.apply(new WriteTfIdf(options.getOutput()));

	pipeline.run();
}
 
Example #17
Source File: FlinkPipelineRunner.java    From flink-dataflow with Apache License 2.0
/**
 * Constructs a runner with default properties for testing.
 *
 * @return The newly created runner.
 */
public static FlinkPipelineRunner createForTest(boolean streaming) {
	FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
	// we use [auto] for testing since this will make it pick up the Testing
	// ExecutionEnvironment
	options.setFlinkMaster("[auto]");
	options.setStreaming(streaming);
	return new FlinkPipelineRunner(options);
}
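Callers can then execute a pipeline directly against the returned runner; a hypothetical usage sketch:

FlinkPipelineRunner runner = FlinkPipelineRunner.createForTest(true);
runner.run(p);  // p is a Pipeline assembled elsewhere against compatible options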
 
Example #18
Source File: LiveProjectSourceTest.java    From policyscanner with Apache License 2.0
@Test
public void testAdvanceWhenPageTokenNull() {
  String projectName = "sampleProjectName";
  String projectId = "sampleProjectId";
  String orgId = ORG;
  ResourceId resourceId = new ResourceId().setId(orgId);
  GCPProject gcpProject = new GCPProject(projectId, orgId, projectName);
  Project project =
      new Project()
          .setProjectId(projectId)
          .setParent(resourceId)
          .setName(projectName)
          .setLifecycleState("ACTIVE");
  List<Project> projects = Arrays.asList(project);
  PipelineOptions options = PipelineOptionsFactory.create();
  LiveProjectReader reader;

  this.listProjectsResponse.setProjects(projects);
  this.listProjectsResponse.setNextPageToken(null);
  try {
    reader = (LiveProjectReader) this.source.createReader(options);
    assertTrue(reader.start());
    assertEquals(reader.getNextPageToken(), null);
    assertEquals(reader.getCurrent(), gcpProject);
    assertFalse(reader.advance());
    reader.getCurrent();
    fail("No exception when reading from empty source");
  } catch (IOException e) {
    fail("IOException in reader.start");
  } catch (NoSuchElementException ignored) {
    // test passed.
  }
}
 
Example #19
Source File: JoinExamples.java    From flink-dataflow with Apache License 2.0
public static void main(String[] args) throws Exception {
	Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
	Pipeline p = Pipeline.create(options);
	// the following two 'applys' create multiple inputs to our pipeline, one for each
	// of our two input sources.
	PCollection<TableRow> eventsTable = p.apply(BigQueryIO.Read.from(GDELT_EVENTS_TABLE));
	PCollection<TableRow> countryCodes = p.apply(BigQueryIO.Read.from(COUNTRY_CODES));
	PCollection<String> formattedResults = joinEvents(eventsTable, countryCodes);
	formattedResults.apply(TextIO.Write.to(options.getOutput()));
	p.run();
}
 
Example #20
Source File: ExactDollarRides.java    From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
      .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
      .timestampLabel("ts")
      .withCoder(TableRowJsonCoder.of()))

   .apply("extract dollars",
      MapElements.via((TableRow x) -> Double.parseDouble(x.get("meter_increment").toString()))
        .withOutputType(TypeDescriptor.of(Double.class)))

   .apply("fixed window", Window.into(FixedWindows.of(Duration.standardMinutes(1))))
   .apply("trigger",
      Window.<Double>triggering(
        AfterWatermark.pastEndOfWindow()
          .withEarlyFirings(AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.standardSeconds(1)))
          .withLateFirings(AfterPane.elementCountAtLeast(1)))
        .accumulatingFiredPanes()
        .withAllowedLateness(Duration.standardMinutes(5)))

   .apply("sum whole window", Sum.doublesGlobally().withoutDefaults())
   .apply("format rides", ParDo.of(new TransformRides()))

   .apply(PubsubIO.Write.named("WriteToPubsub")
      .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
      .withCoder(TableRowJsonCoder.of()));
  p.run();
}
 
Example #21
Source File: DesiredStateEnforcerApp.java    From policyscanner with Apache License 2.0
private PipelineOptions getCloudExecutionOptions(String stagingLocation) {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setProject(SystemProperty.applicationId.get());
  options.setStagingLocation(stagingLocation);
  options.setRunner(BlockingDataflowPipelineRunner.class);
  return options;
}
 
Example #22
Source File: TimestampRides.java    From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
      .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
      .timestampLabel("ts")
      .withCoder(TableRowJsonCoder.of()))

   .apply("window 1s", Window.into(FixedWindows.of(Duration.standardSeconds(1))))

   .apply("parse timestamps",
      MapElements.via(
        (TableRow e) ->
          Instant.from(DateTimeFormatter.ISO_DATE_TIME.parse(e.get("timestamp").toString())).toEpochMilli())
      .withOutputType(TypeDescriptor.of(Long.class)))

   .apply("max timestamp in window", Max.longsGlobally().withoutDefaults())

   .apply("transform",
      MapElements.via(
        (Long t) -> {
          TableRow ride = new TableRow();
          ride.set("timestamp", Instant.ofEpochMilli(t).toString());
          return ride;
        })
      .withOutputType(TypeDescriptor.of(TableRow.class)))

   .apply(PubsubIO.Write.named("write to PubSub")
      .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
      .withCoder(TableRowJsonCoder.of()));
  p.run();
}
 
Example #23
Source File: DebugFewRides.java    From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
      .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
      .timestampLabel("ts")
      .withCoder(TableRowJsonCoder.of()))

   .apply("filter a few rides",
      Filter.byPredicate(
        (TableRow t) -> {
          String rideId = t.get("ride_id").toString();

          // You can change the filter here to allow more or fewer rides through:
          // rideIds starting with "a" are quite common
          // rideIds starting with "ab" are rarer
          // rideIds starting with "abc" are rarer still
          if (rideId.startsWith("ab")) {
            LOG.info("Accepted point on ride {} with order number {}} timestamp {}",
              t.get("ride_id"), t.get("point_idx"), t.get("timestamp"));
            return true;
          }
          return false;
        }))

   .apply(PubsubIO.Write.named("WriteToPubsub")
      .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
      .withCoder(TableRowJsonCoder.of()));

  p.run();
}
 
Example #24
Source File: LatestRides.java    From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0 5 votes vote down vote up
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
      .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
      .timestampLabel("ts")
      .withCoder(TableRowJsonCoder.of()))

   .apply("key rides by rideid",
      MapElements.via((TableRow ride) -> KV.of(ride.get("ride_id").toString(), ride))
        .withOutputType(new TypeDescriptor<KV<String, TableRow>>() {}))

   .apply("session windows on rides with early firings",
      Window.<KV<String, TableRow>>into(
        Sessions.withGapDuration(Duration.standardMinutes(60)))
          .triggering(
            AfterWatermark.pastEndOfWindow()
              .withEarlyFirings(AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.millis(2000))))
          .accumulatingFiredPanes()
          .withAllowedLateness(Duration.ZERO))

   .apply("group ride points on same ride", Combine.perKey(new LatestPointCombine()))

   .apply("discard key",
      MapElements.via((KV<String, TableRow> a) -> a.getValue())
        .withOutputType(TypeDescriptor.of(TableRow.class)))

   .apply(PubsubIO.Write.named("WriteToPubsub")
      .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
      .withCoder(TableRowJsonCoder.of()));
  p.run();
}
 
Example #25
Source File: LiveStateCheckerApp.java    From policyscanner with Apache License 2.0
private PipelineOptions getCloudExecutionOptions(String stagingLocation) {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setProject(Constants.PROJECT_ID);
  options.setStagingLocation(stagingLocation);
  options.setRunner(BlockingDataflowPipelineRunner.class);
  return options;
}
 
Example #26
Source File: UserManagedKeysApp.java    From policyscanner with Apache License 2.0 5 votes vote down vote up
private PipelineOptions getCloudExecutionOptions(String stagingLocation) {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setProject(SystemProperty.applicationId.get());
  options.setStagingLocation(stagingLocation);
  options.setRunner(BlockingDataflowPipelineRunner.class);
  return options;
}
 
Example #27
Source File: LiveStateCheckerRunner.java    From policyscanner with Apache License 2.0
private static PipelineOptions getCloudExecutionOptions(String stagingLocation) {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setProject(SystemProperty.applicationId.get());
  options.setStagingLocation(stagingLocation);
  options.setRunner(BlockingDataflowPipelineRunner.class);
  return options;
}
 
Example #28
Source File: GCSFilesSourceTest.java    From policyscanner with Apache License 2.0
@Test
public void testReaderStart() {
  String objectName = REPOSITORY + this.source.getDirDelimiter() + "sampleProject";
  PipelineOptions options = PipelineOptionsFactory.create();
  setUpGetFilesPage(objectName);
  try {
    assertTrue(this.source.createReader(options).start());
  } catch (IOException e) {
    fail();
  }
}
 
Example #29
Source File: LiveStateCheckerTest.java    From policyscanner with Apache License 2.0
@Test
public void testUnmatchedStatesOutputIsCorrect() throws IOException {
  // create the policy for the live project
  String editorRole = "roles/editor";
  String editorMember = "serviceAccount:[email protected]";
  String ownerRole = "roles/owner";
  String ownerMember = "user:[email protected]";
  String fileContent = "[\n"
      + "      {\n"
      + "        \"role\": \"" + ownerRole + "\",\n"
      + "        \"members\": [\n"
      + "          \"" + ownerMember + "\"\n"
      + "        ]\n"
      + "      },\n"
      + "      {\n"
      + "        \"role\": \"" + editorRole + "\",\n"
      + "        \"members\": [\n"
      + "          \"" + editorMember + "\"\n"
      + "        ]\n"
      + "      }\n"
      + "    ]";
  String liveProjectName = "someLiveProjectName";
  String liveProjectId = "someLiveProjectId";
  String orgId = ORG_ID;
  ResourceId resourceId = new ResourceId().setId(orgId);
  Project liveProject =
      new Project()
          .setProjectId(liveProjectId)
          .setParent(resourceId)
          .setName(liveProjectName)
          .setLifecycleState("ACTIVE");
  Binding editorBinding = new Binding()
      .setRole(editorRole)
      .setMembers(Arrays.asList(editorMember));
  Binding ownerBinding = new Binding()
      .setRole(ownerRole)
      .setMembers(Arrays.asList(ownerMember));
  List<Binding> bindings = Arrays.asList(ownerBinding, editorBinding);
  Policy iamPolicy = new Policy().setBindings(bindings);
  // when calling projects().list(), return the live project
  when(listProjects.execute())
  .thenReturn(this.listProjectsResponse
      .setNextPageToken("halting string")
      .setProjects(Arrays.asList(liveProject)));
  when(this.getIamPolicy.execute()).thenReturn(iamPolicy);

  // mock out the desired policy
  String desiredProjectId = "someKnownGoodProject";
  String desiredPolicyPath = ORG_ID + DELIM + desiredProjectId + DELIM + POLICY_FILE;

  setUpGetFileContent(fileContent);
  setUpGetFilesPage(desiredPolicyPath);

  PipelineOptions options = PipelineOptionsFactory.create();

  LiveStateChecker liveStateChecker =
      new LiveStateChecker(options, this.checkedSource, ORG_ID)
        .build();

  String[] expectedOutput = new String[] {
      "DESIRED:someKnownGoodProject",
      "LIVE:someLiveProjectId"
  };

  DataflowAssert
      .that(liveStateChecker.getUnmatchedStatesOutput())
      .containsInAnyOrder(expectedOutput);

  liveStateChecker.run();
}
 
Example #30
Source File: DesiredStateEnforcerApp.java    From policyscanner with Apache License 2.0
private PipelineOptions getLocalExecutionOptions() {
  return PipelineOptionsFactory.create();
}