org.apache.beam.sdk.transforms.Sample Java Examples

The following examples show how to use org.apache.beam.sdk.transforms.Sample. Each example notes its source file, the project it comes from, and its license.
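
Before working through the examples, it may help to see the main factory methods on Sample in one place. The sketch below is illustrative only (class name, step names, and test data are not taken from the projects that follow) and assumes the Beam Java SDK and the Direct Runner are on the classpath: Sample.any(n) keeps up to n arbitrary elements, Sample.fixedSizeGlobally(n) draws a uniformly random sample emitted as a single Iterable, and Sample.fixedSizePerKey(n) samples up to n values per key.

import java.util.Arrays;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Sample;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class SampleOverviewSketch {

    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

        PCollection<String> words = p.apply("Words", Create.of("a", "b", "c", "d", "e"));

        // Sample.any(n): up to n elements, with no guarantee about which ones are chosen.
        PCollection<String> anyThree = words.apply("AnyThree", Sample.<String> any(3));

        // Sample.fixedSizeGlobally(n): a uniformly random sample of n elements, emitted as one Iterable.
        PCollection<Iterable<String>> randomTwo = words.apply("RandomTwo", Sample.<String> fixedSizeGlobally(2));

        // Sample.fixedSizePerKey(n): up to n randomly chosen values for each key.
        PCollection<KV<String, Iterable<Integer>>> perKey = p
                .apply("KVs", Create.of(Arrays.asList(KV.of("k", 1), KV.of("k", 2), KV.of("k", 3))))
                .apply("TwoPerKey", Sample.<String, Integer> fixedSizePerKey(2));

        p.run().waitUntilFinish();
    }
}
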
Example #1
Source File: BigQueryDatasetRuntime.java    From components with Apache License 2.0
public void getSampleDeprecated(int limit, Consumer<IndexedRecord> consumer) {
    // Create a pipeline using the input component to get records.
    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    // Create an input runtime based on the properties.
    BigQueryInputRuntime inputRuntime = new BigQueryInputRuntime();
    BigQueryInputProperties inputProperties = new BigQueryInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputRuntime.initialize(new BeamJobRuntimeContainer(options), inputProperties);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p
                .apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit))
                .apply(collector);
        PipelineResult pr = p.run();
        pr.waitUntilFinish();
    }
}
 
Example #2
Source File: ElasticsearchDatasetRuntime.java    From components with Apache License 2.0
@Override
public void getSample(int limit, Consumer<IndexedRecord> consumer) {
    // Create an input runtime based on the properties: read only the first batch of documents
    // from the index, since we are only computing a sample.
    ElasticsearchInputRuntime inputRuntime = new ElasticsearchInputRuntime(true);
    ElasticsearchInputProperties inputProperties = new ElasticsearchInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputRuntime.initialize(null, inputProperties);

    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p.apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit)).apply(collector);
        p.run().waitUntilFinish();
    }
}
 
Example #3
Source File: GroupTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testAggregateLogicalValuesGlobally() {
  Collection<BasicEnum> elements =
      Lists.newArrayList(
          BasicEnum.of("a", BasicEnum.Test.ONE), BasicEnum.of("a", BasicEnum.Test.TWO));

  CombineFn<EnumerationType.Value, ?, Iterable<EnumerationType.Value>> sampleAnyCombineFn =
      Sample.anyCombineFn(100);
  Field aggField =
      Field.of("sampleList", FieldType.array(FieldType.logicalType(BASIC_ENUM_ENUMERATION)));
  pipeline
      .apply(Create.of(elements))
      .apply(
          Group.<BasicEnum>globally().aggregateField("enumeration", sampleAnyCombineFn, aggField))
      .apply(
          ParDo.of(
              new DoFn<Row, List<Integer>>() {
                @ProcessElement
                // TODO: List<enum> doesn't get converted properly by ConvertHelpers, so the
                // following line does not work. To fix this we need to move logical-type
                // conversion out of RowWithGetters and into the actual getters.
                //    public void process(@FieldAccess("sampleList") List<BasicEnum.Test> values) {
                public void process(@Element Row value) {
                  assertThat(
                      value.getArray(0),
                      containsInAnyOrder(
                          BASIC_ENUM_ENUMERATION.valueOf(1), BASIC_ENUM_ENUMERATION.valueOf(2)));
                }
              }));

  pipeline.run();
}
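
Sample.anyCombineFn is not tied to the schema Group API shown above; since it is an ordinary CombineFn<T, ?, Iterable<T>>, it can also be plugged into Combine.globally directly. A minimal sketch (class and step names are illustrative and not taken from GroupTest):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Sample;
import org.apache.beam.sdk.values.PCollection;

public class AnyCombineFnSketch {

  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

    // Keep up to two arbitrary elements of the input, emitted as a single Iterable.
    PCollection<Iterable<String>> sample =
        p.apply(Create.of("a", "b", "c", "d"))
            .apply("AnyTwo", Combine.globally(Sample.<String>anyCombineFn(2)));

    p.run().waitUntilFinish();
  }
}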
 
Example #4
Source File: PubSubDatasetRuntime.java    From components with Apache License 2.0
@Override
public void getSample(int limit, Consumer<IndexedRecord> consumer) {
    // Because PubSub has no offsets and messages are deleted once read, we have to create a
    // dummy reader that does not ack the messages it reads.

    // Create an input runtime based on the properties.
    PubSubInputRuntime inputRuntime = new PubSubInputRuntime();
    PubSubInputProperties inputProperties = new PubSubInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputProperties.useMaxNumRecords.setValue(true);
    inputProperties.maxNumRecords.setValue(limit);
    inputProperties.useMaxReadTime.setValue(true);
    // 10s; for small datasets this value should ideally be tied to the ack deadline.
    inputProperties.maxReadTime.setValue(10000L);
    inputProperties.noACK.setValue(true);
    inputRuntime.initialize(null, inputProperties);

    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p.apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit)).apply(collector);
        p.run().waitUntilFinish();
    }
}
 
Example #5
Source File: KafkaDatasetRuntime.java    From components with Apache License 2.0
/**
 * @param limit the maximum number of records to return.
 * @param consumer a callback that will be applied to each sampled record. This callback should throw a
 * {@link org.talend.daikon.exception.TalendRuntimeException} if there was an error processing the record. Kafka is
 * an unbounded source, so a read timeout has to be set to stop reading; getSample uses a one-second timeout,
 * whether or not a sample has been collected by then (see the stock KafkaIO sketch after this example).
 */
@Override
public void getSample(int limit, Consumer<IndexedRecord> consumer) {
    // Create an input runtime based on the properties.
    KafkaInputPTransformRuntime inputRuntime = new KafkaInputPTransformRuntime();
    KafkaInputProperties inputProperties = new KafkaInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(dataset);
    inputProperties.useMaxReadTime.setValue(true);
    inputProperties.maxReadTime.setValue(1000L);
    inputProperties.autoOffsetReset.setValue(KafkaInputProperties.OffsetType.EARLIEST);
    // TODO: BEAM-1847: Enable both stopping conditions when they can be set, and remove Sample transform from job.
    // inputProperties.useMaxNumRecords.setValue(true);
    // inputProperties.maxNumRecords.setValue(Long.valueOf(limit));
    inputRuntime.initialize(null, inputProperties);

    // Create a pipeline using the input component to get records.
    PipelineOptions options = PipelineOptionsFactory.create();
    final Pipeline p = Pipeline.create(options);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p.apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit)).apply(collector);
        p.run().waitUntilFinish();
    }
}
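
For comparison with the Talend runtime above, the same bounded-read-then-sample idea can be expressed with the stock Beam KafkaIO connector, which supports both stopping conditions (withMaxNumRecords and withMaxReadTime) directly. The sketch below is an illustration, not the Talend implementation; the broker address, topic name, and deserializers are placeholders, and the beam-sdks-java-io-kafka and kafka-clients dependencies are assumed to be on the classpath.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.kafka.KafkaIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Sample;
import org.apache.beam.sdk.transforms.Values;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.joda.time.Duration;

public class KafkaSampleSketch {

    public static void main(String[] args) {
        final int limit = 10;
        Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

        // Bound the unbounded Kafka read by both time and record count, then sample.
        PCollection<KV<String, String>> records = p.apply("ReadFromKafka", KafkaIO.<String, String> read()
                .withBootstrapServers("localhost:9092") // placeholder broker
                .withTopic("sample-topic") // placeholder topic
                .withKeyDeserializer(StringDeserializer.class)
                .withValueDeserializer(StringDeserializer.class)
                .withMaxReadTime(Duration.standardSeconds(1))
                .withMaxNumRecords(limit)
                .withoutMetadata());

        PCollection<String> sampled = records.apply(Values.<String> create()).apply(Sample.<String> any(limit));

        p.run().waitUntilFinish();
    }
}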
 
Example #6
Source File: KinesisDatasetRuntime.java    From components with Apache License 2.0
@Override
public void getSample(int limit, Consumer<IndexedRecord> consumer) {
    // Create an input runtime based on the properties.
    KinesisInputRuntime inputRuntime = new KinesisInputRuntime();
    KinesisInputProperties inputProperties = new KinesisInputProperties(null);
    inputProperties.init();
    inputProperties.setDatasetProperties(properties);
    inputProperties.useMaxNumRecords.setValue(true);
    inputProperties.maxNumRecords.setValue(limit);
    inputProperties.useMaxReadTime.setValue(true);
    inputProperties.maxReadTime.setValue(10000L);
    inputProperties.position.setValue(KinesisInputProperties.OffsetType.EARLIEST);
    inputRuntime.initialize(null, inputProperties);

    // Create a pipeline using the input component to get records.
    DirectOptions options = BeamLocalRunnerOption.getOptions();
    final Pipeline p = Pipeline.create(options);

    try (DirectConsumerCollector<IndexedRecord> collector = DirectConsumerCollector.of(consumer)) {
        // Collect a sample of the input records.
        p
                .apply(inputRuntime) //
                .apply(Sample.<IndexedRecord> any(limit))
                .apply(collector);
        p.run().waitUntilFinish();
    }
}
 
Example #7
Source File: CsvConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PCollectionTuple lines) {

  PCollectionView<String> headersView = null;

  // Convert csv lines into Failsafe elements so that we can recover over multiple transforms.
  PCollection<FailsafeElement<String, String>> lineFailsafeElements =
      lines
          .get(lineTag())
          .apply("LineToFailsafeElement", ParDo.of(new LineToFailsafeElementFn()));

  // If UDF is specified then use that to parse csv lines.
  if (udfFileSystemPath() != null) {

    return lineFailsafeElements.apply(
        "LineToDocumentUsingUdf",
        FailsafeJavascriptUdf.<String>newBuilder()
            .setFileSystemPath(udfFileSystemPath())
            .setFunctionName(udfFunctionName())
            .setSuccessTag(udfOutputTag())
            .setFailureTag(udfDeadletterTag())
            .build());
  }

  // If no UDF is specified, use the JSON schema.
  if (jsonSchemaPath() != null) {

    String schema = SchemaUtils.getGcsFileAsString(jsonSchemaPath());

    return lineFailsafeElements.apply(
        "LineToDocumentUsingSchema",
        ParDo.of(
                new FailsafeElementToJsonFn(
                    headersView, schema, delimiter(), udfDeadletterTag()))
            .withOutputTags(udfOutputTag(), TupleTagList.of(udfDeadletterTag())));
  }

  // Otherwise, headers are used: take one header line and expose it as a singleton side input.
  headersView = lines.get(headerTag()).apply(Sample.any(1)).apply(View.asSingleton());

  PCollectionView<String> finalHeadersView = headersView;
  lines
      .get(headerTag())
      .apply(
          "CheckHeaderConsistency",
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                      String headers = c.sideInput(finalHeadersView);
                      if (!c.element().equals(headers)) {
                        LOG.error("Headers do not match, consistency cannot be guaranteed");
                        throw new RuntimeException(
                            "Headers do not match, consistency cannot be guaranteed");
                      }
                    }
                  })
              .withSideInputs(finalHeadersView));

  return lineFailsafeElements.apply(
      "LineToDocumentWithHeaders",
      ParDo.of(
              new FailsafeElementToJsonFn(
                  headersView, jsonSchemaPath(), delimiter(), udfDeadletterTag()))
          .withSideInputs(headersView)
          .withOutputTags(udfOutputTag(), TupleTagList.of(udfDeadletterTag())));
}
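
The Sample.any(1) + View.asSingleton() combination above is a compact way to turn a single representative element (here, the CSV header line) into a side input that every worker can read. A stripped-down sketch of just that pattern (class name, step names, and the toy data are illustrative, not taken from the template):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Sample;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;

public class HeaderSideInputSketch {

  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

    PCollection<String> headers = p.apply("Headers", Create.of("id,name,price"));
    PCollection<String> lines = p.apply("Lines", Create.of("1,apple,2.5", "2,pear,3.0"));

    // Keep exactly one header line and expose it as a singleton side input.
    final PCollectionView<String> headerView =
        headers.apply("OneHeader", Sample.<String>any(1)).apply(View.asSingleton());

    lines.apply(
        "PrefixWithHeader",
        ParDo.of(
                new DoFn<String, String>() {
                  @ProcessElement
                  public void processElement(ProcessContext c) {
                    String header = c.sideInput(headerView);
                    c.output(header + " -> " + c.element());
                  }
                })
            .withSideInputs(headerView));

    p.run().waitUntilFinish();
  }
}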