Java Code Examples for org.apache.beam.sdk.values.PBegin#apply()

The following examples show how to use org.apache.beam.sdk.values.PBegin#apply(). They are drawn from open source projects; the source file and originating project are listed above each example.
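Before the project examples, here is a minimal, self-contained sketch of PBegin#apply(): a root transform (here Create) is applied to the PBegin that marks the start of a pipeline, producing the pipeline's first PCollection. The class name PBeginApplyExample and the step name "CreateWords" are illustrative only and are not taken from any of the projects below.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;

public class PBeginApplyExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // PBegin represents the start of the pipeline; root transforms are applied to it.
    PBegin begin = PBegin.in(pipeline);

    // Applying a root transform to PBegin yields the pipeline's first PCollection.
    PCollection<String> words = begin.apply("CreateWords", Create.of("hello", "beam"));

    pipeline.run().waitUntilFinish();
  }
}

In the project examples that follow, the PBegin is handed to expand() by the SDK when a composite transform is applied to a Pipeline, and input.apply(...) plays the same role as begin.apply(...) above.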
Example 1
Source File: JdbcIO.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PBegin input) {
  checkArgument(getQuery() != null, "withQuery() is required");
  checkArgument(
      (getDataSourceProviderFn() != null),
      "withDataSourceConfiguration() or withDataSourceProviderFn() is required");

  Schema schema = inferBeamSchema();
  PCollection<Row> rows =
      input.apply(
          JdbcIO.<Row>read()
              .withDataSourceProviderFn(getDataSourceProviderFn())
              .withQuery(getQuery())
              .withCoder(RowCoder.of(schema))
              .withRowMapper(SchemaUtil.BeamRowMapper.of(schema))
              .withFetchSize(getFetchSize())
              .withOutputParallelization(getOutputParallelization())
              .withStatementPreparator(getStatementPreparator()));
  rows.setRowSchema(schema);
  return rows;
}
 
Example 2
Source File: AvroIO.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PBegin input) {
  checkNotNull(getFilepattern(), "filepattern");
  Coder<T> coder = inferCoder(getCoder(), getParseFn(), input.getPipeline().getCoderRegistry());

  if (getMatchConfiguration().getWatchInterval() == null && !getHintMatchesManyFiles()) {
    return input.apply(
        org.apache.beam.sdk.io.Read.from(
            AvroSource.from(getFilepattern()).withParseFn(getParseFn(), coder)));
  }

  // All other cases go through FileIO + ParseFilesGenericRecords.
  return input
      .apply("Create filepattern", Create.ofProvider(getFilepattern(), StringUtf8Coder.of()))
      .apply("Match All", FileIO.matchAll().withConfiguration(getMatchConfiguration()))
      .apply(
          "Read Matches",
          FileIO.readMatches().withDirectoryTreatment(DirectoryTreatment.PROHIBIT))
      .apply("Via ParseFiles", parseFilesGenericRecords(getParseFn()).withCoder(coder));
}
 
Example 3
Source File: HCatalogIO.java    From beam with Apache License 2.0
@Override
@SuppressWarnings("deprecation")
public PCollection<HCatRecord> expand(PBegin input) {
  checkArgument(getTable() != null, "withTable() is required");
  checkArgument(getConfigProperties() != null, "withConfigProperties() is required");
  Watch.Growth<Read, Integer, Integer> growthFn;
  if (getPollingInterval() != null) {
    growthFn = Watch.growthOf(new PartitionPollerFn()).withPollInterval(getPollingInterval());
    if (getTerminationCondition() != null) {
      growthFn = growthFn.withTerminationPerInput(getTerminationCondition());
    }
    return input
        .apply("ConvertToReadRequest", Create.of(this))
        .apply("WatchForNewPartitions", growthFn)
        .apply("PartitionReader", ParDo.of(new PartitionReaderFn(getConfigProperties())));
  } else {
    // Treat as Bounded
    checkArgument(
        getTerminationCondition() == null,
        "withTerminationCondition() is not required when using in bounded reads mode");
    return input.apply(org.apache.beam.sdk.io.Read.from(new BoundedHCatalogSource(this)));
  }
}
 
Example 4
Source File: TFRecordIO.java    From beam with Apache License 2.0
@Override
public PCollection<byte[]> expand(PBegin input) {
  if (getFilepattern() == null) {
    throw new IllegalStateException(
        "Need to set the filepattern of a TFRecordIO.Read transform");
  }

  if (getValidate()) {
    checkState(getFilepattern().isAccessible(), "Cannot validate with a RVP.");
    try {
      MatchResult matches = FileSystems.match(getFilepattern().get());
      checkState(
          !matches.metadata().isEmpty(),
          "Unable to find any files matching %s",
          getFilepattern().get());
    } catch (IOException e) {
      throw new IllegalStateException(
          String.format("Failed to validate %s", getFilepattern().get()), e);
    }
  }

  return input.apply("Read", org.apache.beam.sdk.io.Read.from(getSource()));
}
 
Example 5
Source File: KinesisIO.java    From beam with Apache License 2.0
@Override
public PCollection<KinesisRecord> expand(PBegin input) {
  Unbounded<KinesisRecord> unbounded =
      org.apache.beam.sdk.io.Read.from(
          new KinesisSource(
              getAWSClientsProvider(),
              getStreamName(),
              getInitialPosition(),
              getUpToDateThreshold(),
              getWatermarkPolicyFactory(),
              getRateLimitPolicyFactory(),
              getRequestRecordsLimit(),
              getMaxCapacityPerShard()));

  PTransform<PBegin, PCollection<KinesisRecord>> transform = unbounded;

  if (getMaxNumRecords() < Long.MAX_VALUE || getMaxReadTime() != null) {
    transform =
        unbounded.withMaxReadTime(getMaxReadTime()).withMaxNumRecords(getMaxNumRecords());
  }

  return input.apply(transform);
}
 
Example 6
Source File: CompareDatabases.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollection<Long> expand(PBegin begin) {

  final TupleTag<Struct> oneTag = new TupleTag<>();
  PCollection<KV<String, Struct>> rowsOne = begin.apply("Read one", new ReadAllRows(one));
  final TupleTag<Struct> twoTag = new TupleTag<>();
  PCollection<KV<String, Struct>> rowsTwo = begin.apply("Read two", new ReadAllRows(two));

  PCollection<KV<String, CoGbkResult>> cogroup =
      KeyedPCollectionTuple.of(oneTag, rowsOne).and(twoTag, rowsTwo).apply(CoGroupByKey.create());

  PCollection<String> fails =
      cogroup.apply(
          ParDo.of(
              new DoFn<KV<String, CoGbkResult>, String>() {

                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<String, CoGbkResult> element = c.element();
                  CoGbkResult gbk = element.getValue();
                  ArrayList<Struct> oneRows = Lists.newArrayList(gbk.getAll(oneTag));
                  ArrayList<Struct> twoRows = Lists.newArrayList(gbk.getAll(twoTag));

                  if (oneRows.size() != 1 || twoRows.size() != 1) {
                    c.output(element.getKey());
                    return;
                  }

                  Struct sOne = oneRows.get(0);
                  Struct sTwo = twoRows.get(0);

                  if (!sOne.equals(sTwo)) {
                    c.output(element.getKey());
                  }
                }
              }));

  return fails.apply(Count.globally());
}
 
Example 7
Source File: FixedInputRuntime.java    From components with Apache License 2.0
@Override
public PCollection<IndexedRecord> expand(PBegin begin) {
    FixedDatasetRuntime runtime = new FixedDatasetRuntime();
    runtime.initialize(null, properties.getDatasetProperties());

    // The values to include in the PCollection
    List<IndexedRecord> values = new LinkedList<>();

    if (properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.NONE
            || properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.APPEND) {
        if (!properties.getDatasetProperties().values.getValue().trim().isEmpty()) {
            values.addAll(runtime.getValues(Integer.MAX_VALUE));
        }
    }

    if (properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.APPEND
            || properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.REPLACE) {
        properties.getDatasetProperties().values.setValue(properties.overrideValues.getValue());
        if (!properties.getDatasetProperties().values.getValue().trim().isEmpty()) {
            values.addAll(runtime.getValues(Integer.MAX_VALUE));
        }
    }

    if (values.size() != 0) {
        PCollection<IndexedRecord> out = (PCollection<IndexedRecord>) begin
                .apply(Create.of(values).withCoder((AvroCoder) AvroCoder.of(runtime.getSchema())));
        if (properties.repeat.getValue() > 1) {
            PCollectionList<IndexedRecord> merged = PCollectionList.of(out);
            for (int i = 2; i < properties.repeat.getValue(); i++)
                merged = merged.and(out);
            out = merged.apply(Flatten.<IndexedRecord> pCollections());
        }
        return out;
    } else {
        return begin.apply(RowGeneratorIO.read().withSchema(runtime.getSchema()) //
                .withSeed(0L) //
                .withPartitions(1) //
                .withRows(properties.repeat.getValue()));
    }
}
 
Example 8
Source File: HCatalogTable.java    From beam with Apache License 2.0
@Override
public PCollection<Row> buildIOReader(PBegin begin) {
  return begin.apply(
      "HCatalog-Read-" + database() + "-" + table(),
      HCatToRow.fromSpec(
          HCatalogIO.read()
              .withConfigProperties(config())
              .withDatabase(database())
              .withTable(table())));
}
 
Example 9
Source File: KinesisInputRuntime.java    From components with Apache License 2.0
@Override
public PCollection<IndexedRecord> expand(PBegin in) {
    KinesisIO.Read kinesisRead = KinesisIO
            .read()
            .withStreamName(dataset.streamName.getValue())
            .withInitialPositionInStream(convertToPosition(properties.position.getValue()))
            .withAWSClientsProvider(KinesisClient.getProvider(dataset));
    if (properties.useMaxReadTime.getValue()) {
        kinesisRead = kinesisRead.withMaxReadTime(new Duration(properties.maxReadTime.getValue()));
    }
    if (properties.useMaxNumRecords.getValue()) {
        kinesisRead = kinesisRead.withMaxNumRecords(properties.maxNumRecords.getValue());
    }
    PCollection<KinesisRecord> kinesisRecordPCollection = in.apply(kinesisRead);

    switch (dataset.valueFormat.getValue()) {
    case AVRO: {
        Schema schema = new Schema.Parser().parse(dataset.avroSchema.getValue());
        return kinesisRecordPCollection.apply(ParDo.of(new AvroConverter(schema.toString()))).setCoder(
                getDefaultOutputCoder());
    }
    case CSV: {
        return kinesisRecordPCollection.apply(ParDo.of(new CsvConverter(dataset.getFieldDelimiter()))).setCoder(
                getDefaultOutputCoder());
    }
    default:
        throw new RuntimeException("To be implemented: " + dataset.valueFormat.getValue());
    }
}
 
Example 10
Source File: TestTableProvider.java    From beam with Apache License 2.0
@Override
public PCollection<Row> buildIOReader(PBegin begin) {
  TableWithRows tableWithRows =
      GLOBAL_TABLES
          .get(this.tableWithRows.tableProviderInstanceId)
          .get(this.tableWithRows.table.getName());
  return begin.apply(Create.of(tableWithRows.rows).withRowSchema(getSchema()));
}
 
Example 11
Source File: AvroIO.java    From beam with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public PCollection<T> expand(PBegin input) {
  checkNotNull(getFilepattern(), "filepattern");
  checkNotNull(getSchema(), "schema");

  if (getMatchConfiguration().getWatchInterval() == null && !getHintMatchesManyFiles()) {
    PCollection<T> read =
        input.apply(
            "Read",
            org.apache.beam.sdk.io.Read.from(
                createSource(
                    getFilepattern(),
                    getMatchConfiguration().getEmptyMatchTreatment(),
                    getRecordClass(),
                    getSchema(),
                    null)));
    return getInferBeamSchema() ? setBeamSchema(read, getRecordClass(), getSchema()) : read;
  }

  // All other cases go through FileIO + ReadFiles
  ReadFiles<T> readFiles =
      (getRecordClass() == GenericRecord.class)
          ? (ReadFiles<T>) readFilesGenericRecords(getSchema())
          : readFiles(getRecordClass());
  return input
      .apply("Create filepattern", Create.ofProvider(getFilepattern(), StringUtf8Coder.of()))
      .apply("Match All", FileIO.matchAll().withConfiguration(getMatchConfiguration()))
      .apply(
          "Read Matches",
          FileIO.readMatches().withDirectoryTreatment(DirectoryTreatment.PROHIBIT))
      .apply("Via ReadFiles", readFiles);
}
 
Example 12
Source File: DirectRunnerTest.java    From beam with Apache License 2.0
PTransform<PBegin, PCollection<T>> read() {
  return new PTransform<PBegin, PCollection<T>>() {
    @Override
    public PCollection<T> expand(PBegin input) {
      return input.apply("readFrom:" + name, Read.from(asSource()));
    }
  };
}
 
Example 13
Source File: BigQueryConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollection<TableRow> expand(PBegin pipeline) {

  if (options().getQuery() == null) {
    LOG.info("No query provided, reading directly from: " + options().getInputTableSpec());
    return pipeline.apply(
        "ReadFromBigQuery",
        BigQueryIO.readTableRows()
            .from(options().getInputTableSpec())
            .withTemplateCompatibility()
            .withMethod(Method.DIRECT_READ)
            .withCoder(TableRowJsonCoder.of()));

  } else {
    LOG.info("Using query: " + options().getQuery());

    if (!options().getUseLegacySql()) {

      LOG.info("Using Standard SQL");
      return pipeline.apply(
          "ReadFromBigQueryWithQuery",
          BigQueryIO.readTableRows()
              .fromQuery(options().getQuery())
              .withTemplateCompatibility()
              .usingStandardSql()
              .withCoder(TableRowJsonCoder.of()));
    } else {

      LOG.info("Using Legacy SQL");
      return pipeline.apply(
          "ReadFromBigQueryWithQuery",
          BigQueryIO.readTableRows()
              .fromQuery(options().getQuery())
              .withTemplateCompatibility()
              .withCoder(TableRowJsonCoder.of()));
    }
  }
}
 
Example 14
Source File: RowGeneratorRuntime.java    From components with Apache License 2.0
@Override
public PCollection<IndexedRecord> expand(PBegin begin) {
    RowGeneratorIO.Read read = RowGeneratorIO.read();
    read = read.withSchema(properties.schemaFlow.schema.getValue()) //
            .withPartitions(properties.nbPartitions.getValue()) //
            .withRows(properties.nbRows.getValue());

    if (properties.useSeed.getValue()) {
        read = read.withSeed(properties.seed.getValue());
    }

    // TODO(rskraba): partition skew

    return begin.apply(read);
}
 
Example 15
Source File: FixedFlowInputRuntime.java    From components with Apache License 2.0
@Override
public PCollection<IndexedRecord> expand(PBegin begin) {
    return begin.apply(Read.from(new FixedFlowInputBoundedSource() //
            .withSchema(properties.schemaFlow.schema.getValue())//
            .withValues(properties.values.getValue()) //
            .withNbRows(properties.nbRows.getValue())));
}
 
Example 16
Source File: CompareDatabases.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollection<KV<String, Struct>> expand(PBegin begin) {
  PCollectionView<Transaction> tx =
      begin.apply(LocalSpannerIO.createTransaction().withSpannerConfig(spanConfig));

  PCollection<Ddl> sourceDdl =
      begin.apply("Read Information Schema", new ReadInformationSchema(spanConfig, tx));

  final PCollectionView<Ddl> ddlView = sourceDdl.apply(View.asSingleton());

  PCollection<ReadOperation> tables = sourceDdl.apply(new BuildReadFromTableOperations());

  PCollection<Struct> rows =
      tables.apply("Read rows from tables", LocalSpannerIO.readAll().withTransaction(tx).withSpannerConfig(spanConfig));

  return rows.apply(
      ParDo.of(
              new DoFn<Struct, KV<String, Struct>>() {

                @ProcessElement
                public void processElement(ProcessContext c) {
                  Ddl ddl = c.sideInput(ddlView);
                  Struct struct = c.element();
                  String tableName = struct.getString(0);

                  Table table = ddl.table(tableName);
                  String key = tableName;
                  for (IndexColumn pk : table.primaryKeys()) {
                    Type columnType = struct.getColumnType(pk.name());
                    if (struct.isNull(pk.name())) {
                      key += "-NULL-";
                      continue;
                    }
                    switch (columnType.getCode()) {
                      case BOOL:
                        key += struct.getBoolean(pk.name());
                        break;
                      case INT64:
                        key += struct.getLong(pk.name());
                        break;
                      case STRING:
                        key += struct.getString(pk.name());
                        break;
                      case BYTES:
                        key +=
                            Base64.getEncoder()
                                .encodeToString(struct.getBytes(pk.name()).toByteArray());
                        break;
                      case FLOAT64:
                        key += struct.getDouble(pk.name());
                        break;
                      case TIMESTAMP:
                        key += struct.getTimestamp(pk.name());
                        break;
                      case DATE:
                        key += struct.getDate(pk.name());
                        break;
                      default:
                        throw new IllegalArgumentException("Unsupported PK type " + columnType);
                    }
                  }
                  c.output(KV.of(key, struct));
                }
              })
          .withSideInputs(ddlView));
}
 
Example 17
Source File: ParquetConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollection<GenericRecord> expand(PBegin input) {
  return input.apply(
      "ReadParquetFile",
      ParquetIO.read(SchemaUtils.getAvroSchema(schema())).from(inputFileSpec()));
}
 
Example 18
Source File: Write.java    From gcp-ingestion with Mozilla Public License 2.0
@Override
public PCollection<PubsubMessage> expand(PBegin input) {
  return input.apply(Create.empty(PubsubMessageWithAttributesCoder.of()));
}
 
Example 19
Source File: BeamKafkaInputTransform.java    From kettle-beam with Apache License 2.0
@Override public PCollection<KettleRow> expand( PBegin input ) {
  try {
    // Only initialize once on this node/vm
    //
    BeamKettle.init( stepPluginClasses, xpPluginClasses );

    // What's the list of topics?
    //
    List<String> topicList = new ArrayList<>();
    for ( String topic : topics.split( "," ) ) {
      topicList.add( Const.trim( topic ) );
    }

    // TODO: add custom configuration options to this map:
    Map<String, Object> consumerConfigUpdates = new HashMap<>(  );
    consumerConfigUpdates.put( "group.id", groupId );
    for (ConfigOption configOption : configOptions) {
      Object value;
      String optionValue = configOption.getValue();
      switch(configOption.getType()) {
        case String: value = optionValue; break;
        case Short: value = Short.valueOf( optionValue ); break;
        case Int: value = Integer.valueOf( optionValue ); break;
        case Long: value = Long.valueOf( optionValue ); break;
        case Double: value = Double.valueOf( optionValue ); break;
        case Boolean: value = Boolean.valueOf( optionValue ); break;
        default:
          throw new RuntimeException( "Config option parameter "+configOption.getParameter()+" uses unsupported type "+configOption.getType().name() );
      }
      consumerConfigUpdates.put(configOption.getParameter(), value);
    }

    KafkaIO.Read<String, String> io = KafkaIO.<String, String>read()
      .withBootstrapServers( bootstrapServers )
      .withConsumerConfigUpdates( consumerConfigUpdates )
      .withTopics( topicList )
      .withKeyDeserializer( StringDeserializer.class )
      .withValueDeserializer( StringDeserializer.class );

    if (usingProcessingTime) {
      io = io.withProcessingTime();
    }
    if (usingLogAppendTime) {
      io = io.withLogAppendTime();
    }
    if (usingCreateTime) {
      io = io.withCreateTime( Duration.ZERO ); // TODO Configure this
    }

    if (restrictedToCommitted) {
      io = io.withReadCommitted();
    }
    if (allowingCommitOnConsumedOffset) {
      io = io.commitOffsetsInFinalize();
    }

    // Read keys and values from Kafka
    //
    PCollection<KV<String, String>> kafkaConsumerOutput = input.apply( io.withoutMetadata() );

    // Now convert this into Kettle rows with a single String value in them
    //
    PCollection<KettleRow> output = kafkaConsumerOutput.apply(
      ParDo.of(new KVStringStringToKettleRowFn( stepname, rowMetaJson, stepPluginClasses, xpPluginClasses ))
    );

    return output;

  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error in Kafka input transform", e );
    throw new RuntimeException( "Error in Kafka input transform", e );
  }
}
 
Example 20
Source File: BigQueryTable.java    From beam with Apache License 2.0
@Override
public PCollection<Row> buildIOReader(PBegin begin) {
  return begin.apply("Read Input BQ Rows", getBigQueryTypedRead(getSchema()));
}