Python apache_beam.ptransform_fn() Examples

The following are 11 code examples of apache_beam.ptransform_fn(). Each example is taken from an open-source project; the source file, project, and license are noted above it.
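
For context, beam.ptransform_fn turns a plain Python function into a full PTransform: the function's first argument receives the input PCollection when the transform is applied with |, and any remaining arguments are bound when the transform is constructed. A minimal self-contained sketch (the names and data here are illustrative, not from the projects below):

import apache_beam as beam


@beam.ptransform_fn
def FilterAndDouble(pcoll, threshold):
  """Keeps elements above `threshold`, then doubles them."""
  return (pcoll
          | 'Filter' >> beam.Filter(lambda x: x > threshold)
          | 'Double' >> beam.Map(lambda x: x * 2))


with beam.Pipeline() as p:
  # The PCollection argument is supplied by the | application, so only
  # the remaining arguments are passed at construction time.
  # pylint: disable=no-value-for-parameter
  _ = (p
       | beam.Create([1, 5, 10])
       | 'FilterAndDouble' >> FilterAndDouble(threshold=2))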
Example #1
Source File: base_example_gen_executor.py    From tfx with Apache License 2.0
def GetInputSourceToExamplePTransform(self) -> beam.PTransform:
    """Returns PTransform for converting input source to records.

    The records are assumed to be tf.train.Example protos by default;
    subclasses can serialize any protocol buffer into bytes as the output
    PCollection, so long as the downstream component can consume it.

    Note that each input split will be transformed by this function separately.
    For complex use cases, consider overriding 'GenerateExamplesByBeam' instead.

    Here is an example PTransform:
      @beam.ptransform_fn
      @beam.typehints.with_input_types(beam.Pipeline)
      @beam.typehints.with_output_types(Union[tf.train.Example,
                                              tf.train.SequenceExample,
                                              bytes])
      def ExamplePTransform(
          pipeline: beam.Pipeline,
          exec_properties: Dict[Text, Any],
          split_pattern: Text) -> beam.pvalue.PCollection
    """
    pass 
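
A minimal concrete override, shaped after the signature in the docstring above, might look like the following. This is a hedged sketch: the TFRecord source, the 'input_base' execution property, and the os.path.join layout are illustrative assumptions, not part of the abstract contract.

import os

import apache_beam as beam
import tensorflow as tf


@beam.ptransform_fn
@beam.typehints.with_input_types(beam.Pipeline)
@beam.typehints.with_output_types(tf.train.Example)
def _TFRecordToExample(pipeline, exec_properties, split_pattern):
  """Reads TFRecord files matching split_pattern into tf.train.Example."""
  input_base = exec_properties['input_base']  # assumed execution property
  return (pipeline
          | 'ReadRecords' >> beam.io.ReadFromTFRecord(
              os.path.join(input_base, split_pattern))
          | 'ParseExamples' >> beam.Map(tf.train.Example.FromString))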
Example #2
Source File: tf_example_record.py    From tfx-bsl with Apache License 2.0
def _RawRecordToRecordBatchInternal(self,
                                      batch_size: Optional[int] = None
                                     ) -> beam.PTransform:

    @beam.typehints.with_input_types(bytes)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
      return (raw_records_pcoll
              | "Batch" >> beam.BatchElements(
                  **batch_util.GetBatchElementsKwargs(batch_size))
              | "Decode" >> beam.ParDo(
                  _DecodeBatchExamplesDoFn(self._GetSchemaForDecoding(),
                                           self.raw_record_column_name,
                                           self._can_produce_large_types)))

    return beam.ptransform_fn(_PTransformFn)() 
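
The trailing call in beam.ptransform_fn(_PTransformFn)() is the non-decorator spelling of the same idiom: beam.ptransform_fn wraps the function in a constructor that, when called, returns a PTransform whose expand() invokes the original function on the input PCollection. A sketch of the equivalence (names here are illustrative):

import apache_beam as beam


def _AddOne(pcoll):
  return pcoll | 'AddOne' >> beam.Map(lambda x: x + 1)

# Wrapping explicitly and instantiating, as in the examples on this page...
transform_a = beam.ptransform_fn(_AddOne)()


# ...is equivalent to decorating and then calling with no arguments.
@beam.ptransform_fn
def _AddOneDecorated(pcoll):
  return pcoll | 'AddOne' >> beam.Map(lambda x: x + 1)

transform_b = _AddOneDecorated()  # pylint: disable=no-value-for-parameter

with beam.Pipeline() as p:
  _ = p | beam.Create([1, 2, 3]) | 'IncrementA' >> transform_a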
Example #3
Source File: tf_sequence_example_record.py    From tfx-bsl with Apache License 2.0
def _RawRecordToRecordBatchInternal(self,
                                      batch_size: Optional[int] = None
                                     ) -> beam.PTransform:

    @beam.typehints.with_input_types(bytes)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
      return (raw_records_pcoll
              | "Batch" >> beam.BatchElements(
                  **batch_util.GetBatchElementsKwargs(batch_size))
              | "Decode" >> beam.ParDo(
                  _DecodeBatchExamplesDoFn(self._schema,
                                           self.raw_record_column_name,
                                           self._can_produce_large_types)))

    return beam.ptransform_fn(_PTransformFn)() 
Example #4
Source File: record_based_tfxio.py    From tfx-bsl with Apache License 2.0
def RawRecordToRecordBatch(self,
                             batch_size: Optional[int] = None
                            ) -> beam.PTransform:
    """Returns a PTransform that converts raw records to Arrow RecordBatches.

    The input PCollection must be from self.RawRecordBeamSource() (also see
    the documentation for that method).

    Args:
      batch_size: if not None, the `pa.RecordBatch` produced will be of the
        specified size. Otherwise it's automatically tuned by Beam.
    """

    @beam.typehints.with_input_types(bytes)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(pcoll: beam.pvalue.PCollection):
      return (pcoll
              | "RawRecordToRecordBatch" >>
              self._RawRecordToRecordBatchInternal(batch_size)
              | "CollectRecordBatchTelemetry" >>
              telemetry.ProfileRecordBatches(self._telemetry_descriptors,
                                             self._logical_format,
                                             self._physical_format))

    return beam.ptransform_fn(_PTransformFn)() 
Example #5
Source File: tft_unit.py    From transform with Apache License 2.0
def _RawRecordBeamSourceInternal(self):
    """A PTransform that maps batched instances to RecordBatches."""
    @beam.ptransform_fn
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _ptransform_fn(instances):
      return (instances
              | 'EncodeToTfExamples' >> beam.Map(
                  example_proto_coder.ExampleProtoCoder(self._schema).encode))

    return _ptransform_fn()  # pylint: disable=no-value-for-parameter

  # TODO(b/156761358): deprecated; remove after tfx-bsl 0.23 release. 
Example #6
Source File: test_util.py    From tfx-bsl with Apache License 2.0
def _RawRecordBeamSourceInternal(self) -> beam.PTransform:

    @beam.typehints.with_input_types(bytes)
    @beam.typehints.with_output_types(bytes)
    def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
      return raw_records_pcoll

    return beam.ptransform_fn(_PTransformFn)() 
Example #7
Source File: raw_tf_record.py    From tfx-bsl with Apache License 2.0
def _RawRecordToRecordBatchInternal(self,
                                      batch_size: Optional[int] = None
                                     ) -> beam.PTransform:

    @beam.typehints.with_input_types(beam.Pipeline)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(raw_record_pcoll: beam.pvalue.PCollection):
      return (raw_record_pcoll
              | "Batch" >> beam.BatchElements(
                  **batch_util.GetBatchElementsKwargs(batch_size))
              | "ToRecordBatch" >>
              beam.Map(_BatchedRecordsToArrow, self.raw_record_column_name,
                       self._can_produce_large_types))

    return beam.ptransform_fn(_PTransformFn)() 
Example #8
Source File: raw_tf_record.py    From tfx-bsl with Apache License 2.0
def _RawRecordBeamSourceInternal(self) -> beam.PTransform:

    @beam.typehints.with_input_types(beam.Pipeline)
    @beam.typehints.with_output_types(bytes)
    def _PTransformFn(pipeline: beam.pvalue.PCollection):
      return pipeline | "ReadFromTFRecord" >> beam.io.ReadFromTFRecord(
          self._file_pattern,
          coder=beam.coders.BytesCoder(),
          # TODO(b/114938612): Eventually remove this override.
          validate=False)

    return beam.ptransform_fn(_PTransformFn)() 
Example #9
Source File: record_based_tfxio.py    From tfx-bsl with Apache License 2.0
def BeamSource(self, batch_size: Optional[int] = None) -> beam.PTransform:

    @beam.typehints.with_input_types(beam.Pipeline)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(pipeline: beam.pvalue.PCollection):
      """Converts raw records to RecordBatches."""
      return (
          pipeline
          | "RawRecordBeamSource" >> self.RawRecordBeamSource()
          | "RawRecordToRecordBatch" >> self.RawRecordToRecordBatch(batch_size))

    return beam.ptransform_fn(_PTransformFn)() 
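
Applied to a pipeline, BeamSource yields Arrow RecordBatches directly. A hedged usage sketch, assuming a concrete TFXIO implementation such as tf_example_record.TFExampleRecord (the file pattern and batch size are illustrative):

import apache_beam as beam
from tfx_bsl.tfxio import tf_example_record

# Illustrative file pattern; any TFRecord files of tf.Examples would do.
tfxio = tf_example_record.TFExampleRecord(file_pattern='/path/to/examples*')
with beam.Pipeline() as p:
  record_batches = p | 'ReadToRecordBatch' >> tfxio.BeamSource(batch_size=1000)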
Example #10
Source File: csv_tfxio.py    From tfx-bsl with Apache License 2.0
def _RawRecordToRecordBatchInternal(self,
                                      batch_size: Optional[int] = None
                                     ) -> beam.PTransform:

    @beam.typehints.with_input_types(List[bytes])
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
      """Returns RecordBatch of csv lines."""

      # Decode raw csv lines to record batches.
      record_batches = (
          raw_records_pcoll
          | "CSVToRecordBatch" >> csv_decoder.CSVToRecordBatch(
              column_names=self._column_names,
              delimiter=self._delimiter,
              skip_blank_lines=self._skip_blank_lines,
              schema=self._schema,
              desired_batch_size=batch_size,
              multivalent_columns=self._multivalent_columns,
              secondary_delimiter=self._secondary_delimiter,
              produce_large_types=self._can_produce_large_types,
              raw_record_column_name=self._raw_record_column_name))

      return record_batches

    return beam.ptransform_fn(_PTransformFn)() 
Example #11
Source File: executor.py    From tfx with Apache License 2.0
def GetInputSourceToExamplePTransform(self) -> beam.PTransform:
    """Returns PTransform for importing records."""

    @beam.ptransform_fn
    @beam.typehints.with_input_types(beam.Pipeline)
    @beam.typehints.with_output_types(Union[tf.train.Example,
                                            tf.train.SequenceExample, bytes])
    def ImportRecord(pipeline: beam.Pipeline, exec_properties: Dict[Text, Any],
                     split_pattern: Text) -> beam.pvalue.PCollection:
      """PTransform to import records.

      The records are tf.train.Example, tf.train.SequenceExample,
      or serialized proto.

      Args:
        pipeline: Beam pipeline.
        exec_properties: A dict of execution properties.
          - input_base: input dir that contains input data.
        split_pattern: Split.pattern in the input config; a glob file pattern,
          relative to the root directory given by input_base, that maps to the
          input files.

      Returns:
        PCollection of records (tf.Example, tf.SequenceExample, or bytes).
      """
      output_payload_format = exec_properties.get(utils.OUTPUT_DATA_FORMAT_KEY)

      serialized_records = (
          pipeline
          # pylint: disable=no-value-for-parameter
          | _ImportSerializedRecord(exec_properties, split_pattern))
      if output_payload_format == example_gen_pb2.PayloadFormat.FORMAT_PROTO:
        return serialized_records
      elif (output_payload_format ==
            example_gen_pb2.PayloadFormat.FORMAT_TF_EXAMPLE):
        return (serialized_records
                | 'ToTFExample' >> beam.Map(tf.train.Example.FromString))
      elif (output_payload_format ==
            example_gen_pb2.PayloadFormat.FORMAT_TF_SEQUENCE_EXAMPLE):
        return (serialized_records
                | 'ToTFSequenceExample' >> beam.Map(
                    tf.train.SequenceExample.FromString))

      raise ValueError('output_payload_format must be one of FORMAT_TF_EXAMPLE,'
                       ' FORMAT_TF_SEQUENCE_EXAMPLE or FORMAT_PROTO')

    return ImportRecord
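
Because ImportRecord is decorated with @beam.ptransform_fn, the call site binds exec_properties and split_pattern at construction time and pipes the pipeline in, mirroring the signature documented in Example #1. A hedged sketch of how such a transform is applied (the surrounding executor, pipeline, and property variables are illustrative):

import_record = executor.GetInputSourceToExamplePTransform()
records = (
    pipeline
    # pylint: disable=no-value-for-parameter
    | 'ImportRecord' >> import_record(exec_properties, split_pattern))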