Java Code Examples for org.apache.beam.sdk.values.KV

The following examples show how to use org.apache.beam.sdk.values.KV. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
/**
 * Builds the changelog table schema for the given target table.
 *
 * <p>The key and value schemas for the table are looked up in the {@code schemaMapView} side
 * input and converted to BigQuery record fields, followed by the operation, timestamp and table
 * name columns.
 *
 * @param targetTable name used to look up the key/value schema pair in the side-input map
 * @return the changelog table schema
 */
@Override
public TableSchema getSchema(String targetTable) {
  Map<String, KV<Schema, Schema>> schemasByTable = this.sideInput(schemaMapView);
  KV<Schema, Schema> pkAndRowSchemas = schemasByTable.get(targetTable);

  // Full record payload; nullable because this field is null for deletions.
  TableFieldSchema fullRecordField = new TableFieldSchema();
  fullRecordField.setName("fullRecord");
  fullRecordField.setType("RECORD");
  fullRecordField.setMode("NULLABLE");
  fullRecordField.setFields(
      BigQueryUtils.toTableSchema(pkAndRowSchemas.getValue()).getFields());

  // Primary key columns, taken from the key half of the schema pair.
  TableFieldSchema primaryKeyField = new TableFieldSchema();
  primaryKeyField.setName("primaryKey");
  primaryKeyField.setType("RECORD");
  primaryKeyField.setFields(
      BigQueryUtils.toTableSchema(pkAndRowSchemas.getKey()).getFields());

  TableFieldSchema operationField = new TableFieldSchema().setName("operation").setType("STRING");
  TableFieldSchema timestampField = new TableFieldSchema().setName("timestampMs").setType("INT64");
  TableFieldSchema tableNameField = new TableFieldSchema().setName("tableName").setType("STRING");

  return new TableSchema()
      .setFields(
          Arrays.asList(
              fullRecordField, primaryKeyField, operationField, timestampField, tableNameField));
}
 
Example 2
Source Project: kettle-beam   Source File: KVStringStringToKettleRowFn.java    License: Apache License 2.0 7 votes vote down vote up
/**
 * Converts one {@code KV<String,String>} element into a {@link KettleRow}.
 *
 * <p>The row is allocated with {@code rowMeta.size()} slots; the key is stored at index 0 and the
 * value at index 1. Any failure is counted, logged and rethrown as a {@link RuntimeException}
 * that carries the original exception as its cause.
 *
 * @param processContext context supplying the input element and receiving the output row
 */
@ProcessElement
public void processElement( ProcessContext processContext ) {
  try {

    KV<String,String> kv = processContext.element();
    inputCounter.inc();

    Object[] outputRow = RowDataUtil.allocateRowData( rowMeta.size() );
    outputRow[ 0 ] = kv.getKey(); // String
    outputRow[ 1 ] = kv.getValue(); // String

    processContext.output( new KettleRow( outputRow ) );
    writtenCounter.inc();

  } catch ( Exception e ) {
    numErrors.inc();
    // The element type handled here is KV<String,String>; the message must say so, otherwise
    // the log points at the wrong conversion function.
    final String message = "Error in KV<String,String> to Kettle Row conversion function";
    LOG.error( message, e );
    throw new RuntimeException( message, e );
  }
}
 
Example 3
Source Project: dlp-dataflow-deidentification   Source File: S3Import.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Computes the initial restriction for a file as a range of batch indexes starting at 1.
 *
 * <p>The upper bound of the returned range is exclusive, so it is one more than the number of
 * {@code BATCH_SIZE}-byte batches needed to cover the file. A file smaller than one batch still
 * gets a single batch ({@code [1, 2)}).
 *
 * <p>NOTE(review): this assumes the consumer reads one {@code BATCH_SIZE} chunk per claimed
 * index — confirm against the matching process method.
 *
 * @param file key and readable file whose byte size determines the number of batches
 * @return the range {@code [1, totalSplit)}
 * @throws IOException declared for callers; no statement shown here is known to raise it
 */
@GetInitialRestriction
public OffsetRange getInitialRestriction(KV<String, ReadableFile> file) throws IOException {
  long totalBytes = file.getValue().getMetadata().sizeBytes();
  long totalSplit;
  if (totalBytes < BATCH_SIZE) {
    totalSplit = 2;
  } else {
    long fullBatches = totalBytes / BATCH_SIZE;
    long remaining = totalBytes % BATCH_SIZE;
    // +1 for the exclusive upper bound, plus one more when a partial trailing batch exists.
    // Previously nothing was added when the size was an exact multiple of BATCH_SIZE, which
    // dropped the last batch (and produced an empty range for a file of exactly one batch).
    totalSplit = fullBatches + (remaining > 0 ? 2 : 1);
  }

  LOG.debug(
      "Total Bytes {} for File {} -Initial Restriction range from 1 to: {}",
      totalBytes,
      file.getKey(),
      totalSplit);
  return new OffsetRange(1, totalSplit);
}
 
Example 4
Source Project: DataflowTemplates   Source File: KafkaToBigQuery.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Turns a failed element into a {@link TableRow} describing the failure and emits it.
 *
 * <p>The row carries the element timestamp formatted in UTC, the error message, the stack trace
 * and a string rendering of the original key/value payload.
 *
 * @param context context supplying the failsafe element and its timestamp
 */
@ProcessElement
public void processElement(ProcessContext context) {
  FailsafeElement<KV<String, String>, String> failed = context.element();
  final KV<String, String> original = failed.getOriginalPayload();

  // Render the element timestamp in UTC for insertion.
  String formattedTimestamp =
      TIMESTAMP_FORMATTER.print(context.timestamp().toDateTime(DateTimeZone.UTC));

  final TableRow row = new TableRow();
  row.set("timestamp", formattedTimestamp);
  row.set("errorMessage", failed.getErrorMessage());
  row.set("stacktrace", failed.getStacktrace());

  // The payload column is always set; a null key or value is rendered as an empty string.
  String keyText = original.getKey() == null ? "" : original.getKey();
  String valueText = original.getValue() == null ? "" : original.getValue();
  row.set("payloadString", "key: " + keyText + "value: " + valueText);

  context.output(row);
}
 
Example 5
/**
 * Derives a table schema from the cells of the destination's table row.
 *
 * <p>Each cell contributes one column, named after the first key of the cell (passed through
 * {@code checkHeaderName}). Every column is typed as STRING.
 *
 * @param destination pair whose value is the row used to derive the column names
 * @return a schema with one STRING field per cell
 */
@Override
public TableSchema getSchema(KV<String, TableRow> destination) {
  List<TableFieldSchema> columns = new ArrayList<TableFieldSchema>();
  for (Map<String, Object> cell : destination.getValue().getF()) {
    String header = cell.keySet().iterator().next();
    // Currently all BQ data types are set to String.
    columns.add(new TableFieldSchema().setName(checkHeaderName(header)).setType("STRING"));
  }
  return new TableSchema().setFields(columns);
}
 
Example 6
/**
 * Builds a schema whose fields mirror the cells of the row carried by {@code destination}.
 *
 * <p>The field name for a cell is the first key found in that cell, after it has been run
 * through {@code checkHeaderName}. All fields use the STRING type.
 *
 * @param destination key/row pair; only the row is inspected
 * @return the derived schema
 */
@Override
public TableSchema getSchema(KV<String, TableRow> destination) {
  List<TableCell> rowCells = destination.getValue().getF();
  List<TableFieldSchema> stringFields = new ArrayList<TableFieldSchema>();
  for (TableCell cell : rowCells) {
    Map<String, Object> cellContents = cell;
    String columnName = checkHeaderName(cellContents.keySet().iterator().next());
    // Currently all BQ data types are set to String.
    stringFields.add(new TableFieldSchema().setName(columnName).setType("STRING"));
  }

  TableSchema derived = new TableSchema();
  derived.setFields(stringFields);
  return derived;
}
 
Example 7
Source Project: deployment-examples   Source File: ExampleEchoPipelineTest.java    License: MIT License 6 votes vote down vote up
/**
 * Runs the external process with the element's value as its single command and emits each
 * returned result string keyed by the element's key.
 *
 * @param c context supplying the input element and receiving the outputs
 * @throws Exception any failure, after it has been logged
 */
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  try {
    // The library takes a single command in position 0, which it echoes back in the result.
    SubProcessCommandLineArgs args = new SubProcessCommandLineArgs();
    args.putCommand(new Command(0, String.valueOf(c.element().getValue())));

    // The kernel deals with the execution of the process.
    SubProcessKernel processKernel = new SubProcessKernel(configuration, binaryName);

    // Run the command and emit one output per result.
    for (String echoed : processKernel.exec(args)) {
      c.output(KV.of(c.element().getKey(), echoed));
    }
  } catch (Exception failure) {
    LOG.error("Error processing element ", failure);
    throw failure;
  }
}
 
Example 8
/**
 * Emits the actions needed for a table: a create-table action the first time the table is seen,
 * and a merge-statement action for every element.
 *
 * <p>Whether the table was already created is tracked in the {@code table_created} state and
 * mirrored in the {@code createdCache} field, so the state is only read while the cached flag is
 * still false.
 *
 * @param c context supplying the table name and its key/value schemas
 * @param tableCreated state recording whether the create action was already emitted
 */
@ProcessElement
public void processElement(
    ProcessContext c,
    @StateId("table_created") ValueState<Boolean> tableCreated) {
  KV<String, KV<Schema, Schema>> tableAndSchemas = c.element();

  // Consult the persisted state only while the in-memory flag is still false.
  if (!createdCache) {
    createdCache = firstNonNull(tableCreated.read(), false);
  }

  // Now that the flag is authoritative, emit the create action if it has not been emitted yet.
  if (!createdCache) {
    tableCreated.write(true);
    createdCache = true;
    c.output(
        KV.of(
            tableAndSchemas.getKey(),
            buildCreateTableAction(tableAndSchemas, projectId, replicaDatasetId)));
    this.tablesCreated.inc();
  }

  // A merge statement is issued for every element.
  c.output(
      KV.of(
          tableAndSchemas.getKey(),
          buildMergeStatementAction(
              tableAndSchemas, projectId, changelogDatasetId, replicaDatasetId)));
  this.mergeStatementsIssued.inc();
}
 
Example 9
/**
 * Computes the initial restriction from the number of rows in {@code contents}.
 *
 * <p>The row count is the list size minus one and is stored in {@code numberOfRows}. The upper
 * bound of the range is the number of full batches plus 2 when rows are left over, or plus 1
 * otherwise.
 *
 * @param contents key and list of lines
 * @return the range from 1 to the computed upper bound
 */
@GetInitialRestriction
public OffsetRange getInitialRestriction(KV<String, List<String>> contents) {
  this.numberOfRows = contents.getValue().size() - 1;

  int fullBatches = this.numberOfRows / this.batchSize.get().intValue();
  boolean hasLeftoverRows = this.numberOfRows % this.batchSize.get().intValue() > 0;
  int totalSplit = fullBatches + (hasLeftoverRows ? 2 : 1);

  LOG.info("Initial Restriction range from 1 to: {}", totalSplit);
  return new OffsetRange(1, totalSplit);
}
 
Example 10
Source Project: DataflowTemplates   Source File: BulkDecompressor.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Decompresses the input file and emits the output file name, or emits a dead-letter record when
 * the file cannot be handled.
 *
 * <p>A file whose name does not match a known compression is sent to {@code DEADLETTER_TAG} with
 * a formatted error message. An {@link IOException} during decompression is logged with its
 * stack trace and also sent to {@code DEADLETTER_TAG} together with the exception message.
 *
 * @param context context supplying the matched file and receiving the outputs
 */
@ProcessElement
public void processElement(ProcessContext context) {
  ResourceId inputFile = context.element().resourceId();

  // Output a record to the failure file if the file doesn't match a known compression.
  if (!Compression.AUTO.isCompressed(inputFile.toString())) {
    String errorMsg =
        String.format(UNCOMPRESSED_ERROR_MSG, inputFile.toString(), SUPPORTED_COMPRESSIONS);

    context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), errorMsg));
  } else {
    try {
      ResourceId outputFile = decompress(inputFile);
      context.output(outputFile.toString());
    } catch (IOException e) {
      // Pass the exception itself so the stack trace is logged, not only the message.
      LOG.error(e.getMessage(), e);
      context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), e.getMessage()));
    }
  }
}
 
Example 11
Source Project: deployment-examples   Source File: StatefulTeamScoreTest.java    License: MIT License 6 votes vote down vote up
/**
 * Verifies that {@link UpdateTeamScoreFn}, a {@link org.apache.beam.sdk.transforms.DoFn}, emits
 * the expected scores when every event belongs to a single team.
 */
@Test
public void testScoreUpdatesOneTeam() {
  // Five events for the red team, spaced ten seconds apart, then the watermark goes to infinity.
  TestStream<KV<String, GameActionInfo>> redTeamEvents =
      TestStream.create(KvCoder.of(StringUtf8Coder.of(), AvroCoder.of(GameActionInfo.class)))
          .advanceWatermarkTo(baseTime)
          .addElements(
              event(TestUser.RED_TWO, 99, Duration.standardSeconds(10)),
              event(TestUser.RED_ONE, 1, Duration.standardSeconds(20)),
              event(TestUser.RED_ONE, 0, Duration.standardSeconds(30)),
              event(TestUser.RED_TWO, 100, Duration.standardSeconds(40)),
              event(TestUser.RED_TWO, 201, Duration.standardSeconds(50)))
          .advanceWatermarkToInfinity();

  PCollection<KV<String, Integer>> emittedScores =
      p.apply(redTeamEvents).apply(ParDo.of(new UpdateTeamScoreFn(100)));

  String team = TestUser.RED_ONE.getTeam();

  // Expected outputs are the running totals 100, 200 and 401.
  PAssert.that(emittedScores)
      .inWindow(GlobalWindow.INSTANCE)
      .containsInAnyOrder(KV.of(team, 100), KV.of(team, 200), KV.of(team, 401));

  p.run().waitUntilFinish();
}
 
Example 12
Source Project: feast   Source File: RedisFeatureSink.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Validates the sink configuration and prepares the feature set specs for writing.
 *
 * <p>When a single-node Redis config is present, a probe connection is attempted so that an
 * unreachable Redis fails fast; the probe client is always shut down afterwards. When neither a
 * Redis config nor a Redis cluster config is present, the call fails.
 *
 * @param featureSetSpecs feature set references paired with their specs
 * @return the feature set references (the keys of {@code featureSetSpecs})
 * @throws RuntimeException if the probe connection fails (with the connection exception as
 *     cause) or if no Redis configuration is provided
 */
@Override
public PCollection<FeatureSetReference> prepareWrite(
    PCollection<KV<FeatureSetReference, FeatureSetProto.FeatureSetSpec>> featureSetSpecs) {
  if (getRedisConfig() != null) {
    RedisClient redisClient =
        RedisClient.create(
            RedisURI.create(getRedisConfig().getHost(), getRedisConfig().getPort()));
    try {
      redisClient.connect();
    } catch (RedisConnectionException e) {
      // Keep the original exception as the cause so the underlying reason is not lost.
      throw new RuntimeException(
          String.format(
              "Failed to connect to Redis at host: '%s' port: '%d'. Please check that your Redis is running and accessible from Feast.",
              getRedisConfig().getHost(), getRedisConfig().getPort()),
          e);
    } finally {
      // Release the probe client's resources whether or not the connection succeeded.
      redisClient.shutdown();
    }
  } else if (getRedisClusterConfig() == null) {
    throw new RuntimeException(
        "At least one RedisConfig or RedisClusterConfig must be provided to Redis Sink");
  }
  specsView = featureSetSpecs.apply(ParDo.of(new ReferenceToString())).apply(View.asMultimap());
  return featureSetSpecs.apply(Keys.create());
}
 
Example 13
Source Project: streamingbook   Source File: BeamModel.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Formats one element, together with its window, timestamp and pane details, as a single line
 * of text and emits it.
 *
 * @param kv the key/value element
 * @param timestamp the element timestamp
 * @param window the window the element belongs to
 * @param pane pane information for the element
 * @param output receiver for the formatted line
 */
@ProcessElement
public void processElement(@Element KV<String, Integer> kv,
                           @Timestamp Instant timestamp,
                           BoundedWindow window,
                           PaneInfo pane,
                           OutputReceiver<String> output) {
    String summary = String.format(
        "%s: %s:%-2d %s %-7s index=%d",
        Utils.formatWindow(window), kv.getKey(), kv.getValue(),
        Utils.formatTime(timestamp), pane.getTiming(), pane.getIndex());
    StringBuilder line = new StringBuilder(summary);

    // The on-time index is only appended when it is not negative.
    if (pane.getNonSpeculativeIndex() > -1) {
        line.append(" onTimeIndex=").append(pane.getNonSpeculativeIndex());
    }
    if (pane.isFirst()) {
        line.append(" isFirst");
    }
    if (pane.isLast()) {
        line.append(" isLast");
    }
    output.output(line.toString());
}
 
Example 14
Source Project: DataflowTemplates   Source File: ExportTransform.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds a manifest for the files of one table and emits it as JSON keyed by the element's key.
 *
 * <p>Elements whose key equals {@code EMPTY_EXPORT_FILE} are skipped. The first file name
 * decides whether the files are treated as GCS paths or as local paths.
 *
 * @param c context supplying the key and its file names, and receiving the JSON manifest
 */
@ProcessElement
public void processElement(ProcessContext c) {
  if (Objects.equals(c.element().getKey(), EMPTY_EXPORT_FILE)) {
    return;
  }

  Iterable<String> fileNames = c.element().getValue();
  Iterator<String> first = fileNames.iterator();
  // Only the first file name is inspected; an empty iterable is treated as local.
  boolean onGcs = first.hasNext() && GcsPath.GCS_URI.matcher(first.next()).matches();

  TableManifest manifest;
  if (!onGcs) {
    Iterable<Path> localPaths = Iterables.transform(fileNames, name -> Paths.get(name));
    manifest = buildLocalManifest(localPaths);
  } else {
    Iterable<GcsPath> gcsPaths = Iterables.transform(fileNames, name -> GcsPath.fromUri(name));
    manifest = buildGcsManifest(c, gcsPaths);
  }

  try {
    String json = JsonFormat.printer().print(manifest);
    c.output(KV.of(c.element().getKey(), json));
  } catch (InvalidProtocolBufferException e) {
    throw new RuntimeException(e);
  }
}
 
Example 15
Source Project: kettle-beam   Source File: KettleRowToKVStringStringFn.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Extracts the key and value fields of a {@link KettleRow} as strings and emits them as a
 * {@code KV<String,String>}.
 *
 * <p>The fields are read at {@code keyIndex} and {@code valueIndex} through {@code rowMeta}. Any
 * failure is counted, logged and rethrown as a {@link RuntimeException} with the original
 * exception as its cause.
 *
 * @param processContext context supplying the input row and receiving the output pair
 */
@ProcessElement
public void processElement( ProcessContext processContext ) {
  try {
    KettleRow inputRow = processContext.element();
    inputCounter.inc();

    processContext.output(
      KV.of(
        rowMeta.getString( inputRow.getRow(), keyIndex ),
        rowMeta.getString( inputRow.getRow(), valueIndex ) ) );
    writtenCounter.inc();

  } catch ( Exception failure ) {
    numErrors.inc();
    final String message = "Error in KettleRow to KV<String,String> function";
    LOG.error( message, failure );
    throw new RuntimeException( message, failure );
  }
}
 
Example 16
Source Project: DataflowTemplates   Source File: BigQueryMergerTest.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Checks that a {@code MergeInfo} value, once keyed by its replica table, passes through
 * {@code TriggerPerKeyOnFixedIntervals} and comes out as the same key/value pair.
 */
@Test
public void testAutoValueMergeInfoClass() throws Exception {
  MergeInfo mergeInfo =
      MergeInfo.create(
          TIMESTAMP_META_FIELD,
          DELETED_META_FIELD,
          TABLE_1,
          TABLE_2,
          FULL_COLUMN_LIST,
          PRIMARY_KEY_COLUMNS);

  // Key each MergeInfo by its replica table before applying the per-key trigger.
  PCollection<KV<String, MergeInfo>> keyedByReplicaTable =
      pipeline
          .apply(Create.of(mergeInfo))
          .apply(
              WithKeys.<String, MergeInfo>of(mi -> mi.getReplicaTable())
                  .withKeyType(TypeDescriptors.strings()));

  PCollection<KV<String, MergeInfo>> triggered =
      keyedByReplicaTable.apply(
          new TriggerPerKeyOnFixedIntervals<>(Duration.standardMinutes(WINDOW_SIZE_MINUTES)));

  PAssert.that(triggered).containsInAnyOrder(KV.of(mergeInfo.getReplicaTable(), mergeInfo));
  pipeline.run().waitUntilFinish();
}
 
Example 17
Source Project: DataflowTemplates   Source File: KafkaIO.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Validates the write configuration and attaches the matching Kafka sink to {@code input}.
 *
 * <p>Bootstrap servers, topic, key serializer and value serializer are all required. When
 * {@code isEOS()} is true the exactly-once sink is used; otherwise a {@code KafkaWriter} is
 * applied through {@code ParDo}.
 *
 * @param input the key/value records to write
 * @return a {@link PDone} in the pipeline of {@code input}
 */
@Override
public PDone expand(PCollection<KV<K, V>> input) {
  Object bootstrapServers = getProducerConfig().get(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG);
  checkArgument(bootstrapServers != null, "withBootstrapServers() is required");
  checkArgument(getTopic() != null, "withTopic() is required");
  checkArgument(getKeySerializer() != null, "withKeySerializer() is required");
  checkArgument(getValueSerializer() != null, "withValueSerializer() is required");

  if (!isEOS()) {
    input.apply(ParDo.of(new KafkaWriter<>(this)));
  } else {
    KafkaExactlyOnceSink.ensureEOSSupport();

    // TODO: Verify that the group_id does not have existing state stored on Kafka unless
    //       this is an upgrade. This avoids issues with simple mistake of reusing group_id
    //       across multiple runs or across multiple jobs. This is checked when the sink
    //       transform initializes while processing the output. It might be better to
    //       check here to catch common mistake.

    input.apply(new KafkaExactlyOnceSink<>(this));
  }
  return PDone.in(input.getPipeline());
}
 
Example 18
Source Project: incubator-nemo   Source File: WordCount.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds the word count Beam pipeline.
 *
 * <p>Each input line is split on runs of spaces; the first two tokens joined by {@code #} form
 * the key and the third token is parsed as the count. Counts are summed per key and written out
 * as {@code key: sum} lines.
 *
 * @param options options for the pipeline.
 * @param inputFilePath the input file path.
 * @param outputFilePath the output file path.
 * @return the generated pipeline.
 */
static Pipeline generateWordCountPipeline(final PipelineOptions options,
                                          final String inputFilePath,
                                          final String outputFilePath) {
  final Pipeline pipeline = Pipeline.create(options);

  // Parse every line into (documentId, count) and sum the counts per document id.
  final PCollection<KV<String, Long>> summedCounts = GenericSourceSink.read(pipeline, inputFilePath)
    .apply(MapElements.<String, KV<String, Long>>via(new SimpleFunction<String, KV<String, Long>>() {
      @Override
      public KV<String, Long> apply(final String inputLine) {
        final String[] tokens = inputLine.split(" +");
        return KV.of(tokens[0] + "#" + tokens[1], Long.parseLong(tokens[2]));
      }
    }))
    .apply(Sum.longsPerKey());

  // Render every summed pair as a text line.
  final PCollection<String> formatted = summedCounts
    .apply(MapElements.<KV<String, Long>, String>via(new SimpleFunction<KV<String, Long>, String>() {
      @Override
      public String apply(final KV<String, Long> entry) {
        return entry.getKey() + ": " + entry.getValue();
      }
    }));

  GenericSourceSink.write(formatted, outputFilePath);
  return pipeline;
}
 
Example 19
Source Project: DataflowTemplates   Source File: SplunkEventWriter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Buffers the incoming event, updates the buffered-event count and flush timer, and flushes
 * once the count reaches {@code batchCount}.
 *
 * @param input key and event to buffer
 * @param receiver receiver for write errors, passed on to {@code flush}
 * @param window the element's window
 * @param bufferState bag state holding the buffered events
 * @param countState value state holding the number of buffered events
 * @param timer timer that is re-armed relative to now on every element
 * @throws IOException propagated from {@code flush}
 */
@ProcessElement
public void processElement(
    @Element KV<Integer, SplunkEvent> input,
    OutputReceiver<SplunkWriteError> receiver,
    BoundedWindow window,
    @StateId(BUFFER_STATE_NAME) BagState<SplunkEvent> bufferState,
    @StateId(COUNT_STATE_NAME) ValueState<Long> countState,
    @TimerId(TIME_ID_NAME) Timer timer) throws IOException {

  // A missing count is treated as zero.
  Long bufferedSoFar = MoreObjects.<Long>firstNonNull(countState.read(), 0L);

  INPUT_COUNTER.inc();
  bufferState.add(input.getValue());

  Long count = bufferedSoFar + 1;
  countState.write(count);
  timer.offset(Duration.standardSeconds(DEFAULT_FLUSH_DELAY)).setRelative();

  if (count >= batchCount) {
    LOG.info("Flushing batch of {} events", count);
    flush(receiver, bufferState, countState);
  }
}
 
Example 20
Source Project: deployment-examples   Source File: LeaderBoard.java    License: MIT License 5 votes vote down vote up
/**
 * Creates the map of field definitions used to write pipeline output to BigQuery when writing
 * user score sums.
 *
 * <p>Starts from the map returned by {@code configureBigQueryWrite()} and adds a
 * {@code processing_time} STRING field whose value is the current instant, formatted with
 * {@code GameConstants.DATE_TIME_FORMATTER}.
 *
 * @return the field-name to field-info map, including {@code processing_time}
 */
protected static Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>>
    configureGlobalWindowBigQueryWrite() {

  Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>> fieldsByName =
      configureBigQueryWrite();

  WriteToBigQuery.FieldInfo<KV<String, Integer>> processingTimeField =
      new WriteToBigQuery.FieldInfo<>(
          "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now()));
  fieldsByName.put("processing_time", processingTimeField);

  return fieldsByName;
}
 
Example 21
Source Project: DataflowTemplates   Source File: KafkaRecordCoder.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Returns the headers of {@code record} as key/value pairs.
 *
 * <p>When {@code ConsumerSpEL.hasHeaders} is false, an empty list is returned without touching
 * the record.
 *
 * @param record the record whose headers are copied
 * @return one pair per header, or an empty list
 */
private Iterable<KV<String, byte[]>> toIterable(KafkaRecord record) {
  if (ConsumerSpEL.hasHeaders) {
    List<KV<String, byte[]>> headerPairs = new ArrayList<>();
    for (Header h : record.getHeaders()) {
      headerPairs.add(KV.of(h.key(), h.value()));
    }
    return headerPairs;
  }
  return Collections.emptyList();
}
 
Example 22
/**
 * Feeds three table/schema elements (table 1 twice, table 2 once) through
 * {@code MergeStatementBuildingFn} and checks that one create action and one merge action per
 * input element are emitted for each table.
 */
@Test
public void testTablesBuiltInPipeline() {
  Pipeline pipeline = Pipeline.create();

  PCollection<KV<String, KV<Schema, Schema>>> tableSchemas =
      pipeline.apply(
          Create.of(
              KV.of(TABLE_1_NAME, KV.of(TABLE_1_PK_SCHEMA, TABLE_1_SCHEMA)),
              KV.of(TABLE_2_NAME, KV.of(TABLE_2_PK_SCHEMA, TABLE_2_SCHEMA)),
              KV.of(TABLE_1_NAME, KV.of(TABLE_1_PK_SCHEMA, TABLE_1_SCHEMA))));

  PCollection<KV<String, BigQueryAction>> actions =
      tableSchemas.apply(
          ParDo.of(
              new MergeStatementBuildingFn(CHANGELOG_DATASET_ID, REPLICA_DATASET_ID, PROJECT_ID)));

  // Count the create-table actions per table.
  PCollection<KV<String, Long>> createCounts =
      actions
          .apply(
              "GetCreateActions",
              Filter.by(input -> input.getValue().action.equals(BigQueryAction.CREATE_TABLE)))
          .apply("CountCreateActions", Count.perKey());

  // Count the merge-statement actions per table.
  PCollection<KV<String, Long>> mergeCounts =
      actions
          .apply(
              "GetMergeActions",
              Filter.by(input -> input.getValue().action.equals(BigQueryAction.STATEMENT)))
          .apply("CountMergeActions", Count.perKey());

  PAssert.that(createCounts)
      .containsInAnyOrder(KV.of(TABLE_1_NAME, 1L), KV.of(TABLE_2_NAME, 1L));

  PAssert.that(mergeCounts)
      .containsInAnyOrder(KV.of(TABLE_1_NAME, 2L), KV.of(TABLE_2_NAME, 1L));

  pipeline.run().waitUntilFinish();
}
 
Example 23
Source Project: dlp-dataflow-deidentification   Source File: S3Import.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Splits the given range with {@code range.split(1, 1)} and emits every resulting sub-range.
 *
 * @param file the element the restriction belongs to (not inspected here)
 * @param range the restriction to split
 * @param out receiver for the sub-ranges
 */
@SplitRestriction
public void splitRestriction(
    KV<String, ReadableFile> file, OffsetRange range, OutputReceiver<OffsetRange> out) {
  range.split(1, 1).forEach(subRange -> out.output(subRange));
}
 
Example 24
Source Project: incubator-nemo   Source File: CreateViewTransform.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Appends the element's inner value to the list kept for each window the element belongs to,
 * creating the list on first use.
 *
 * @param element windowed key/value pair; only the value of the pair is stored
 */
@Override
public void onData(final WindowedValue<KV<?, I>> element) {
  // The key of element is always null (beam's semantic)
  // because view is a globally materialized data regardless of key
  for (final BoundedWindow window : element.getWindows()) {
    windowListMap
      .computeIfAbsent(window, unused -> new ArrayList<>())
      .add(element.getValue().getValue());
  }
}
 
Example 25
Source Project: DataflowTemplates   Source File: ImportTransform.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Validates the files listed in a table's manifest, using the GCS check when the import
 * directory matches {@code GcsPath.GCS_URI} and the local check otherwise.
 *
 * @param c context supplying the table name and its manifest
 */
@ProcessElement
public void processElement(ProcessContext c) {
  KV<String, TableManifest> entry = c.element();
  String tableName = entry.getKey();
  TableManifest tableManifest = entry.getValue();

  if (GcsPath.GCS_URI.matcher(importDirectory.get()).matches()) {
    validateGcsFiles(c, tableName, tableManifest);
    return;
  }
  validateLocalFiles(c, tableName, tableManifest);
}
 
Example 26
Source Project: incubator-nemo   Source File: BeamKeyExtractor.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the key of an element, unwrapping a {@code WindowedValue} first when present.
 *
 * <p>If the (unwrapped) value is a {@code KV}, its key is returned, with a null key mapped to
 * {@code 0}. Otherwise the original {@code element} is returned as passed in, not the unwrapped
 * value.
 *
 * @param element the element to extract the key from
 * @return the key, {@code 0} for a null key, or {@code element} itself when it holds no KV
 */
@Override
public Object extractKey(final Object element) {
  Object candidate = element;
  if (element instanceof WindowedValue) {
    candidate = ((WindowedValue) element).getValue();
  }

  if (!(candidate instanceof KV)) {
    return element;
  }

  // Handle null keys, since Beam allows KV with null keys.
  final Object key = ((KV) candidate).getKey();
  if (key == null) {
    return 0;
  }
  return key;
}
 
Example 27
/**
 * Creates the initial restriction describing the complete work for one CSV file.
 *
 * <p>The work is measured in rows: the file's lines are counted, the first line is assumed to be
 * a header, and a count below one is raised to one. The number of splits follows from the row
 * count and the configured batch size. The upper bound of the returned range is the number of
 * full batches plus 2 when rows are left over, or plus 1 otherwise. For example, 100 rows with a
 * batch size of 15 give 6 full batches and a remainder, so the range runs from 1 to 8.
 *
 * @param csvFile key and readable CSV file
 * @return the range from 1 to the computed upper bound
 * @throws IOException if reading or closing the file fails
 */
@GetInitialRestriction
public OffsetRange getInitialRestriction(KV<String, ReadableFile> csvFile) throws IOException {
  int totalSplit = 0;
  try (BufferedReader reader = getReader(csvFile.getValue())) {
    // Assume the first row is a header; never work with fewer than one data row.
    int dataRows = Math.max(1, (int) reader.lines().count() - 1);

    int fullBatches = dataRows / batchSize.get().intValue();
    boolean hasLeftoverRows = dataRows % batchSize.get().intValue() > 0;
    totalSplit = fullBatches + (hasLeftoverRows ? 2 : 1);
  }

  LOG.debug("Initial Restriction range from 1 to: {}", totalSplit);
  return new OffsetRange(1, totalSplit);
}
 
Example 28
Source Project: DataflowTemplates   Source File: WriteToGCSText.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Writes the values of the given key/value records as windowed text files.
 *
 * <p>Keys are discarded. File names come from a {@link WindowedFilenamePolicy} built from the
 * output directory, the filename prefix, the shard template and the text file suffix.
 *
 * @param kafkaRecords key/value records whose values are written
 * @return the result of the text write
 */
@Override
public PDone expand(PCollection<KV<String, String>> kafkaRecords) {
  // Keep only the value of every KV<String, String> record.
  PCollection<String> values =
      kafkaRecords.apply(
          "Converting to String",
          ParDo.of(
              new DoFn<KV<String, String>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  c.output(c.element().getValue());
                }
              }));

  // Write as text files with TextIO:
  //  - WindowedFilenamePolicy specifies the file path for writing the file,
  //  - withTempDirectory sets the base directory used to generate temporary files,
  //  - withNumShards sets the number of shards passed by the user.
  return values.apply(
      "Writing as Text",
      TextIO.write()
          .to(
              new WindowedFilenamePolicy(
                  outputDirectory(),
                  outputFilenamePrefix(),
                  WriteToGCSUtility.SHARD_TEMPLATE,
                  WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.TEXT)))
          .withTempDirectory(
              FileBasedSink.convertToFileResourceIfPossible(tempLocation())
                  .getCurrentDirectory())
          .withWindowedWrites()
          .withNumShards(numShards()));
}
 
Example 29
Source Project: incubator-nemo   Source File: EmptyComponents.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the key of an element.
 *
 * <p>For a {@code KV} the key is returned, with a null key mapped to {@code 0}; any other
 * element is returned unchanged.
 *
 * @param element the element to extract the key from
 * @return the key, {@code 0} for a null key, or {@code element} itself when it is not a KV
 */
@Override
public Object extractKey(final Object element) {
  if (!(element instanceof KV)) {
    return element;
  }

  // Handle null keys, since Beam allows KV with null keys.
  final Object key = ((KV) element).getKey();
  if (key == null) {
    return 0;
  }
  return key;
}
 
Example 30
Source Project: DataflowTemplates   Source File: BigQueryMappers.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Pairs the table id derived from {@code input} with the table row derived from it.
 *
 * @param input the incoming row
 * @return the result of {@code getTableId(input)} keyed to the result of
 *     {@code getTableRow(input)}
 */
@Override
public KV<TableId, TableRow> getOutputObject(TableRow input) {
  // Arguments are evaluated left to right, so the table id is resolved before the row.
  return KV.of(getTableId(input), getTableRow(input));
}