org.apache.beam.sdk.values.KV Java Examples

The following examples show how to use org.apache.beam.sdk.values.KV. Each example is taken from an open-source project; the source file and project license are noted above each snippet.
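
Before the project examples, here is a minimal, self-contained sketch of the KV API itself (illustrative only, not taken from any of the projects below): KV.of builds an immutable key/value pair, getKey and getValue read it back, and KvCoder tells Beam how to encode KV elements in a PCollection.

import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarLongCoder;
import org.apache.beam.sdk.values.KV;

KV<String, Long> pair = KV.of("user-1", 42L); // an immutable pair; Beam allows null keys
String key = pair.getKey();                   // "user-1"
Long value = pair.getValue();                 // 42L
// A PCollection of KVs is typically encoded with a KvCoder:
KvCoder<String, Long> coder = KvCoder.of(StringUtf8Coder.of(), VarLongCoder.of());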
Example #1
Source File: ChangelogTableDynamicDestinations.java    From DataflowTemplates with Apache License 2.0
@Override
public TableSchema getSchema(String targetTable) {
  Map<String, KV<Schema, Schema>> schemaMap = this.sideInput(schemaMapView);
  KV<Schema, Schema> keyAndValueSchemas = schemaMap.get(targetTable);

  TableFieldSchema rowSchema = new TableFieldSchema()
      .setName("fullRecord")
      .setType("RECORD")
      .setMode("NULLABLE")   // This field is null for deletions
      .setFields(BigQueryUtils.toTableSchema(keyAndValueSchemas.getValue()).getFields());

  TableFieldSchema pkSchema = new TableFieldSchema()
      .setName("primaryKey")
      .setType("RECORD")
      .setFields(BigQueryUtils.toTableSchema(keyAndValueSchemas.getKey()).getFields());

  TableSchema changelogTableSchema = new TableSchema()
      .setFields(Arrays.asList(
          rowSchema,
          pkSchema,
          new TableFieldSchema().setName("operation").setType("STRING"),
          new TableFieldSchema().setName("timestampMs").setType("INT64"),
          new TableFieldSchema().setName("tableName").setType("STRING")));

  return changelogTableSchema;
}
 
Example #2
Source File: KVStringStringToKettleRowFn.java    From kettle-beam with Apache License 2.0
@ProcessElement
public void processElement( ProcessContext processContext ) {
  try {

    KV<String,String> kv = processContext.element();
    inputCounter.inc();

    Object[] outputRow = RowDataUtil.allocateRowData( rowMeta.size() );
    outputRow[ 0 ] = kv.getKey(); // String
    outputRow[ 1 ] = kv.getValue(); // String

    processContext.output( new KettleRow( outputRow ) );
    writtenCounter.inc();

  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error in KV<Long,String> to Kettle Row conversion function", e );
    throw new RuntimeException( "Error in KV<Long,String> to Kettle Row conversion function", e );
  }
}
 
Example #3
Source File: S3Import.java    From dlp-dataflow-deidentification with Apache License 2.0
@GetInitialRestriction
public OffsetRange getInitialRestriction(KV<String, ReadableFile> file) throws IOException {
  long totalBytes = file.getValue().getMetadata().sizeBytes();
  long totalSplit = 0;
  if (totalBytes < BATCH_SIZE) {
    totalSplit = 2;
  } else {
    totalSplit = totalSplit + (totalBytes / BATCH_SIZE);
    long remaining = totalBytes % BATCH_SIZE;
    if (remaining > 0) {
      totalSplit = totalSplit + 2;
    }
  }

  LOG.debug(
      "Total Bytes {} for File {} -Initial Restriction range from 1 to: {}",
      totalBytes,
      file.getKey(),
      totalSplit);
  return new OffsetRange(1, totalSplit);
}
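
To make the arithmetic concrete, assuming for illustration a BATCH_SIZE of 100 MB: a 40 MB file falls below BATCH_SIZE and gets OffsetRange(1, 2), while a 250 MB file gives totalSplit = 250 / 100 = 2 plus 2 more for the 50 MB remainder, i.e. OffsetRange(1, 4).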
 
Example #4
Source File: KafkaToBigQuery.java    From DataflowTemplates with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext context) {
  FailsafeElement<KV<String, String>, String> failsafeElement = context.element();
  final KV<String, String> message = failsafeElement.getOriginalPayload();

  // Format the timestamp for insertion
  String timestamp =
      TIMESTAMP_FORMATTER.print(context.timestamp().toDateTime(DateTimeZone.UTC));

  // Build the table row
  final TableRow failedRow =
      new TableRow()
          .set("timestamp", timestamp)
          .set("errorMessage", failsafeElement.getErrorMessage())
          .set("stacktrace", failsafeElement.getStacktrace());

  // Build the payload string, substituting empty strings for a missing key or value.
  failedRow.set(
      "payloadString",
      "key: "
          + (message.getKey() == null ? "" : message.getKey())
          + " value: "
          + (message.getValue() == null ? "" : message.getValue()));
  context.output(failedRow);
}
 
Example #5
Source File: DLPTextToBigQueryStreaming.java    From DataflowTemplates with Apache License 2.0
@Override
public TableSchema getSchema(KV<String, TableRow> destination) {

  TableRow bqRow = destination.getValue();
  TableSchema schema = new TableSchema();
  List<TableFieldSchema> fields = new ArrayList<>();
  List<TableCell> cells = bqRow.getF();
  for (int i = 0; i < cells.size(); i++) {
    Map<String, Object> object = cells.get(i);
    String header = object.keySet().iterator().next();
    // Currently all BQ data types are set to String.
    fields.add(new TableFieldSchema().setName(checkHeaderName(header)).setType("STRING"));
  }

  schema.setFields(fields);
  return schema;
}
 
Example #6
Source File: DLPTextToBigQueryStreaming.java    From dlp-dataflow-deidentification with Apache License 2.0
@Override
public TableSchema getSchema(KV<String, TableRow> destination) {
  TableRow bqRow = destination.getValue();
  TableSchema schema = new TableSchema();
  List<TableFieldSchema> fields = new ArrayList<>();
  List<TableCell> cells = bqRow.getF();
  for (int i = 0; i < cells.size(); i++) {
    Map<String, Object> object = cells.get(i);
    String header = object.keySet().iterator().next();
    // Currently all BQ data types are set to String.
    fields.add(new TableFieldSchema().setName(checkHeaderName(header)).setType("STRING"));
  }

  schema.setFields(fields);
  return schema;
}
 
Example #7
Source File: ExampleEchoPipelineTest.java    From deployment-examples with MIT License
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  try {
    // Our Library takes a single command in position 0 which it will echo back in the result
    SubProcessCommandLineArgs commands = new SubProcessCommandLineArgs();
    Command command = new Command(0, String.valueOf(c.element().getValue()));
    commands.putCommand(command);

    // The ProcessingKernel deals with the execution of the process
    SubProcessKernel kernel = new SubProcessKernel(configuration, binaryName);

    // Run the command and work through the results
    List<String> results = kernel.exec(commands);
    for (String s : results) {
      c.output(KV.of(c.element().getKey(), s));
    }
  } catch (Exception ex) {
    LOG.error("Error processing element ", ex);
    throw ex;
  }
}
 
Example #8
Source File: MergeStatementBuildingFn.java    From DataflowTemplates with Apache License 2.0
@ProcessElement
public void processElement(
    ProcessContext c,
    @StateId("table_created") ValueState<Boolean> tableCreated) {
  KV<String, KV<Schema, Schema>> tableAndSchemas = c.element();

  // Start by actually fetching whether we created the table or not.
  if (!createdCache) {
    Boolean actuallyCreated = firstNonNull(tableCreated.read(), false);
    createdCache = actuallyCreated;
  }
  // Once we know for sure if we created the table, then we act on creating (or not).
  if (!createdCache) {
    tableCreated.write(true);
    createdCache = true;
    c.output(KV.of(tableAndSchemas.getKey(),
        buildCreateTableAction(tableAndSchemas, projectId, replicaDatasetId)));
    this.tablesCreated.inc();
  }

  c.output(KV.of(tableAndSchemas.getKey(),
      buildMergeStatementAction(
          tableAndSchemas, projectId, changelogDatasetId, replicaDatasetId)));
  this.mergeStatementsIssued.inc();
}
 
Example #9
Source File: CSVContentProcessorDoFn.java    From dlp-dataflow-deidentification with Apache License 2.0
@GetInitialRestriction
public OffsetRange getInitialRestriction(KV<String, List<String>> contents) {

  this.numberOfRows = contents.getValue().size() - 1;
  int totalSplit = 0;
  totalSplit = this.numberOfRows / this.batchSize.get().intValue();
  int remaining = this.numberOfRows % this.batchSize.get().intValue();
  if (remaining > 0) {
    totalSplit = totalSplit + 2;

  } else {
    totalSplit = totalSplit + 1;
  }
  LOG.info("Initial Restriction range from 1 to: {}", totalSplit);
  return new OffsetRange(1, totalSplit);
}
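
For example, with a batch size of 15 and 101 lines in contents (so numberOfRows = 100): totalSplit = 100 / 15 = 6, and the remainder of 10 adds 2 more, giving OffsetRange(1, 8), i.e. seven splits.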
 
Example #10
Source File: BulkDecompressor.java    From DataflowTemplates with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext context) {
  ResourceId inputFile = context.element().resourceId();

  // Output a record to the failure file if the file doesn't match a known compression.
  if (!Compression.AUTO.isCompressed(inputFile.toString())) {
    String errorMsg =
        String.format(UNCOMPRESSED_ERROR_MSG, inputFile.toString(), SUPPORTED_COMPRESSIONS);

    context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), errorMsg));
  } else {
    try {
      ResourceId outputFile = decompress(inputFile);
      context.output(outputFile.toString());
    } catch (IOException e) {
      LOG.error(e.getMessage());
      context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), e.getMessage()));
    }
  }
}
 
Example #11
Source File: StatefulTeamScoreTest.java    From deployment-examples with MIT License
/**
 * Tests that {@link UpdateTeamScoreFn}, a stateful {@link org.apache.beam.sdk.transforms.DoFn},
 * outputs correctly for one team.
 */
@Test
public void testScoreUpdatesOneTeam() {

  TestStream<KV<String, GameActionInfo>> createEvents =
      TestStream.create(KvCoder.of(StringUtf8Coder.of(), AvroCoder.of(GameActionInfo.class)))
          .advanceWatermarkTo(baseTime)
          .addElements(
              event(TestUser.RED_TWO, 99, Duration.standardSeconds(10)),
              event(TestUser.RED_ONE, 1, Duration.standardSeconds(20)),
              event(TestUser.RED_ONE, 0, Duration.standardSeconds(30)),
              event(TestUser.RED_TWO, 100, Duration.standardSeconds(40)),
              event(TestUser.RED_TWO, 201, Duration.standardSeconds(50)))
          .advanceWatermarkToInfinity();

  PCollection<KV<String, Integer>> teamScores =
      p.apply(createEvents).apply(ParDo.of(new UpdateTeamScoreFn(100)));

  String redTeam = TestUser.RED_ONE.getTeam();

  PAssert.that(teamScores)
      .inWindow(GlobalWindow.INSTANCE)
      .containsInAnyOrder(KV.of(redTeam, 100), KV.of(redTeam, 200), KV.of(redTeam, 401));

  p.run().waitUntilFinish();
}
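
The expected values follow if UpdateTeamScoreFn(100), whose source is not shown here, emits the team's running total each time it crosses another multiple of 100: 99 + 1 = 100, then + 0 + 100 = 200, then + 201 = 401.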
 
Example #12
Source File: RedisFeatureSink.java    From feast with Apache License 2.0
@Override
public PCollection<FeatureSetReference> prepareWrite(
    PCollection<KV<FeatureSetReference, FeatureSetProto.FeatureSetSpec>> featureSetSpecs) {
  if (getRedisConfig() != null) {
    RedisClient redisClient =
        RedisClient.create(
            RedisURI.create(getRedisConfig().getHost(), getRedisConfig().getPort()));
    try {
      redisClient.connect();
    } catch (RedisConnectionException e) {
      throw new RuntimeException(
          String.format(
              "Failed to connect to Redis at host: '%s' port: '%d'. Please check that your Redis is running and accessible from Feast.",
              getRedisConfig().getHost(), getRedisConfig().getPort()),
          e);
    }
    redisClient.shutdown();
  } else if (getRedisClusterConfig() == null) {
    throw new RuntimeException(
        "At least one RedisConfig or RedisClusterConfig must be provided to Redis Sink");
  }
  specsView = featureSetSpecs.apply(ParDo.of(new ReferenceToString())).apply(View.asMultimap());
  return featureSetSpecs.apply(Keys.create());
}
 
Example #13
Source File: BeamModel.java    From streamingbook with Apache License 2.0
@ProcessElement
public void processElement(@Element KV<String, Integer> kv,
                           @Timestamp Instant timestamp,
                           BoundedWindow window,
                           PaneInfo pane,
                           OutputReceiver<String> output) {
    StringBuilder builder = new StringBuilder(String.format(
        "%s: %s:%-2d %s %-7s index=%d",
        Utils.formatWindow(window), kv.getKey(), kv.getValue(),
        Utils.formatTime(timestamp), pane.getTiming(), pane.getIndex()));
    if (pane.getNonSpeculativeIndex() > -1)
        builder.append(" onTimeIndex=" + pane.getNonSpeculativeIndex());
    if (pane.isFirst())
        builder.append(" isFirst");
    if (pane.isLast())
        builder.append(" isLast");
    output.output(builder.toString());
}
 
Example #14
Source File: ExportTransform.java    From DataflowTemplates with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c) {
  if (Objects.equals(c.element().getKey(), EMPTY_EXPORT_FILE)) {
    return;
  }
  Iterable<String> files = c.element().getValue();
  Iterator<String> it = files.iterator();
  boolean gcs = it.hasNext() && GcsPath.GCS_URI.matcher(it.next()).matches();
  TableManifest proto;
  if (gcs) {
    Iterable<GcsPath> gcsPaths = Iterables.transform(files, s -> GcsPath.fromUri(s));
    proto = buildGcsManifest(c, gcsPaths);
  } else {
    Iterable<Path> paths = Iterables.transform(files, s -> Paths.get(s));
    proto = buildLocalManifest(paths);
  }
  try {
    c.output(KV.of(c.element().getKey(), JsonFormat.printer().print(proto)));
  } catch (InvalidProtocolBufferException e) {
    throw new RuntimeException(e);
  }
}
 
Example #15
Source File: KettleRowToKVStringStringFn.java    From kettle-beam with Apache License 2.0
@ProcessElement
public void processElement( ProcessContext processContext ) {
  try {
    KettleRow kettleRow = processContext.element();
    inputCounter.inc();

    String key = rowMeta.getString(kettleRow.getRow(), keyIndex);
    String value = rowMeta.getString(kettleRow.getRow(), valueIndex);

    processContext.output( KV.of( key, value ) );
    writtenCounter.inc();

  } catch ( Exception e ) {
    numErrors.inc();
    LOG.error( "Error in KettleRow to KV<String,String> function", e );
    throw new RuntimeException( "Error in KettleRow to KV<String,String> function", e );
  }
}
 
Example #16
Source File: BigQueryMergerTest.java    From DataflowTemplates with Apache License 2.0
@Test
public void testAutoValueMergeInfoClass() throws Exception {
  MergeInfo mergeInfo =
      MergeInfo.create(
          TIMESTAMP_META_FIELD,
          DELETED_META_FIELD,
          TABLE_1,
          TABLE_2,
          FULL_COLUMN_LIST,
          PRIMARY_KEY_COLUMNS);

  PCollection<KV<String, MergeInfo>> result =
      pipeline
          .apply(Create.of(mergeInfo))
          .apply(
              WithKeys.<String, MergeInfo>of(mi -> mi.getReplicaTable())
                  .withKeyType(TypeDescriptors.strings()))
          .apply(
              new TriggerPerKeyOnFixedIntervals<>(Duration.standardMinutes(WINDOW_SIZE_MINUTES)));

  PAssert.that(result).containsInAnyOrder(KV.of(mergeInfo.getReplicaTable(), mergeInfo));
  pipeline.run().waitUntilFinish();
}
 
Example #17
Source File: KafkaIO.java    From DataflowTemplates with Apache License 2.0
@Override
public PDone expand(PCollection<KV<K, V>> input) {
  checkArgument(
      getProducerConfig().get(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG) != null,
      "withBootstrapServers() is required");
  checkArgument(getTopic() != null, "withTopic() is required");
  checkArgument(getKeySerializer() != null, "withKeySerializer() is required");
  checkArgument(getValueSerializer() != null, "withValueSerializer() is required");

  if (isEOS()) {
    KafkaExactlyOnceSink.ensureEOSSupport();

    // TODO: Verify that the group_id does not have existing state stored on Kafka unless
    //       this is an upgrade. This avoids issues with the simple mistake of reusing a
    //       group_id across multiple runs or across multiple jobs. This is checked when the
    //       sink transform initializes while processing the output; it might be better to
    //       check here to catch the mistake earlier.

    input.apply(new KafkaExactlyOnceSink<>(this));
  } else {
    input.apply(ParDo.of(new KafkaWriter<>(this)));
  }
  return PDone.in(input.getPipeline());
}
 
Example #18
Source File: WordCount.java    From incubator-nemo with Apache License 2.0
/**
 * Static method to generate the word count Beam pipeline.
 * @param options options for the pipeline.
 * @param inputFilePath the input file path.
 * @param outputFilePath the output file path.
 * @return the generated pipeline.
 */
static Pipeline generateWordCountPipeline(final PipelineOptions options,
                                                 final String inputFilePath, final String outputFilePath) {
  final Pipeline p = Pipeline.create(options);
  final PCollection<String> result = GenericSourceSink.read(p, inputFilePath)
    .apply(MapElements.<String, KV<String, Long>>via(new SimpleFunction<String, KV<String, Long>>() {
      @Override
      public KV<String, Long> apply(final String line) {
        final String[] words = line.split(" +");
        final String documentId = words[0] + "#" + words[1];
        final Long count = Long.parseLong(words[2]);
        return KV.of(documentId, count);
      }
    }))
    .apply(Sum.longsPerKey())
    .apply(MapElements.<KV<String, Long>, String>via(new SimpleFunction<KV<String, Long>, String>() {
      @Override
      public String apply(final KV<String, Long> kv) {
        return kv.getKey() + ": " + kv.getValue();
      }
    }));
  GenericSourceSink.write(result, outputFilePath);
  return p;
}
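
Despite the name, this pipeline keys by document rather than by word: an illustrative input line such as "doc1 part2 5" becomes KV.of("doc1#part2", 5L), Sum.longsPerKey() then adds the counts per documentId, and the final MapElements renders each result as, e.g., "doc1#part2: 12".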
 
Example #19
Source File: SplunkEventWriter.java    From DataflowTemplates with Apache License 2.0
@ProcessElement
public void processElement(
    @Element KV<Integer, SplunkEvent> input,
    OutputReceiver<SplunkWriteError> receiver,
    BoundedWindow window,
    @StateId(BUFFER_STATE_NAME) BagState<SplunkEvent> bufferState,
    @StateId(COUNT_STATE_NAME) ValueState<Long> countState,
    @TimerId(TIME_ID_NAME) Timer timer) throws IOException {

  Long count = MoreObjects.<Long>firstNonNull(countState.read(), 0L);
  SplunkEvent event = input.getValue();
  INPUT_COUNTER.inc();
  bufferState.add(event);
  count += 1;
  countState.write(count);
  timer.offset(Duration.standardSeconds(DEFAULT_FLUSH_DELAY)).setRelative();

  if (count >= batchCount) {

    LOG.info("Flushing batch of {} events", count);
    flush(receiver, bufferState, countState);
  }
}
 
Example #20
Source File: LeaderBoard.java    From deployment-examples with MIT License
/**
 * Create a map of information that describes how to write pipeline output to BigQuery. This map
 * is used to write user score sums.
 */
protected static Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>>
    configureGlobalWindowBigQueryWrite() {

  Map<String, WriteToBigQuery.FieldInfo<KV<String, Integer>>> tableConfigure =
      configureBigQueryWrite();
  tableConfigure.put(
      "processing_time",
      new WriteToBigQuery.FieldInfo<>(
          "STRING", (c, w) -> GameConstants.DATE_TIME_FORMATTER.print(Instant.now())));
  return tableConfigure;
}
 
Example #21
Source File: KafkaRecordCoder.java    From DataflowTemplates with Apache License 2.0
private Iterable<KV<String, byte[]>> toIterable(KafkaRecord record) {
  if (!ConsumerSpEL.hasHeaders) {
    return Collections.emptyList();
  }

  List<KV<String, byte[]>> vals = new ArrayList<>();
  for (Header header : record.getHeaders()) {
    vals.add(KV.of(header.key(), header.value()));
  }
  return vals;
}
 
Example #22
Source File: MergeStatementBuildingFnTest.java    From DataflowTemplates with Apache License 2.0
@Test
public void testTablesBuiltInPipeline() {
  Pipeline p = Pipeline.create();

  PCollection<KV<String, KV<Schema, Schema>>> tableSchemaS =
      p.apply(Create.of(
          KV.of(TABLE_1_NAME, KV.of(TABLE_1_PK_SCHEMA, TABLE_1_SCHEMA)),
          KV.of(TABLE_2_NAME, KV.of(TABLE_2_PK_SCHEMA, TABLE_2_SCHEMA)),
          KV.of(TABLE_1_NAME, KV.of(TABLE_1_PK_SCHEMA, TABLE_1_SCHEMA))));

  PCollection<KV<String, BigQueryAction>> statementsIssued =
      tableSchemaS
          .apply(ParDo.of(
              new MergeStatementBuildingFn(CHANGELOG_DATASET_ID, REPLICA_DATASET_ID, PROJECT_ID)));

  PCollection<KV<String, Long>> tablesCreatedCount = statementsIssued
      .apply("GetCreateActions",
          Filter.by(input -> input.getValue().action.equals(BigQueryAction.CREATE_TABLE)))
      .apply("CountCreateActions", Count.perKey());

  PCollection<KV<String, Long>> tablesMerged = statementsIssued
      .apply("GetMergeActions",
          Filter.by(input -> input.getValue().action.equals(BigQueryAction.STATEMENT)))
      .apply("CountMergeActions", Count.perKey());

  PAssert.that(tablesCreatedCount)
      .containsInAnyOrder(
          KV.of(TABLE_1_NAME, 1L),
          KV.of(TABLE_2_NAME, 1L));

  PAssert.that(tablesMerged)
      .containsInAnyOrder(
          KV.of(TABLE_1_NAME, 2L),
          KV.of(TABLE_2_NAME, 1L));

  p.run().waitUntilFinish();
}
 
Example #23
Source File: S3Import.java    From dlp-dataflow-deidentification with Apache License 2.0
@SplitRestriction
public void splitRestriction(
    KV<String, ReadableFile> file, OffsetRange range, OutputReceiver<OffsetRange> out) {

  for (final OffsetRange p : range.split(1, 1)) {
    out.output(p);
  }
}
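
OffsetRange.split(desiredNumOffsetsPerSplit, minNumOffsetsPerSplit) called with (1, 1) yields unit-sized sub-ranges, so each offset in the initial restriction becomes its own split; for instance, OffsetRange(1, 4) splits into [1, 2), [2, 3) and [3, 4).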
 
Example #24
Source File: CreateViewTransform.java    From incubator-nemo with Apache License 2.0
@Override
public void onData(final WindowedValue<KV<?, I>> element) {
  // The key of the element is always null (Beam's semantics),
  // because a view is globally materialized data regardless of key.
  for (final BoundedWindow window : element.getWindows()) {
    windowListMap.putIfAbsent(window, new ArrayList<>());
    final List<I> list = windowListMap.get(window);
    list.add(element.getValue().getValue());
  }
}
 
Example #25
Source File: ImportTransform.java    From DataflowTemplates with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c) {
  KV<String, TableManifest> kv = c.element();
  String table = kv.getKey();
  TableManifest manifest = kv.getValue();
  boolean gcs = GcsPath.GCS_URI.matcher(importDirectory.get()).matches();
  if (gcs) {
    validateGcsFiles(c, table, manifest);
  } else {
    validateLocalFiles(c, table, manifest);
  }
}
 
Example #26
Source File: BeamKeyExtractor.java    From incubator-nemo with Apache License 2.0
@Override
public Object extractKey(final Object element) {
  final Object valueToExtract = element instanceof WindowedValue ? ((WindowedValue) element).getValue() : element;
  if (valueToExtract instanceof KV) {
    // Handle null keys, since Beam allows KV with null keys.
    final Object key = ((KV) valueToExtract).getKey();
    return key == null ? 0 : key;
  } else {
    return element;
  }
}
 
Example #27
Source File: DLPTextToBigQueryStreaming.java    From dlp-dataflow-deidentification with Apache License 2.0
/**
 * An SDF needs to define a @GetInitialRestriction method that can create a restriction
 * describing the complete work for a given element. In our case this is the total number
 * of rows in each CSV file. We calculate the number of splits required based on the total
 * number of rows and the batch size provided.
 *
 * @throws IOException if the file cannot be read
 */
@GetInitialRestriction
public OffsetRange getInitialRestriction(KV<String, ReadableFile> csvFile) throws IOException {

  int rowCount = 0;
  int totalSplit = 0;
  try (BufferedReader br = getReader(csvFile.getValue())) {
    // Assume the first row is a header.
    int checkRowCount = (int) br.lines().count() - 1;
    rowCount = (checkRowCount < 1) ? 1 : checkRowCount;
    totalSplit = rowCount / batchSize.get().intValue();
    int remaining = rowCount % batchSize.get().intValue();
    /*
     * Adjust the total number of splits based on the remaining rows. For example, a batch
     * size of 15 for 100 rows yields 7 splits in total; as it's a range, the last split
     * will have the offset range {7,8}.
     */
    if (remaining > 0) {
      totalSplit = totalSplit + 2;

    } else {
      totalSplit = totalSplit + 1;
    }
  }

  LOG.debug("Initial Restriction range from 1 to: {}", totalSplit);
  return new OffsetRange(1, totalSplit);
}
 
Example #28
Source File: WriteToGCSText.java    From DataflowTemplates with Apache License 2.0
@Override
public PDone expand(PCollection<KV<String, String>> kafkaRecords) {
  return kafkaRecords
      /*
       * Converting KV<String, String> records to String using DoFn.
       */
      .apply(
          "Converting to String",
          ParDo.of(
              new DoFn<KV<String, String>, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  c.output(c.element().getValue());
                }
              }))
      /*
       * Writing as text file using {@link TextIO}.
       *
       * The {@link WindowedFilenamePolicy} class specifies the file path for writing the file.
       * The {@link withNumShards} option specifies the number of shards passed by the user.
       * The {@link withTempDirectory} option sets the base directory used to generate temporary files.
       */
      .apply(
          "Writing as Text",
          TextIO.write()
              .to(
                  new WindowedFilenamePolicy(
                      outputDirectory(),
                      outputFilenamePrefix(),
                      WriteToGCSUtility.SHARD_TEMPLATE,
                      WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.TEXT)))
              .withTempDirectory(
                  FileBasedSink.convertToFileResourceIfPossible(tempLocation())
                      .getCurrentDirectory())
              .withWindowedWrites()
              .withNumShards(numShards()));
}
 
Example #29
Source File: EmptyComponents.java    From incubator-nemo with Apache License 2.0
@Override
public Object extractKey(final Object element) {
  if (element instanceof KV) {
    // Handle null keys, since Beam allows KV with null keys.
    final Object key = ((KV) element).getKey();
    return key == null ? 0 : key;
  } else {
    return element;
  }
}
 
Example #30
Source File: BigQueryMappers.java    From DataflowTemplates with Apache License 2.0
@Override
public KV<TableId, TableRow> getOutputObject(TableRow input) {
  TableId tableId = getTableId(input);
  TableRow tableRow = getTableRow(input);

  return KV.of(tableId, tableRow);
}