Java Code Examples for org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition#WRITE_TRUNCATE

The following examples show how to use org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition#WRITE_TRUNCATE. Each example lists its original project, source file, and license above the code.
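Before the project examples, here is a minimal, self-contained sketch of the basic pattern most of them build on: writing TableRows to BigQuery with WriteDisposition.WRITE_TRUNCATE so that each run replaces the destination table's contents. The project, dataset, table name, schema, and in-memory input below are placeholders chosen for illustration, not values taken from the examples.

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import java.util.Arrays;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.TableRowJsonCoder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;

public class WriteTruncateExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // A trivial in-memory input; a real pipeline would read from a source instead.
    PCollection<TableRow> rows =
        pipeline.apply(
            Create.of(new TableRow().set("name", "example").set("value", 42))
                .withCoder(TableRowJsonCoder.of()));

    // Schema for the placeholder destination table.
    TableSchema schema =
        new TableSchema()
            .setFields(
                Arrays.asList(
                    new TableFieldSchema().setName("name").setType("STRING"),
                    new TableFieldSchema().setName("value").setType("INTEGER")));

    rows.apply(
        "Write to BigQuery",
        BigQueryIO.writeTableRows()
            .to("my-project:my_dataset.my_table") // placeholder table spec
            .withSchema(schema)
            .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
            // WRITE_TRUNCATE replaces any existing rows in the table with this run's output.
            .withWriteDisposition(WriteDisposition.WRITE_TRUNCATE));

    pipeline.run().waitUntilFinish();
  }
}

Several of the project examples below choose the disposition at runtime from a pipeline option (the dispo variable), falling back to WRITE_APPEND when truncation is not requested; that keeps one pipeline usable for both full reloads and incremental loads.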
Example 1
Source File: FakeJobService.java    From beam with Apache License 2.0
private boolean validateDispositions(
    Table table, CreateDisposition createDisposition, WriteDisposition writeDisposition)
    throws InterruptedException, IOException {
  if (table == null) {
    // The table does not exist yet, so the job is only valid if it is allowed to create it.
    if (createDisposition == CreateDisposition.CREATE_NEVER) {
      return false;
    }
  } else if (writeDisposition == WriteDisposition.WRITE_TRUNCATE) {
    // WRITE_TRUNCATE overwrites the destination, so drop the existing table before the load.
    datasetService.deleteTable(table.getTableReference());
  } else if (writeDisposition == WriteDisposition.WRITE_EMPTY) {
    // WRITE_EMPTY is only valid when the destination table currently contains no rows.
    List<TableRow> allRows =
        datasetService.getAllRows(
            table.getTableReference().getProjectId(),
            table.getTableReference().getDatasetId(),
            table.getTableReference().getTableId());
    if (!allRows.isEmpty()) {
      return false;
    }
  }
  return true;
}
 
Example 2
Source File: SocialStatsPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * This function creates the DAG graph of transforms. It can be called from main()
 * as well as from the ControlPipeline.
 * @param options pipeline options for the social stats pipeline
 * @return the constructed Pipeline, ready to be run
 * @throws Exception if the pipeline options fail validation
 */
public static Pipeline createSocialStatsPipeline(IndexerPipelineOptions options) throws Exception {
	
    IndexerPipelineUtils.validateSocialStatsPipelineOptions(options);
	
	Pipeline pipeline = Pipeline.create(options);

	PCollection<WebresourceSocialCount> readCounts = null;
		
	String query = IndexerPipelineUtils.buildJdbcSourceImportQueryForSocialStats(options);
	
	readCounts = pipeline.apply(
		JdbcIO.<WebresourceSocialCount>read()
			.withDataSourceConfiguration(
				JdbcIO.DataSourceConfiguration.create(options.getJdbcDriverClassName(), options.getJdbcSourceUrl())
					.withUsername(options.getJdbcSourceUsername())
					.withPassword(options.getJdbcSourcePassword()))
			.withQuery(query)
			.withRowMapper(new RowMapper<WebresourceSocialCount>() {
				@Override
				public WebresourceSocialCount mapRow(ResultSet resultSet) throws Exception {
					WebresourceSocialCount result = new WebresourceSocialCount(
						resultSet.getLong("page_pub_time") * 1000L,
						resultSet.getString("url"),
						resultSet.getString("doc_col_id"),
						resultSet.getString("col_item_id"),
						resultSet.getLong("count_time") * 1000L,
						resultSet.getInt("count_tw"),
						resultSet.getInt("count_fb"));
					return result;
				}
			})
			.withCoder(AvroCoder.of(WebresourceSocialCount.class)));
	
	// if content is to be added to bigquery, then obtain a list of
	// latest social stats per page
	PCollection<WebresourceSocialCount> countsToProcess = null;
	
	if (options.getWriteTruncate() != null && !options.getWriteTruncate() && options.getWrSocialCountHistoryWindowSec() != null) {
		String queryCache = IndexerPipelineUtils.buildBigQueryProcessedSocialCountsQuery(options);
		PCollection<KV<String,Long>> lastCountTimes = pipeline
			.apply("Get processed social count times", BigQueryIO.read().fromQuery(queryCache))
			.apply(ParDo.of(new GetLastCountTime()));

		final PCollectionView<Map<String,Long>> lastCountTimesSideInput =
				lastCountTimes.apply(View.<String,Long>asMap());
		  
		countsToProcess = readCounts
			.apply(ParDo
					.of(new DoFn<WebresourceSocialCount, WebresourceSocialCount>() {
						@ProcessElement
						public void processElement(ProcessContext c) {
							WebresourceSocialCount i = c.element();
							// check in the map if we already processed this Url, and if we haven't, add the input content to 
							// the list that needs to be processed 
							Long lastTime = c.sideInput(lastCountTimesSideInput).get(i.webResourceHash);
							
							if (lastTime == null || lastTime < i.countTime)
								c.output(i);
						}
					})
					.withSideInputs(lastCountTimesSideInput)
					);
	} else {
		countsToProcess = readCounts;
	}

	PCollection<TableRow> wrSocialCounts = countsToProcess
		.apply(ParDo.of(new CreateWrSocialCountTableRowFn()));
	
	// Now write to BigQuery
	WriteDisposition dispo = options.getWriteTruncate() ?
			WriteDisposition.WRITE_TRUNCATE : WriteDisposition.WRITE_APPEND;
	
	wrSocialCounts
		.apply("Write to wrsocialcount", BigQueryIO
			.writeTableRows()
			.to(getWRSocialCountTableReference(options))
			.withSchema(getWrSocialCountSchema())
			.withWriteDisposition(dispo)); 
	

	return pipeline;
}
 
Example 3
Source File: OpinionAnalysisPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * This function creates the DAG graph of transforms. It can be called from main()
 * as well as from the ControlPipeline.
 * @param options pipeline options for the opinion analysis (NLP) pipeline
 * @return the constructed Pipeline, ready to be run
 * @throws Exception if the pipeline options fail validation
 */
public static Pipeline createNLPPipeline(IndexerPipelineOptions options) throws Exception {
	
    IndexerPipelineUtils.validateIndexerPipelineOptions(options);
	Pipeline pipeline = Pipeline.create(options);
	
	PCollection<InputContent> readContent;
	PCollection<String> rawInput;
	
	if (options.isStreaming()) {
		
		// Continuously read from a Pub/Sub topic
		rawInput = pipeline.apply("Read from PubSub", 
			PubsubIO.readStrings().fromTopic(
				options.getPubsubTopic())); 
		
	
	} else {
		// Read from GCS files

		rawInput = pipeline.apply("Read from GCS files", 
			Read.from(new RecordFileSource<String>(
				ValueProvider.StaticValueProvider.of(options.getInputFile()), 
				StringUtf8Coder.of(), 
				RecordFileSource.DEFAULT_RECORD_SEPARATOR)));
	}

	readContent = rawInput.apply(ParDo.of(new ParseRawInput()));
	
	// Extract opinions from online opinions
	PCollection<ContentIndexSummary> indexes = readContent
		.apply(ParDo.of(new IndexDocument())) 
		.setCoder(AvroCoder.of(ContentIndexSummary.class));
	

	// Write into BigQuery 
	PCollectionTuple bqrows = indexes
		.apply(ParDo.of(new CreateTableRowsFromIndexSummaryFn())
			.withOutputTags(webresourceTag, // main output collection
				TupleTagList.of(documentTag).and(sentimentTag)) // 2 side output collections
			); 
	
	PCollection<TableRow> webresourceRows = bqrows.get(webresourceTag);
	PCollection<TableRow> documentRows = bqrows.get(documentTag);
	PCollection<TableRow> sentimentRows = bqrows.get(sentimentTag);

	// Append or Overwrite
	WriteDisposition dispo = options.getWriteTruncate() ?
			WriteDisposition.WRITE_TRUNCATE : WriteDisposition.WRITE_APPEND;
	
		
	webresourceRows
		.apply("Write to webresource", 
			BigQueryIO.writeTableRows()
				.to(getWebResourceTableReference(options)) 
				.withSchema(getWebResourceSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo)); 
	
	documentRows
		.apply("Write to document", 
			BigQueryIO.writeTableRows()
				.to(getDocumentTableReference(options))
				.withSchema(getDocumentTableSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo)); 
	
	sentimentRows
		.apply("Write to sentiment", 
			BigQueryIO.writeTableRows()
				.to(getSentimentTableReference(options)) 
				.withSchema(getSentimentSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo));

	
	return pipeline;
}
 
Example 4
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Writes the webresource, document, and sentiment row collections to their BigQuery tables.
 * @param bqrows tuple holding the webresource, document, and sentiment TableRow collections
 * @param webresourceRowsUnindexed additional webresource rows to merge into the webresource table
 * @param webresourceDeduped deduplicated webresource rows, or null if none are available
 * @param options pipeline options, used to select the write disposition and streaming behavior
 */
private static void writeAllTablesToBigQuery(PCollectionTuple bqrows,
		PCollection<TableRow> webresourceRowsUnindexed, PCollection<TableRow> webresourceDeduped,
		IndexerPipelineOptions options) {
	PCollection<TableRow> webresourceRows = bqrows.get(PipelineTags.webresourceTag);
	PCollection<TableRow> documentRows = bqrows.get(PipelineTags.documentTag);
	PCollection<TableRow> sentimentRows = bqrows.get(PipelineTags.sentimentTag);

	// Now write to BigQuery
	WriteDisposition dispo = options.getWriteTruncate() ?
		WriteDisposition.WRITE_TRUNCATE : WriteDisposition.WRITE_APPEND;
	
	//Merge all collections with WebResource table records
	PCollectionList<TableRow> webresourceRowsList = (webresourceDeduped == null) ?
		PCollectionList.of(webresourceRows).and(webresourceRowsUnindexed) :
		PCollectionList.of(webresourceRows).and(webresourceRowsUnindexed).and(webresourceDeduped);
			
	PCollection<TableRow> allWebresourceRows = 
		webresourceRowsList.apply(Flatten.<TableRow>pCollections());
			
	allWebresourceRows = !options.isStreaming() ? 
		allWebresourceRows.apply("Reshuffle Webresources", new Reshuffle<TableRow>()) : 
		allWebresourceRows;
	
	allWebresourceRows
		.apply("Write to webresource", 
			BigQueryIO.writeTableRows()
				.to(getWebResourcePartitionedTableRef(options)) 
				.withSchema(getWebResourceSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo)); 
	
	documentRows = !options.isStreaming() ?
		documentRows.apply("Reshuffle Documents", new Reshuffle<TableRow>()):
		documentRows;
			
	documentRows
		.apply("Write to document", 
			BigQueryIO.writeTableRows()
				.to(getDocumentPartitionedTableRef(options))
				.withSchema(getDocumentTableSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo)); 
	
	sentimentRows = !options.isStreaming() ?
		sentimentRows.apply("Reshuffle Sentiments", new Reshuffle<TableRow>()):
		sentimentRows;
			
	sentimentRows
		.apply("Write to sentiment", 
			BigQueryIO.writeTableRows()
				.to(getSentimentPartitionedTableRef(options)) 
				.withSchema(getSentimentSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo));
}
 
Example 5
Source File: WriteTables.java    From beam with Apache License 2.0
@ProcessElement
public void processElement(ProcessContext c, BoundedWindow window) throws Exception {
  dynamicDestinations.setSideInputAccessorFromProcessContext(c);
  DestinationT destination = c.element().getKey().getKey();
  TableSchema tableSchema;
  if (firstPaneCreateDisposition == CreateDisposition.CREATE_NEVER) {
    tableSchema = null;
  } else if (jsonSchemas.containsKey(destination)) {
    tableSchema =
        BigQueryHelpers.fromJsonString(jsonSchemas.get(destination), TableSchema.class);
  } else {
    tableSchema = dynamicDestinations.getSchema(destination);
    checkArgument(
        tableSchema != null,
        "Unless create disposition is %s, a schema must be specified, i.e. "
            + "DynamicDestinations.getSchema() may not return null. "
            + "However, create disposition is %s, and %s returned null for destination %s",
        CreateDisposition.CREATE_NEVER,
        firstPaneCreateDisposition,
        dynamicDestinations,
        destination);
    jsonSchemas.put(destination, BigQueryHelpers.toJsonString(tableSchema));
  }

  TableDestination tableDestination = dynamicDestinations.getTable(destination);
  checkArgument(
      tableDestination != null,
      "DynamicDestinations.getTable() may not return null, "
          + "but %s returned null for destination %s",
      dynamicDestinations,
      destination);
  boolean destinationCoderSupportsClustering =
      !(dynamicDestinations.getDestinationCoder() instanceof TableDestinationCoderV2);
  checkArgument(
      tableDestination.getClustering() == null || destinationCoderSupportsClustering,
      "DynamicDestinations.getTable() may only return destinations with clustering configured"
          + " if a destination coder is supplied that supports clustering, but %s is configured"
          + " to use TableDestinationCoderV2. Set withClustering() on BigQueryIO.write() and, "
          + " if you provided a custom DynamicDestinations instance, override"
          + " getDestinationCoder() to return TableDestinationCoderV3.",
      dynamicDestinations);
  TableReference tableReference = tableDestination.getTableReference();
  if (Strings.isNullOrEmpty(tableReference.getProjectId())) {
    tableReference.setProjectId(c.getPipelineOptions().as(BigQueryOptions.class).getProject());
    tableDestination = tableDestination.withTableReference(tableReference);
  }

  Integer partition = c.element().getKey().getShardNumber();
  List<String> partitionFiles = Lists.newArrayList(c.element().getValue());
  String jobIdPrefix =
      BigQueryHelpers.createJobId(
          c.sideInput(loadJobIdPrefixView), tableDestination, partition, c.pane().getIndex());

  if (tempTable) {
    // This is a temp table. Create a new one for each partition and each pane.
    tableReference.setTableId(jobIdPrefix);
  }

  WriteDisposition writeDisposition = firstPaneWriteDisposition;
  CreateDisposition createDisposition = firstPaneCreateDisposition;
  if (c.pane().getIndex() > 0 && !tempTable) {
    // If writing directly to the destination, then the table is created on the first write
    // and we should change the disposition for subsequent writes.
    writeDisposition = WriteDisposition.WRITE_APPEND;
    createDisposition = CreateDisposition.CREATE_NEVER;
  } else if (tempTable) {
    // In this case, we are writing to a temp table and always need to create it.
    // WRITE_TRUNCATE is set so that we properly handle retries of this pane.
    writeDisposition = WriteDisposition.WRITE_TRUNCATE;
    createDisposition = CreateDisposition.CREATE_IF_NEEDED;
  }

  BigQueryHelpers.PendingJob retryJob =
      startLoad(
          bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)),
          bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class)),
          jobIdPrefix,
          tableReference,
          tableDestination.getTimePartitioning(),
          tableDestination.getClustering(),
          tableSchema,
          partitionFiles,
          writeDisposition,
          createDisposition,
          schemaUpdateOptions);
  pendingJobs.add(
      new PendingJobData(window, retryJob, partitionFiles, tableDestination, tableReference));
}