/*
 * Copyright 2017 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package spark.bigquery.example.wordcount;

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration;
import com.google.cloud.hadoop.io.bigquery.BigQueryFileFormat;
import com.google.cloud.hadoop.io.bigquery.GsonBigQueryInputFormat;
import com.google.cloud.hadoop.io.bigquery.output.BigQueryOutputConfiguration;
import com.google.cloud.hadoop.io.bigquery.output.IndirectBigQueryOutputFormat;
import com.google.cloud.hadoop.util.EntriesCredentialConfiguration;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.gson.JsonObject;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

import java.io.IOException;
import java.util.Locale;

/**
 * A simple word count example that reads its input data from a BigQuery table, e.g.,
 * publicdata:samples.shakespeare, and writes its output to a user-specified BigQuery table. GCS
 * is used as temporary storage both for data exported from the input BigQuery table and for
 * output data to be loaded into the output BigQuery table.
 */
public class BigQueryWordCountToBigQuery {

  // Hadoop configuration keys consumed by the GCS connector.
  private static final String FS_GS_PROJECT_ID = "fs.gs.project.id";
  private static final String FS_GS_SYSTEM_BUCKET = "fs.gs.system.bucket";
  private static final String FS_GS_WORKING_DIR = "fs.gs.working.dir";
  private static final String MAPRED_OUTPUT_FORMAT_CLASS = "mapreduce.job.outputformat.class";

  // Column names shared by the input and output BigQuery tables.
  private static final String WORD_COLUMN = "word";
  private static final String WORD_COUNT_COLUMN = "word_count";

  // Schema of the output BigQuery table: one row per distinct word and its total count.
  private static final TableSchema OUTPUT_TABLE_SCHEMA =
      new TableSchema()
          .setFields(
              ImmutableList.of(
                  new TableFieldSchema().setName(WORD_COLUMN).setType("STRING"),
                  new TableFieldSchema().setName(WORD_COUNT_COLUMN).setType("INTEGER")));

  /**
   * Entry point.
   *
   * @param args {@code args[0]} is the fully-qualified input table ID and {@code args[1]} the
   *     fully-qualified output table ID
   * @throws IOException if configuring the job or cleaning up the temporary GCS directory fails
   */
  public static void main(String[] args) throws IOException {
    if (args.length != 2) {
      System.err.println("Usage: BigQueryWordCountToBigQuery <fully-qualified input table id> "
          + "<fully-qualified output table id>");
      System.exit(1);
    }

    Configuration conf = null;
    Path tmpDirPath = null;
    try (JavaSparkContext javaSparkContext =
        JavaSparkContext.fromSparkContext(SparkContext.getOrCreate())) {
      conf = configure(javaSparkContext.hadoopConfiguration(), args);
      tmpDirPath = new Path(conf.get(BigQueryConfiguration.TEMP_GCS_PATH_KEY)).getParent();
      // Clear out any leftovers from a previous run before starting this one.
      deleteTmpDir(tmpDirPath, conf);
      compute(javaSparkContext, conf);
    } finally {
      // Best-effort cleanup of the temporary GCS directory, even if the job failed.
      if (conf != null && tmpDirPath != null) {
        deleteTmpDir(tmpDirPath, conf);
      }
    }
  }

  /**
   * Validates the command-line arguments and populates {@code conf} with the BigQuery
   * connector's input/output settings (project, system bucket, credentials, temporary GCS
   * paths, and output format).
   *
   * @param conf the Hadoop configuration to populate (mutated in place and returned)
   * @param args command-line arguments: input table ID, then output table ID
   * @return the same {@code conf} instance, fully configured
   * @throws IOException if the BigQuery output configuration cannot be applied
   * @throws IllegalArgumentException if a required argument or configuration value is missing
   */
  private static Configuration configure(Configuration conf, String[] args) throws IOException {
    String inputTableId = args[0];
    Preconditions.checkArgument(!Strings.isNullOrEmpty(inputTableId),
        "Input BigQuery table (fully-qualified) ID must not be null or empty");
    String outputTableId = args[1];
    // BUGFIX: this previously re-validated inputTableId, letting an empty output table ID
    // through unchecked.
    Preconditions.checkArgument(!Strings.isNullOrEmpty(outputTableId),
        "Output BigQuery table (fully-qualified) ID must not be null or empty");
    String projectId = conf.get(FS_GS_PROJECT_ID);
    Preconditions.checkArgument(!Strings.isNullOrEmpty(projectId),
        "GCP project ID must not be null or empty");
    String systemBucket = conf.get(FS_GS_SYSTEM_BUCKET);
    Preconditions.checkArgument(!Strings.isNullOrEmpty(systemBucket),
        "GCS system bucket must not be null or empty");

    // Basic BigQuery connector configuration.
    conf.set(BigQueryConfiguration.PROJECT_ID_KEY, projectId);
    conf.set(BigQueryConfiguration.GCS_BUCKET_KEY, systemBucket);
    conf.set(FS_GS_WORKING_DIR, "/");

    // Use service account for authentication. The service account key file is located at the path
    // specified by the configuration property google.cloud.auth.service.account.json.keyfile.
    conf.set(
        EntriesCredentialConfiguration.BASE_KEY_PREFIX
            + EntriesCredentialConfiguration.ENABLE_SERVICE_ACCOUNTS_SUFFIX,
        "true");

    // Input configuration.
    BigQueryConfiguration.configureBigQueryInput(conf, inputTableId);
    String inputTmpGcsPath =
        String.format("gs://%s/spark/tmp/bigquery/wordcount/input", systemBucket);
    conf.set(BigQueryConfiguration.TEMP_GCS_PATH_KEY, inputTmpGcsPath);

    // Output configuration.
    String outputTmpGcsPath =
        String.format("gs://%s/spark/tmp/bigquery/wordcount/output", systemBucket);
    BigQueryOutputConfiguration.configure(
        conf,
        outputTableId,
        OUTPUT_TABLE_SCHEMA,
        outputTmpGcsPath,
        BigQueryFileFormat.NEWLINE_DELIMITED_JSON,
        TextOutputFormat.class);
    conf.set(MAPRED_OUTPUT_FORMAT_CLASS, IndirectBigQueryOutputFormat.class.getName());

    return conf;
  }

  /**
   * Runs the word count: reads rows from the input BigQuery table, sums counts per
   * (lower-cased) word, and writes the result to the output BigQuery table via the
   * configured indirect output format.
   */
  private static void compute(JavaSparkContext javaSparkContext, Configuration conf) {
    JavaPairRDD<LongWritable, JsonObject> tableData = javaSparkContext.newAPIHadoopRDD(
        conf, GsonBigQueryInputFormat.class, LongWritable.class, JsonObject.class);
    JavaPairRDD<String, Long> wordCounts = tableData
        .map(entry -> toTuple(entry._2))
        .keyBy(tuple -> tuple._1)
        .mapValues(tuple -> tuple._2)
        .reduceByKey(Long::sum);
    wordCounts
        .map(tuple -> new Text(toJson(tuple).toString()))
        .keyBy(jsonText -> jsonText)
        .mapValues(jsonText -> NullWritable.get()) // Values do not matter.
        .saveAsNewAPIHadoopDataset(conf);
  }

  /**
   * Recursively deletes {@code tmpDirPath} if it exists; logs to stderr on failure rather
   * than throwing, so cleanup never masks a job failure.
   */
  private static void deleteTmpDir(Path tmpDirPath, Configuration conf) throws IOException {
    FileSystem fs = tmpDirPath.getFileSystem(conf);
    if (fs.exists(tmpDirPath) && !fs.delete(tmpDirPath, true)) {
      System.err.println("Failed to delete temporary directory: " + tmpDirPath);
    }
  }

  /**
   * Extracts a (word, count) pair from one input row. Words are lower-cased with
   * {@link Locale#ROOT} so the folding does not depend on the JVM's default locale.
   */
  private static Tuple2<String, Long> toTuple(JsonObject jsonObject) {
    String word = jsonObject.get(WORD_COLUMN).getAsString().toLowerCase(Locale.ROOT);
    long count = jsonObject.get(WORD_COUNT_COLUMN).getAsLong();
    return new Tuple2<>(word, count);
  }

  /** Converts a (word, count) pair into a JSON object matching {@code OUTPUT_TABLE_SCHEMA}. */
  private static JsonObject toJson(Tuple2<String, Long> tuple) {
    JsonObject jsonObject = new JsonObject();
    jsonObject.addProperty(WORD_COLUMN, tuple._1);
    jsonObject.addProperty(WORD_COUNT_COLUMN, tuple._2);
    return jsonObject;
  }
}