java source code of MusicBrainzTransforms

/*
 * Copyright 2019 Google LLC
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     https://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.cloud.bqetl.mbdata;


import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.bqetl.json.JSONReader;
import com.google.cloud.bqetl.mbschema.FieldSchemaListBuilder;
import com.google.cloud.bqetl.options.BQETLOptions;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.transforms.*;
import org.apache.beam.sdk.transforms.ParDo.SingleOutput;
import org.apache.beam.sdk.transforms.join.CoGbkResult;
import org.apache.beam.sdk.transforms.join.CoGroupByKey;
import org.apache.beam.sdk.transforms.join.KeyedPCollectionTuple;
import org.apache.beam.sdk.values.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;
import java.util.stream.Collectors;
import java.util.AbstractMap.SimpleEntry;


public class MusicBrainzTransforms {
/**
 *  This is a library of reusable transforms for the musicbrainz dataset.
 */
  private static int BIGQUERY_NESTING_LIMIT = 1000;
  private static final Logger logger = LoggerFactory.getLogger(MusicBrainzTransforms.class);

  /**
   * Given a PCollection of MusicBrainzDataObject's turn into to a keyed collection keyed by the given key.
   *
   * @param name  - name of the column value to use as the key
   * @param input - the PCollection of MusicBrainzDataObject's
   *              note that the column value is assumed to be a Long
   */
  public static PCollection<KV<Long, MusicBrainzDataObject>> by(String name, PCollection<MusicBrainzDataObject> input) {
    return input.apply("by " + name, MapElements
      .into(new TypeDescriptor<KV<Long, MusicBrainzDataObject>>() {}) 
      .via( (MusicBrainzDataObject inputObject) -> {
            try {
              return KV.of((Long) inputObject.getColumnValue(name), inputObject);
            } catch (Exception e) {
              logger.error(" exception in by " + name, e);
              return null;
            }
          }

       )
    );

  }

  private static PCollection<KV<Long, CoGbkResult>> group(String name,
                                                          PCollection<KV<Long, MusicBrainzDataObject>> first,
                                                          PCollection<KV<Long, MusicBrainzDataObject>> second,
                                                          TupleTag<MusicBrainzDataObject> firstTag,
                                                          TupleTag<MusicBrainzDataObject> secondTag
  ) {
    final CoGroupByKey<Long> grouper = CoGroupByKey.create();


    PCollection<KV<Long, CoGbkResult>> joinedResult;

    try {
      joinedResult = KeyedPCollectionTuple
                      .of(firstTag, first)
                      .and(secondTag, second)
                      .apply("joinResult_"+name,
                             CoGroupByKey.<Long>create());
    } catch (Exception e) {
      logger.error("exception grouping.", e);
      return null;
    }
    return joinedResult;
  }

  /**
   * Perform an inner join of two keyed PCollections of MusicBrainzDataObjects and merge the results into a
   * list of MusicBrainzDataObjects.
   *
   * @param table1 - Keyed PCollection of MusicBrainzDataObject's
   * @param table2 - Keyed PCollection of MusicBrainzDataObject's
   */
  // [START innerJoin]
  public static PCollection<MusicBrainzDataObject> 
                       innerJoin(String name, 
                                 PCollection<KV<Long, MusicBrainzDataObject>> table1, 
                                 PCollection<KV<Long, MusicBrainzDataObject>> table2) {
    final TupleTag<MusicBrainzDataObject> t1 = new TupleTag<MusicBrainzDataObject>(){};
    final TupleTag<MusicBrainzDataObject> t2 = new TupleTag<MusicBrainzDataObject>(){};
    PCollection<KV<Long, CoGbkResult>> joinedResult = group(name, table1, table2, t1, t2);
    // [END innerJoin]
    // [START mergeJoinResults]
    PCollection<List<MusicBrainzDataObject>> mergedResult = 
        joinedResult.apply("merge join results", 
                     MapElements
                   .into(new TypeDescriptor<List<MusicBrainzDataObject>>() {}) 
                   .via( ( KV<Long, CoGbkResult> group ) -> {
                       List<MusicBrainzDataObject> result = new ArrayList<MusicBrainzDataObject>();
                       Iterable<MusicBrainzDataObject> leftObjects = group.getValue().getAll(t1);
                       Iterable<MusicBrainzDataObject> rightObjects = group.getValue().getAll(t2);
                       leftObjects.forEach((MusicBrainzDataObject l) -> {
                         rightObjects.forEach((MusicBrainzDataObject r) -> {
                           result.add(l.duplicate().merge(r));
                         });
                       });
                       return result;
                     }
                   )
);
// [END mergeJoinResults]
// [START flattenMergedResults]
    return mergedResult.apply("Flatten List to Objects", Flatten.<MusicBrainzDataObject>iterables());
// [END flattenMergedResults]
  }


  /**
   * Given a parent PCollection with a known value for a key,
   * nest a given child collection based on it's key value within elements of the first collection.
   *
   * @param parent - Keyed PCollection of Parent MusicBrainzDataObject's
   * @param child  - Keyed PCollection of Child MusicBrainzDataObject's
   */
  // [START nestTransform]
  public static PCollection<MusicBrainzDataObject> nest(PCollection<KV<Long, MusicBrainzDataObject>> parent,
                                                        PCollection<KV<Long, MusicBrainzDataObject>> child,
                                                        String nestingKey) {
    final TupleTag<MusicBrainzDataObject> parentTag = new TupleTag<MusicBrainzDataObject>(){};
    final TupleTag<MusicBrainzDataObject> childTag = new TupleTag<MusicBrainzDataObject>(){};

    PCollection<KV<Long, CoGbkResult>> joinedResult = group("nest " + nestingKey, parent, child, parentTag, childTag);
    return joinedResult.apply("merge join results " + nestingKey, 
                              MapElements
                               .into(new TypeDescriptor<MusicBrainzDataObject>() {})
                               .via((KV<Long, CoGbkResult> group) -> {
                                  MusicBrainzDataObject parentObject = group.getValue().getOnly(parentTag);
                                  Iterable<MusicBrainzDataObject> children = group.getValue().getAll(childTag);
                                  List<MusicBrainzDataObject> childList = new ArrayList<MusicBrainzDataObject>();
                                  children.forEach(childList::add);
                                  parentObject = parentObject.duplicate();
                                  parentObject.addColumnValue("recordings", childList);
                                  return parentObject;
                                })
                           );
  }
  // [END nestTransform]

  /*
   * Create a simple serializable version of the TableSchema useable across worker nodes.
   */
  private static Map<String, Object> serializeableTableSchema(TableSchema schema) {
    return serializeableTableSchema(null, schema.getFields());
  }
  /*
   * recursable method for serializable schema
   */
  private static Map<String, Object> serializeableTableSchema(Map<String, Object> current, List<TableFieldSchema> fields) {
    if (current == null) {
      current = new HashMap<String, Object>();
    }
    for (TableFieldSchema field : fields) {
      if (field.getType().equals(FieldSchemaListBuilder.RECORD)) {
        current.put(field.getName(), serializeableTableSchema(null, field.getFields()));
      } else {
        current.put(field.getName(), field.getType());
      }
    }
    return current;
  }

  /**
   * Given a set of MusicBrainzDataObject's representing table rows and a TableSchema that has keynames that match with
   * the PCollection of MusicBrainzDataObjects, execute a transform to transform those objects in to BigQuery table
   * rows. Only use the fields of the data objects found in the Table schema.
   *
   * @param objects - the PCollection of data objects to transform into TableRows
   * @param schema  - the table schema to use
   */

  public static PCollection<TableRow> transformToTableRows(PCollection<MusicBrainzDataObject> objects, TableSchema schema) {
    Map<String, Object> serializableSchema = serializeableTableSchema(schema);
    return objects.apply("Big Query TableRow Transform",
                         MapElements
                          .into(new TypeDescriptor<List<TableRow>>() {})
                          .via((MusicBrainzDataObject inputObject) -> {
                                  List<TableRow> rows = toTableRows(inputObject, serializableSchema);
                                  return rows;
                               }))
                   .apply("Flatten TableRow List to TableRows", Flatten.<TableRow>iterables());
   }

 
  /**
   * This converts a single MusicBrainzDataObject into a list of one or more TableRows, nesting as necessary.
   * It uses the BIGQUERY_NESTING_LIMIT to duplicate rows and continue adding nested records to their duplicate.
   * For example if a MusicBrainzDataObject has a child list of BIGQUERY_NESTING_LIMIT + 1 nested objects the
   * result will be a list containing two table rows 1 with BIGQUERY_NESTINGLIMIT children and a duplicate with
   * one child.
   */
  // [START toTableRows]
  private static List<TableRow> toTableRows(MusicBrainzDataObject mbdo, Map<String, Object> serializableSchema) {
    TableRow row = new TableRow();
    List<TableRow> result = new ArrayList<TableRow>();
    Map<String, List<MusicBrainzDataObject>> nestedLists = new HashMap<String, List<MusicBrainzDataObject>>();
    Set<String> keySet = serializableSchema.keySet();
    /*
     *  construct a row object without the nested objects
     */
    int maxListSize = 0;
    for (String key : keySet) {
      Object value = serializableSchema.get(key);
      Object fieldValue = mbdo.getColumnValue(key);
      if (fieldValue != null) {
        if (value instanceof Map) {
          List<MusicBrainzDataObject> list = (List<MusicBrainzDataObject>) fieldValue;
          if (list.size() > maxListSize) {
            maxListSize = list.size();
          }
          nestedLists.put(key, list);
        } else {
          row.set(key, fieldValue);
        }

      }
    }
    /*
     * add the nested objects but break up the nested objects across duplicate rows if nesting limit exceeded
     */
    TableRow parent = row.clone();
    Set<String> listFields = nestedLists.keySet();
    for (int i = 0; i < maxListSize; i++) {
      parent = (parent == null ? row.clone() : parent);
      final TableRow parentRow = parent;
      nestedLists.forEach((String key, List<MusicBrainzDataObject> nestedList) -> {
        if (nestedList.size() > 0) {
          if (parentRow.get(key) == null) {
            parentRow.set(key, new ArrayList<TableRow>());
          }
          List<TableRow> childRows = (List<TableRow>) parentRow.get(key);
          childRows.add(toChildRow(nestedList.remove(0), (Map<String, Object>) serializableSchema.get(key)));
        }
      });
      if ((i > 0) && (i % BIGQUERY_NESTING_LIMIT == 0)) {
        result.add(parent);
        parent = null;
      }
    }
    if (parent != null) {
      result.add(parent);
    }
    return result;
  }
  // [END toTableRows]

  /**
   * A child row cannot have any nested repeated records.
   * This turns a MusicBrainzDataObject into a child row that is presumably nested inside a parent.
   */
  private static TableRow toChildRow(MusicBrainzDataObject object, Map<String, Object> childSchema) {
    TableRow row = new TableRow();
    childSchema.forEach((String key, Object value) -> {
      row.set(key, object.getColumnValue(key));
    });
    return row;
  }


  /**
   * Given the cloud storage object containing a line delimited json file,
   * a pipeline and a keyname load MusicBrainzDataObject's into a keyed PCollection
   * with the key being the column value for keyName and applies the supplied mappings.
   *
   * @param p       Pipeline object to use to load the data objects
   * @param name    the name of the google cloud storage object
   * @param keyName the name of the column to use as the key for this PCollection<KV<Long,MusicBrainzDataObject>
   *                Note that this key assumed to be a Long.
   * @param mappers variable sized list of lookup descriptions.
   *                mappers map a Long integer to a String
   * @return PCollection<KV<Long,MusicBrainzDataObject>>
   */
  public static PCollection<KV<Long, MusicBrainzDataObject>> loadTable(Pipeline p, String name, String keyName,
                                                                       LookupDescription... mappers) {
    PCollection<String> text = loadText(p, name);
    return loadTableFromText(text, name, keyName, mappers);
  }

  /**
   * Given the cloud storage object containing a line delimited json file,
   * a pipeline and a keyname load MusicBrainzDataObjects into a keyed PCollection
   * with the key being the column value for keyName
   *
   * @param p       Pipeline object to use to load the data objects
   * @param name    the name of the google cloud storage object
   * @param keyName the name of the column to use as the key for this PCollection<KV<Long,MusicBrainzDataObject>
   *                Note that this key assumed to be a Long.
   * @return PCollection<KV<Long,MusicBrainzDataObject>>
   */
  public static PCollection<KV<Long, MusicBrainzDataObject>> loadTable(Pipeline p, String name, String keyName) {
    PCollection<String> text = loadText(p, name);
    return loadTableFromText(text, name, keyName);
  }

  /**
   * Given the cloud storage object containing a line delimited json file,
   * and a pipeline that has the BQETLOptions set, load the MusicBrainzDataObjects into a PCollection.
   *
   * @param p    Pipeline object to use for the load
   * @param name name of the google cloud storage object
   * @return PCollection<MusicBrainzDataObject>
   */
  public static PCollection<MusicBrainzDataObject> loadTable(Pipeline p, String name) {
    PCollection<String> text = loadText(p, name);
    return loadTableFromText(text, name);
  }


  /**
   * Given a PCollection of String's each representing an MusicBrainzDataObject transform those strings into KV<Long,MusicBrainzDataObject> where the
   * name is the namespace of the data object key is the value of the object's keyName property.
   *
   * @param text    - the PCollection of strings
   * @param name    - the namespace for the data objects (or row name)
   * @param keyName - the key to use as the key in the KV object.
   * @return
   */
  // [START loadTableByValue]
  public static PCollection<KV<Long, MusicBrainzDataObject>> loadTableFromText(PCollection<String> text, String name, String keyName) {
    final String namespacedKeyname = name + "_" + keyName;
    return text.apply("load " + name, 
                      MapElements
                        .into(new TypeDescriptor<KV<Long, MusicBrainzDataObject>>() {})
                        .via( (String input) -> {
                              MusicBrainzDataObject datum = JSONReader.readObject(name, input);
                              Long key = (Long) datum.getColumnValue(namespacedKeyname);
                              return KV.of(key, datum);
                              })
           );
  }
  // [END loadTableByValue]

  /**
   * Given a PCollection of String's each representing an MusicBrainzDataObject transform those strings into
   * MusicBrainzDataObject's where the namespace for the MusicBrainzDataObject is 'name'
   *
   * @param text the json string representing the MusicBrainzDataObject
   * @param name the namespace for hte MusicBrainzDataObject
   * @return PCollection of MusicBrainzDataObjects
   */
  public static PCollection<MusicBrainzDataObject> loadTableFromText(PCollection<String> text, String name) {
    return text.apply("load : " + name, 
                      MapElements
                         .into(new TypeDescriptor<MusicBrainzDataObject>() {})
                         .via((String input) -> JSONReader.readObject(name, input))
                     );
  }

  /**
   * Given a PCollection of Strings each with json containing a mapping from a Long to a String create a
   * Singleton PCollection with the mapping.
   * Example mapping:
   * {
   * <p>
   * .... <br/>
   * "id" : 38, <br/>
   * ....
   * "name": "Canada <br/>
   * .... <br/>
   * </p>
   * }
   * In this case keyKey is id and valueKey is name.
   *
   * @param text     - json string containing the mapping
   * @param keyKey   - the json key for the value that will serve as the key for the mapping
   * @param valueKey - the json key for the value that will serve as the value for the mapping
   */
  // [START lookupTableWithSideInputs1]
  public static PCollectionView<Map<Long, String>> loadMapFromText(PCollection<String> text, String name, String keyKey, String valueKey) {
    // column/Key names are namespaced in MusicBrainzDataObject
    String keyKeyName = name + "_" + keyKey;
    String valueKeyName = name + "_" + valueKey;

    PCollection<KV<Long, String>> entries = text.apply(
          "sideInput_" + name,
          MapElements
            .into(new TypeDescriptor<KV<Long, String>>() {})
            .via((String input) -> {
                   MusicBrainzDataObject object = JSONReader.readObject(name, input);
                   Long key = (Long) object.getColumnValue(keyKeyName);

                   String value = (String) object.getColumnValue(valueKeyName);
                   return KV.of(key, value);
                 })
          );

    return entries.apply(View.<Long, String>asMap());
  }
  // [END lookupTableWithSideInputs1]

  /**
   * Given a PCollection of String's each representing an MusicBrainzDataObject transform those strings into
   * MusicBrainzDataObject's where the name space for the MusicBrainzDataObject is 'name'
   *
   * @param text    the json string representing the MusicBrainzDataObject
   * @param name    the namespace for hte MusicBrainzDataObject
   * @param mappers variable number of lookup descriptions - lookup descriptions can be created using the
   *                factory method lookup();
   * @return PCollection of MusicBrainzDataObjects
   */

  public static PCollection<KV<Long, MusicBrainzDataObject>> loadTableFromText(PCollection<String> text, String name,
                                                                               String keyName,
                                                                               LookupDescription... mappers) {
    //[START lookupTableWithSideInputs2]
    List<SimpleEntry<ArrayList<String>, PCollectionView<Map<Long, String>>>> mapSideInputs = new ArrayList<SimpleEntry<ArrayList<String>, PCollectionView<Map<Long, String>>>>();

    for (LookupDescription mapper : mappers) {
      PCollectionView<Map<Long, String>> mapView = loadMap(text.getPipeline(), mapper.objectName, mapper.keyKey, mapper.valueKey);
      List<String> destKeyList = 
          mapper.destinationKeys.stream()
                                .map( destinationKey -> name + "_" + destinationKey )
                                .collect(Collectors.toList());

        mapSideInputs.add(new SimpleEntry(destKeyList, mapView));

    }
    //[END lookupTableWithSideInputs2]
    return loadTableFromText(text, name, keyName, mapSideInputs);
  }



  static PCollection<KV<Long, MusicBrainzDataObject>> loadTableFromText(PCollection<String> text, String name,
                                                                               String keyName,
                                                                               List<SimpleEntry<ArrayList<String>, PCollectionView<Map<Long, String>>>> sideMappings ) {
                                                                              // List<PCollectionView<Map<Long, String>>> sideMappings) {

    final String namespacedKeyname = name + "_" + keyName;

    return text.apply(
        "load with SideMappings",
         ParDo
             .of(new DoFn<String, KV<Long, MusicBrainzDataObject>>() {

      @ProcessElement
      //public void processElement(ProcessContext processContext) throws Exception {
      public void processElement(@Element String input, OutputReceiver<KV<Long,MusicBrainzDataObject>> out, ProcessContext c) {

        MusicBrainzDataObject result = JSONReader.readObject(name, input);


        sideMappings.forEach((SimpleEntry<ArrayList<String>, PCollectionView<Map<Long, String>>> mapping) -> {
          //[START lookupTableWithSideInputs3]
          Map<Long, String> sideInputMap = c.sideInput(mapping.getValue());

          List<String> keyList = mapping.getKey();

          keyList.forEach( ( String key ) -> {
            Long id = (Long) result.getColumnValue(key);
            if (id != null) {
              String label = (String) sideInputMap.get(id);
              if (label == null) {
                label = "" + id;
              }
              result.replace(key, label);
              //[END lookupTableWithSideInputs3]
            }
          });
        });

        Long key = (Long) result.getColumnValue(namespacedKeyname);

        out.output(KV.of(key, result));
      }
    }).withSideInputs(sideMappings.stream()
                                  .map( x -> x.getValue() )
                                  .collect(Collectors.toList()))); 
  }

  /**
   * Given the cloud storage object containing a line delimited json file from which a map should be
   * defined by:
   * value of json key (keyKey) -> value of other json key (valueKey)
   * (maps should be relatively small as they are generally used as side inputs for adding a mapping from a
   * non-user-friendly ID to a user-friendly name or description.
   *
   * @param name     - name
   * @param keyKey   - the name of the json key to use as the key in the resulting map.
   * @param valueKey - the name of hte json key to use as the value in the resulting map.
   */
  private static PCollectionView<Map<Long, String>> loadMap(Pipeline p, String name, String keyKey, String valueKey) {
    PCollection<String> text = loadText(p, name);
    return loadMapFromText(text, name, keyKey, valueKey);
  }

  /**
   * Given a pipeline, object name load a json line delimited file into a PCollection of strings
   * each representing a line of JSON.
   *
   * @param p
   * @param name
   * @return
   */
  // [START loadStrings]
  public static PCollection<String> loadText(Pipeline p, String name) {
    BQETLOptions options = (BQETLOptions) p.getOptions();
    String loadingBucket = options.getLoadingBucketURL();
    String objectToLoad = storedObjectName(loadingBucket, name);
    return p.apply(name, TextIO.read().from(objectToLoad));
  }
  // [END loadStrings]

  /**
   * Derive a storedObject name from the loading bucket and the name of the object.
   *
   * @param loadingBucket name of bucket
   * @param name          - name of the object
   * @return
   */
  private static String storedObjectName(String loadingBucket, String name) {
    StringBuilder builder = new StringBuilder();
    builder.append(loadingBucket).append("/").append(name).append(".json");
    return builder.toString();
  }

  /**
   * Factory method for creating a lookup table description
   * a lookup usually maps a primary key to a label.
   * Note that in this demonstration maps are assumed to be of the form Long -> String.
   *
   * @param objectName      - the storage object name
   * @param keyKey          - the keyname in the mapping table that will match the value in the target object to be
   *                        replaced (e.g. id)
   * @param valueKey        - the keyname in the mapping table that contains the value
   * @param destinationKeys - the key to replace in the target object
   */
  // [START lookupMethod]
  public static LookupDescription lookup(String objectName, String keyKey, String valueKey, String... destinationKeys) {
    return new LookupDescription(objectName, keyKey, valueKey, destinationKeys);
  }
  // [END lookupMethod]

  /**
   * Simple class for encapsulating the description of a lookup.
   * Created through factory method lookup().
   */
  public static class LookupDescription {
    String objectName;
    List<String> destinationKeys;
    String keyKey;
    String valueKey;


    LookupDescription(String objectName, String keyKey, String valueKey, String... destinationKeys) {
      this.objectName = objectName;
      this.destinationKeys = Arrays.asList(destinationKeys);
      this.keyKey = keyKey;
      this.valueKey = valueKey;
    }

  }
}