package com.cerner.bunsen.spark.codes.base;

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.lit;

import ca.uhn.fhir.context.FhirVersionEnum;
import ca.uhn.fhir.parser.IParser;
import com.cerner.bunsen.FhirContexts;
import com.cerner.bunsen.spark.SparkRowConverter;
import com.cerner.bunsen.spark.codes.UrlAndVersion;
import com.cerner.bunsen.spark.codes.Value;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
import org.hl7.fhir.instance.model.api.IBaseResource;
import scala.Tuple2;

/**
 * This is a partial implementation of logic to manage FHIR ValueSets. It is designed to
 * encapsulate as much functionality as possible while remaining independent of specific FHIR
 * versions. Users should generally not use this class directly, but rather consume the subclass
 * that corresponds to the FHIR version they are using.
 *
 * @param <T> the type of the FHIR ValueSet objects being used
 * @param <C> the type of the subclass of this class being used.
 */
public abstract class AbstractValueSets<T extends IBaseResource, C extends AbstractValueSets<T, C>> {

  /**
   * An encoder for serializing values.
   */
  protected static final Encoder<Value> VALUE_ENCODER = Encoders.bean(Value.class);

  protected static final Encoder<UrlAndVersion> URL_AND_VERSION_ENCODER =
      Encoders.bean(UrlAndVersion.class);

  protected static final Pattern TABLE_NAME_PATTERN =
      Pattern.compile("[A-Za-z][A-Za-z0-9_]*\\.?[A-Za-z0-9_]*");

  /**
   * Returns the encoder for UrlAndVersion tuples.
   *
   * @return the encoder for UrlAndVersion tuples.
   */
  public static Encoder<UrlAndVersion> getUrlAndVersionEncoder() {
    return URL_AND_VERSION_ENCODER;
  }

  /**
   * Returns the encoder for values.
   *
   * @return the encoder for values.
   */
  public static Encoder<Value> getValueEncoder() {
    return AbstractValueSets.VALUE_ENCODER;
  }

  /**
   * Default database name where the value sets information is stored.
   */
  public static final String VALUE_SETS_DATABASE = "ontologies";

  /**
   * Default table name where the expanded values information is stored.
   */
  public static final String VALUES_TABLE = "values";

  /**
   * Default table name where value sets metadata is stored.
   */
  public static final String VALUE_SETS_TABLE = "valuesets";

  protected final SparkSession spark;

  protected final FhirVersionEnum fhirVersion;
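  /*
   * Illustrative note (not part of the original class): the bean encoders above are what
   * version-specific subclasses typically use to build the empty datasets handed to the
   * protected constructor below. A minimal sketch, assuming only standard Spark APIs:
   *
   *   Dataset<UrlAndVersion> emptyMembers =
   *       spark.emptyDataset(AbstractValueSets.getUrlAndVersionEncoder());
   *
   *   Dataset<Value> emptyValues =
   *       spark.emptyDataset(AbstractValueSets.getValueEncoder());
   */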
  /**
   * URL and version metadata used to preserve uniqueness among value sets.
   */
  protected final Dataset<UrlAndVersion> members;

  protected final Dataset<Value> values;

  protected final Dataset<Row> valueSets;

  protected final SparkRowConverter valueSetRowConverter;

  protected AbstractValueSets(SparkSession spark,
      FhirVersionEnum fhirVersion,
      Dataset<UrlAndVersion> members,
      Dataset<Row> valueSets,
      Dataset<Value> values,
      SparkRowConverter valueSetRowConverter) {

    this.spark = spark;
    this.fhirVersion = fhirVersion;
    this.members = members;
    this.valueSets = valueSets;
    this.values = values;
    this.valueSetRowConverter = valueSetRowConverter;
  }

  /**
   * Returns the latest version of all value sets.
   *
   * @param includeExperimental whether to include value sets marked as experimental
   * @return a map of value set URIs to the latest version for them.
   */
  public Map<String,String> getLatestVersions(boolean includeExperimental) {

    return getLatestVersions(null, includeExperimental);
  }

  /**
   * Returns the latest versions of a given set of value sets.
   *
   * @param uris a set of URIs for which to retrieve the latest versions, or null to load them all
   * @param includeExperimental whether to include value sets marked as experimental
   * @return a map of value set URIs to the latest versions for them.
   */
  public Map<String,String> getLatestVersions(final Set<String> uris,
      boolean includeExperimental) {

    // Reduce by the value set URI to return only the latest version per value set. Spark's
    // provided max aggregation function only works on numeric types, so we jump into RDDs
    // and perform the reduce by hand.
    JavaRDD<UrlAndVersion> members = this.valueSets.select("url", "version", "experimental")
        .toJavaRDD()
        .filter(row -> (uris == null || uris.contains(row.getString(0)))
            && (includeExperimental || row.isNullAt(2) || !row.getBoolean(2)))
        .mapToPair(row -> new Tuple2<>(row.getString(0), row.getString(1)))
        .reduceByKey((leftVersion, rightVersion) ->
            leftVersion.compareTo(rightVersion) > 0 ? leftVersion : rightVersion)
        .map(tuple -> new UrlAndVersion(tuple._1, tuple._2));

    return spark.createDataset(members.rdd(), URL_AND_VERSION_ENCODER)
        .collectAsList()
        .stream()
        .collect(Collectors.toMap(UrlAndVersion::getUrl,
            UrlAndVersion::getVersion));
  }

  /**
   * Returns a dataset with the latest values for each value set of the given URIs.
   *
   * @param uris URIs for the value sets
   * @param includeExperimental whether to include value sets marked as experimental
   * @return a dataset of the latest values for them.
   */
  public Dataset<Value> getLatestValues(Set<String> uris, boolean includeExperimental) {

    // Since values are partitioned by value set URL and version, in most cases it is more
    // efficient to load each partition separately and union the results.
    Map<String,String> latestVersions = getLatestVersions(uris, includeExperimental);

    return getValues(latestVersions);
  }

  /**
   * Returns a dataset of all values in this collection. This is generally used for inspection and
   * debugging of values.
   *
   * @return a dataset of all values.
   */
  public Dataset<Value> getValues() {
    return this.values;
  }

  /**
   * Returns the values for the given URI and version.
   *
   * @param uri the URI of the value set for which we get values
   * @param version the version of the value set for which we get values
   * @return a dataset of values for the given URI and version.
   */
  public Dataset<Value> getValues(String uri, String version) {

    return this.values.where(col("valueseturi").equalTo(lit(uri))
        .and(col("valuesetversion").equalTo(lit(version))));
  }
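  /*
   * Illustrative usage (not part of the original class): resolving the most recent version of
   * each value set, then loading the values for one of them. The "valueSets" instance and the
   * example URI are hypothetical stand-ins; only methods defined on this class are used.
   *
   *   Map<String,String> latest = valueSets.getLatestVersions(false);
   *
   *   Dataset<Value> values = valueSets.getValues("urn:example:valueset",
   *       latest.get("urn:example:valueset"));
   */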
  /**
   * Returns a dataset with the values for each element in the map of URI to version.
   *
   * @param uriToVersion a map of value set URI to the version to load
   * @return a dataset of values for the given URIs and versions.
   */
  public Dataset<Value> getValues(Map<String,String> uriToVersion) {

    JavaSparkContext context = new JavaSparkContext(this.spark.sparkContext());

    Broadcast<Map<String,String>> broadcastUrisToVersion = context.broadcast(uriToVersion);

    return this.values.filter((FilterFunction<Value>) value -> {

      String latestVersion = broadcastUrisToVersion.getValue().get(value.getValueSetUri());

      return latestVersion != null && latestVersion.equals(value.getValueSetVersion());
    });
  }

  /**
   * Returns a dataset of value sets for inspecting metadata. Since the values a value set contains
   * can be quite large, the rows in this dataset do not include them. Instead, users should use
   * the {@link #getValues()} method to query values in depth.
   *
   * @return a dataset of value sets that do not contain concept values.
   */
  public Dataset<Row> getValueSets() {
    return this.valueSets;
  }

  /**
   * Returns the value set with the given URI and version, or null if there is no such value set.
   *
   * @param uri the URI of the value set to return
   * @param version the version of the value set to return
   * @return the specified value set.
   */
  public T getValueSet(String uri, String version) {

    // Load the value sets, which may contain zero items if the value set does not exist.
    // The typecast is necessary to placate the Java compiler when calling this Scala function.
    Row[] valuesetRows = (Row[]) this.valueSets.filter(
        col("url").equalTo(lit(uri))
            .and(col("version").equalTo(lit(version))))
        .head(1);

    if (valuesetRows.length == 0) {

      return null;

    } else {

      T valueSet = (T) valueSetRowConverter.rowToResource(valuesetRows[0]);

      Dataset<Value> filteredValues = getValues(uri, version);

      addToValueSet(valueSet, filteredValues);

      return valueSet;
    }
  }

  /**
   * Adds the given values to the given value set instance.
   *
   * @param valueSet the value set to which the values are added
   * @param values the values to add
   */
  protected abstract void addToValueSet(T valueSet, Dataset<Value> values);

  /**
   * Returns a dataset of distinct URL and version tuples.
   *
   * @param valueSets the value sets to scan for URLs and versions
   * @return the distinct URLs and versions for the value sets.
   */
  protected Dataset<UrlAndVersion> getUrlAndVersions(Dataset<Row> valueSets) {

    return valueSets.select("url", "version")
        .distinct()
        .as(URL_AND_VERSION_ENCODER);
  }

  /**
   * Returns true if any of the UrlAndVersion tuples in membersToCheck duplicate the members of
   * this value sets instance.
   *
   * @param membersToCheck the members to check for duplicates
   * @return true if there are duplicate URLs and versions, false otherwise.
   */
  protected boolean hasDuplicateUrlAndVersions(Dataset<UrlAndVersion> membersToCheck) {

    return this.members.intersect(membersToCheck).count() > 0;
  }

  /**
   * Returns a new ValueSets instance that includes the given value sets.
   *
   * @param valueSets the value sets to add to the returned collection.
   * @return a new ValueSets instance with the added value sets.
   */
  public abstract C withValueSets(Dataset<Row> valueSets);

  /**
   * Returns a new ValueSets instance that includes the given value sets.
   *
   * @param valueSets the value sets to add to the returned collection.
   * @return a new ValueSets instance with the added value sets.
   */
  public C withValueSets(T... valueSets) {

    return withValueSets(Arrays.asList(valueSets));
  }
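  /*
   * Illustrative usage (not part of the original class): fetching a single value set by URI and
   * version, then building a new collection that includes an additional value set before
   * persisting it. The "valueSets" instance, the example URI, and the "additionalValueSet"
   * resource are hypothetical stand-ins; only methods defined on this class are used.
   *
   *   valueSets.getValueSet("urn:example:valueset", "1.0");
   *
   *   valueSets.withValueSets(additionalValueSet)
   *       .writeToDatabase();
   */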
  /**
   * Returns a new ValueSets instance that includes the given value sets.
   *
   * @param valueSets the value sets to add to the returned collection.
   * @return a new ValueSets instance with the added value sets.
   */
  public C withValueSets(List<T> valueSets) {

    List<Row> rows = valueSets.stream()
        .map(valueSet -> valueSetRowConverter.resourceToRow(valueSet))
        .collect(Collectors.toList());

    return withValueSets(this.spark.createDataFrame(rows, valueSetRowConverter.getSchema()));
  }

  /**
   * Reads all value sets from a given directory and adds them to our collection. The directory may
   * be anything readable from a Spark path, including local filesystems, HDFS, S3, or others.
   *
   * @param path a path from which value sets will be loaded
   * @return an instance of ValueSets that includes the contents from that directory.
   */
  public C withValueSetsFromDirectory(String path) {

    return withValueSets(valueSetDatasetFromDirectory(path));
  }

  /**
   * Reads all value sets from a given directory that are disjoint with value sets stored in the
   * default database and adds them to our collection. The directory may be anything readable from
   * a Spark path, including local filesystems, HDFS, S3, or others.
   *
   * @param path a path from which disjoint value sets will be loaded
   * @return an instance of ValueSets that includes content from that directory that is disjoint
   *     with content already contained in the default database.
   */
  public C withDisjointValueSetsFromDirectory(String path) {

    return withDisjointValueSetsFromDirectory(path, VALUE_SETS_DATABASE);
  }
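  /*
   * Illustrative usage (not part of the original class): loading value set resources from a
   * directory. The paths below are hypothetical; any Spark-readable location works, as the
   * javadoc above notes.
   *
   *   valueSets.withValueSetsFromDirectory("hdfs:///fhir/valuesets");
   *
   *   // Adds only value sets whose url and version are not already present in the default
   *   // "ontologies" database
   *   valueSets.withDisjointValueSetsFromDirectory("s3a://bucket/fhir/valuesets");
   */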
  /**
   * Reads all value sets from a given directory that are disjoint with value sets stored in the
   * given database and adds them to our collection. The directory may be anything readable from a
   * Spark path, including local filesystems, HDFS, S3, or others.
   *
   * @param path a path from which disjoint value sets will be loaded
   * @param database the database to check value sets against
   * @return an instance of ValueSets that includes content from that directory that is disjoint
   *     with content already contained in the given database.
   */
  public C withDisjointValueSetsFromDirectory(String path, String database) {

    Dataset<UrlAndVersion> currentMembers = this.spark.table(database + "." + VALUE_SETS_TABLE)
        .select("url", "version")
        .distinct()
        .as(URL_AND_VERSION_ENCODER)
        .alias("current");

    Dataset<Row> valueSets = valueSetDatasetFromDirectory(path)
        .alias("new")
        .join(currentMembers,
            col("new.url").equalTo(col("current.url"))
                .and(col("new.version").equalTo(col("current.version"))),
            "leftanti");

    return withValueSets(valueSets);
  }

  private static class ToValueSet implements Function<Tuple2<String, String>, Row> {

    private FhirVersionEnum fhirVersion;

    private transient IParser xmlParser;

    private transient IParser jsonParser;

    private transient SparkRowConverter converter;

    ToValueSet(FhirVersionEnum fhirVersion) {
      this.fhirVersion = fhirVersion;

      xmlParser = FhirContexts.contextFor(fhirVersion).newXmlParser();
      jsonParser = FhirContexts.contextFor(fhirVersion).newJsonParser();
      converter = SparkRowConverter.forResource(FhirContexts.contextFor(fhirVersion), "ValueSet");
    }

    private void writeObject(java.io.ObjectOutputStream stream) throws IOException {
      stream.defaultWriteObject();
    }

    private void readObject(java.io.ObjectInputStream stream)
        throws IOException, ClassNotFoundException {

      stream.defaultReadObject();

      // The parsers and converter are transient, so they are rebuilt after deserialization
      xmlParser = FhirContexts.contextFor(fhirVersion).newXmlParser();
      jsonParser = FhirContexts.contextFor(fhirVersion).newJsonParser();
      converter = SparkRowConverter.forResource(FhirContexts.contextFor(fhirVersion), "ValueSet");
    }

    @Override
    public Row call(Tuple2<String, String> fileContentTuple) throws Exception {

      String filePath = fileContentTuple._1.toLowerCase();

      IBaseResource resource;

      if (filePath.endsWith(".xml")) {
        resource = xmlParser.parseResource(fileContentTuple._2());
      } else if (filePath.endsWith(".json")) {
        resource = jsonParser.parseResource(fileContentTuple._2());
      } else {
        throw new RuntimeException("Unrecognized file extension for resource: " + filePath);
      }

      return converter.resourceToRow(resource);
    }
  }

  /**
   * Returns a dataset of ValueSet rows from the content stored at the given directory.
   *
   * @param path the path containing the value sets
   * @return a dataset of the loaded value sets in row form.
   */
  protected Dataset<Row> valueSetDatasetFromDirectory(String path) {

    JavaRDD<Tuple2<String,String>> fileNamesAndContents = this.spark.sparkContext()
        .wholeTextFiles(path, 1)
        .toJavaRDD();

    return this.spark.createDataFrame(
        fileNamesAndContents.map(new ToValueSet(fhirVersion)),
        valueSetRowConverter.getSchema());
  }

  /**
   * Writes the value sets to the default database "ontologies" using the default table names
   * "values" and "valuesets".
   */
  public void writeToDatabase() {

    writeToDatabase(VALUE_SETS_DATABASE);
  }

  /**
   * Writes the value sets to the given database using the default table names "values" and
   * "valuesets".
   *
   * @param database the name of the database to which the value sets are saved
   */
  public void writeToDatabase(String database) {

    writeToTables(database + "." + VALUES_TABLE,
        database + "." + VALUE_SETS_TABLE);
  }
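  /*
   * Illustrative usage (not part of the original class): persisting the collection. The
   * "research_ontologies" database name is a hypothetical example; the backing tables are
   * created by writeToTables below if they do not already exist.
   *
   *   // Writes to ontologies.values and ontologies.valuesets
   *   valueSets.writeToDatabase();
   *
   *   // Writes to research_ontologies.values and research_ontologies.valuesets
   *   valueSets.writeToDatabase("research_ontologies");
   */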
  /**
   * Writes value sets to the given tables.
   *
   * <p>Warning: these updates are likely <em>not</em> atomic due to the lack of transactional
   * semantics in the underlying data store. Concurrent users may see previous items removed
   * before new ones are added, or may see some new items before others. This is intended for
   * use in a user-specific sandbox or staging environment.
   *
   * @param valuesTable name of the table to which the value records are saved
   * @param valueSetTable name of the table to which the value set metadata is saved
   */
  public void writeToTables(String valuesTable, String valueSetTable) {

    boolean hasExistingValueSets;

    try {

      spark.sql("DESCRIBE TABLE " + valueSetTable);

      hasExistingValueSets = true;

    } catch (Exception describeException) {

      // Checked exceptions when calling into Scala upset the Java compiler, hence the need for
      // this workaround and re-throw to propagate unexpected failures.
      if (describeException instanceof NoSuchTableException) {

        hasExistingValueSets = false;

      } else {

        throw new RuntimeException(describeException);
      }
    }

    // If the target tables do not exist, we create them. The values table is created explicitly
    // to meet our partitioning scheme.
    if (!hasExistingValueSets) {

      createValuesTable(spark, valuesTable, null);

      JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());

      // Create a value set table by writing empty data having the proper schema and properties
      spark.createDataFrame(sparkContext.emptyRDD(), valueSetRowConverter.getSchema())
          .withColumn("timestamp", lit(null).cast("timestamp"))
          .write()
          .format("parquet")
          .partitionBy("timestamp")
          .saveAsTable(valueSetTable);
    }

    // Check existing value set URIs and versions for duplicates among the new members
    Dataset<UrlAndVersion> currentMembers = this.spark.table(valueSetTable)
        .select("url", "version")
        .distinct()
        .as(URL_AND_VERSION_ENCODER);

    if (hasDuplicateUrlAndVersions(currentMembers)) {

      throw new IllegalArgumentException("The given value sets contain duplicate URLs and "
          + "versions against value sets already stored in the table, " + valueSetTable);
    }

    writeValuesToTable(this.values, valuesTable);

    this.valueSets.write()
        .mode(SaveMode.ErrorIfExists)
        .insertInto(valueSetTable);
  }

  /**
   * Creates a table of value records partitioned by valueseturi and valuesetversion.
   *
   * @param spark the Spark session
   * @param tableName the name of the values table
   * @param location the location to store the table, or null to create a Hive-managed table
   * @throws IllegalArgumentException if the table name or location are malformed
   */
  private static void createValuesTable(SparkSession spark, String tableName, String location) {

    if (!TABLE_NAME_PATTERN.matcher(tableName).matches()) {
      throw new IllegalArgumentException("Invalid table name: " + tableName);
    }

    // Hive will check for well-formed paths, so we just ensure a user isn't attempting to inject
    // additional SQL into the statement
    if (location != null && location.contains(";")) {
      throw new IllegalArgumentException("Invalid path for values table: " + location);
    }

    StringBuilder builder = new StringBuilder();

    if (location != null) {
      builder.append("CREATE EXTERNAL TABLE IF NOT EXISTS ");
    } else {
      builder.append("CREATE TABLE IF NOT EXISTS ");
    }

    builder.append(tableName);

    // Note the partitioned-by columns are deliberately lower case here, since Spark does not
    // appear to match columns to Hive partitions if they are not
    builder.append("(system STRING, "
        + "version STRING, "
        + "value STRING)\n"
        + "PARTITIONED BY (valueseturi STRING, valuesetversion STRING)\n");

    builder.append("STORED AS PARQUET\n");

    if (location != null) {
      builder.append("LOCATION '")
          .append(location)
          .append("'");
    }

    spark.sql(builder.toString());
  }
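  /*
   * For reference, the statement built by createValuesTable for the default managed-table case
   * (location == null) and the default database and table names looks like the following; the
   * "ontologies" database name comes from VALUE_SETS_DATABASE.
   *
   *   CREATE TABLE IF NOT EXISTS ontologies.values(system STRING, version STRING, value STRING)
   *   PARTITIONED BY (valueseturi STRING, valuesetversion STRING)
   *   STORED AS PARQUET
   */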
  /**
   * Writes value records to a table. This method ensures the columns and partitions are mapped
   * properly, and is a workaround for a problem similar to the one described
   * <a href="http://stackoverflow.com/questions/35313077/pyspark-order-of-column-on-write-to-mysql-with-jdbc">here</a>.
   *
   * @param values a dataset of value records
   * @param tableName the table to write them to
   */
  private static void writeValuesToTable(Dataset<Value> values, String tableName) {

    // Note the last two columns here must be the partitioned-by columns, in order and in lower
    // case, for Spark to properly match them to the partitions
    Dataset<Row> orderColumnDataset = values.select("system",
        "version",
        "value",
        "valueseturi",
        "valuesetversion");

    orderColumnDataset.write()
        .mode(SaveMode.ErrorIfExists)
        .insertInto(tableName);
  }
}