
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.lit;

import ca.uhn.fhir.context.FhirVersionEnum;
import ca.uhn.fhir.parser.IParser;
import com.cerner.bunsen.FhirContexts;
import com.cerner.bunsen.spark.SparkRowConverter;

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
import org.hl7.fhir.instance.model.api.IBaseResource;
import scala.Tuple2;

/**
 * This is a partial implementation of logic to manage FHIR ValueSets. It is designed to
 * encapsulate as much functionality as possible while remaining independent of specific FHIR
 * versions. Users should generally not use this class directly, but rather consume the subclass
 * that corresponds to the FHIR version they are using.
 *
 * @param <T> the type of the FHIR ValueSet objects being used
 * @param <C> the type of the subclass of this class being used
 */
public abstract class AbstractValueSets<T extends IBaseResource,C extends AbstractValueSets<T,C>> {

   * An encoder for serializing values.
  protected static final Encoder<Value> VALUE_ENCODER = Encoders.bean(Value.class);

  protected static final Encoder<UrlAndVersion> URL_AND_VERSION_ENCODER =

  protected static final Pattern TABLE_NAME_PATTERN =

   * Returns the encoder for UrlAndVersion tuples.
   * @return the encoder for UrlAndVersion tuples.
  public static Encoder<UrlAndVersion> getUrlAndVersionEncoder() {

   * Returns the encoder for values.
   * @return the encoder for values.
  public static Encoder<Value> getValueEncoder() {
    return AbstractValueSets.VALUE_ENCODER;

   * Default database name where the value sets information is stored.
  public static final String VALUE_SETS_DATABASE = "ontologies";

   * Default table name where the expanded values information is stored.
  public static final String VALUES_TABLE = "values";

   * Default table name where value sets metadata is stored.
  public static final String VALUE_SETS_TABLE = "valuesets";

  // Spark session this instance uses to create datasets, read tables, and run SQL statements.
  protected final SparkSession spark;

  // FHIR version supplied at construction; handed to ToValueSet when loading from a directory.
  protected final FhirVersionEnum fhirVersion;

   * URI and Version metadata used to preserve uniqueness among value sets.
  protected final Dataset<UrlAndVersion> members;

  // Value records; returned unchanged by getValues() and filtered by the other getValues
  // overloads.
  protected final Dataset<Value> values;

  // Value set rows; returned unchanged by getValueSets() and filtered in
  // getValueSet(String, String).
  protected final Dataset<Row> valueSets;

  // Converts between FHIR resources and Spark rows (rowToResource / resourceToRow) and supplies
  // the row schema used when creating the value set table.
  protected final SparkRowConverter valueSetRowConverter;

  /**
   * Stores the given arguments in the corresponding fields. No copying or validation is done
   * here.
   *
   * @param spark the Spark session
   * @param fhirVersion the FHIR version
   * @param members the URL and version tuples of the value sets in this collection
   * @param valueSets the value set rows
   * @param values the value records
   * @param valueSetRowConverter the converter between value set resources and rows
   */
  protected AbstractValueSets(SparkSession spark,
      FhirVersionEnum fhirVersion,
      Dataset<UrlAndVersion> members,
      Dataset<Row> valueSets,
      Dataset<Value> values,
      SparkRowConverter valueSetRowConverter) {

    this.spark = spark;
    this.fhirVersion = fhirVersion;
    this.members = members;
    this.valueSets = valueSets;
    this.values = values;
    this.valueSetRowConverter = valueSetRowConverter;

   * Returns the latest version of all value sets.
   * @param includeExperimental whether to include value sets marked as experimental
   * @return a map of value set URIs to the latest version for them.
  public Map<String,String> getLatestVersions(boolean includeExperimental) {

    // Delegates to the two-argument overload; a null URI set means "do not restrict by URI".
    return getLatestVersions(null, includeExperimental);

   * Returns the latest versions of a given set of value sets.
   * @param uris a set of URIs for which to retrieve the latest versions, or null to load them all
   * @param includeExperimental whether to include value sets marked as experimental
   * @return a map of value set URIs to the latest versions for them.
  public Map<String,String> getLatestVersions(final Set<String> uris, boolean includeExperimental) {

    // Reduce by the value set URI to return only the latest version
    // per value set. Spark's provided max aggregation function
    // only works on numeric types, so we jump into RDDs and perform
    // the reduce by hand.
    // NOTE(review): the next statement looks truncated in this view (the dataset that the
    // "url", "version", "experimental" columns are selected from is missing) -- confirm against
    // the full file.
    JavaRDD<UrlAndVersion> members ="url", "version", "experimental")
        // Keep a row when its URL is requested (or uris is null) and it is either allowed as
        // experimental or not flagged experimental; a null flag counts as non-experimental.
        .filter(row -> (uris == null || uris.contains(row.getString(0)))
            && (includeExperimental || row.isNullAt(2) || !row.getBoolean(2)))
        .mapToPair(row -> new Tuple2<>(row.getString(0), row.getString(1)))
        // "Latest" is decided with String.compareTo, i.e. a lexicographic comparison of the
        // version strings rather than a numeric one.
        .reduceByKey((leftVersion, rightVersion) ->
            leftVersion.compareTo(rightVersion) > 0 ? leftVersion : rightVersion)
        .map(tuple -> new UrlAndVersion(tuple._1, tuple._2));

    // NOTE(review): the return expression below appears cut off in this view; the conversion
    // from this dataset to the returned Map is not visible.
    return spark.createDataset(members.rdd(), URL_AND_VERSION_ENCODER)

  /**
   * Returns a dataset with the latest values for each value set of the given URIs.
   *
   * @param uris URIs for the value sets
   * @param includeExperimental whether to include value sets marked as experimental
   * @return a dataset of the latest values for them.
   */
  public Dataset<Value> getLatestValues(Set<String> uris, boolean includeExperimental) {

    // Resolve the latest version of each requested value set first, then select the values
    // whose value set URI and version match one of those pairs.
    Map<String,String> latestVersions = getLatestVersions(uris, includeExperimental);

    return getValues(latestVersions);

   * Returns a dataset of all values in this collection. This is generally used for inspection and
   * debugging of values.
   * @return a dataset of all values.
  public Dataset<Value> getValues() {
    // Hands back the dataset reference held by this instance; nothing is filtered or copied.
    return this.values;

   * Returns the values for the given URI and version.
   * @param uri the uri of the value set for which we get values
   * @param version the version of the value set for which we get values
   * @return a dataset of values for the given URI and version.
  public Dataset<Value> getValues(String uri, String version) {

    // Restrict the values to rows whose "valueseturi" column equals the given URI.
    // NOTE(review): the predicate is cut off in this view; the version condition described in
    // the method documentation is not visible here -- confirm against the full file.
    return this.values.where(col("valueseturi").equalTo(lit(uri))

   * Returns a dataset with the values for each element in the map of uri to version.
   * @param uriToVersion a map of value set URI to the version to load
   * @return a dataset of values for the given URIs and versions.
  public Dataset<Value> getValues(Map<String,String> uriToVersion) {

    // Wrap the session's SparkContext so the map can be broadcast through the Java API.
    JavaSparkContext context = new JavaSparkContext(this.spark.sparkContext());

    // Broadcast the URI-to-version map; the filter below reads it through the broadcast handle.
    Broadcast<Map<String,String>> broadcastUrisToVersion = context.broadcast(uriToVersion);

    return this.values.filter((FilterFunction<Value>) value -> {

      // Requested version for this value's value set URI; null when the URI is not in the map.
      String latestVersion = broadcastUrisToVersion.getValue().get(value.getValueSetUri());

      // Keep the value only when its URI is in the map and its version equals the mapped one.
      return latestVersion != null && latestVersion.equals(value.getValueSetVersion());

   * Returns a dataset of value sets to inspect metadata. Since the value sets themselves can be
   * quite large, the values in this dataset do not contain them. Instead, users should use the
   * {@link #getValues()} method to query values in depth.
   * @return a dataset of value sets that do not contain concept values.
  public Dataset<Row> getValueSets() {
    // Hands back the value set rows held by this instance without filtering or copying.
    return this.valueSets;

   * Returns the value set with the given uri and version, or null if there is no such value set.
   * @param uri the uri of the value set to return
   * @param version the version of the value set to return
   * @return the specified value set.
  public T getValueSet(String uri, String version) {

    // Load the value sets, which may contain zero items if the value set does not exist

    // Typecast necessary to placate the Java compiler calling this Scala function
    // NOTE(review): the filter expression is cut off in this view; the predicate and the call
    // that materialises the rows into an array are not visible.
    Row[] valuesetRows = (Row[]) this.valueSets.filter(

    if (valuesetRows.length == 0) {

      // No matching row: signal absence with null rather than an exception.
      return null;

    } else {

      // Only the first matching row is converted; any additional matches are ignored.
      T valueSet = (T) valueSetRowConverter.rowToResource(valuesetRows[0]);

      // Fetch the values for the same URI and version and let the subclass attach them.
      Dataset<Value> filteredValues = getValues(uri, version);

      addToValueSet(valueSet, filteredValues);

      return valueSet;

  /**
   * Adds the given values to the given value set instance.
   *
   * @param valueSet the value set to add the values to.
   * @param values the values to add.
   */
  // Abstract hook with no body in this class; getValueSet(String, String) calls it after
  // converting the value set row and loading the matching values.
  protected abstract void addToValueSet(T valueSet, Dataset<Value> values);

   * Returns a dataset of distinct URL and version tuples.
   * @param valueSets valueSets to scan for the URL and version.
   * @return the distinct URL and versions for the value sets.
  protected Dataset<UrlAndVersion> getUrlAndVersions(Dataset<Row> valueSets) {

    // NOTE(review): this statement is truncated in this view; only the "url" and "version"
    // column names survive, and the rest of the expression is not visible.
    return"url", "version")

  /**
   * Returns true if any UrlAndVersion in membersToCheck duplicates a member of this value sets
   * instance.
   *
   * @param membersToCheck the members to check for duplicates
   * @return true if there are duplicate URL and versions, false otherwise.
   */
  protected boolean hasDuplicateUrlAndVersions(Dataset<UrlAndVersion> membersToCheck) {

    // Any row in the intersection is a URL/version pair present in both datasets.
    return this.members.intersect(membersToCheck).count() > 0;

   * Returns a new ValueSets instance that includes the given value sets.
   * @param valueSets the value sets to add to the returned collection.
   * @return a new ValueSets instance with the added value sets.
  // Abstract Dataset-based overload; the List and directory-based variants in this class pass
  // their datasets to it.
  public abstract C withValueSets(Dataset<Row> valueSets);

   * Returns a new ValueSets instance that includes the given value sets.
   * @param valueSets the value sets to add to the returned collection.
   * @return a new ValueSets instance with the added value sets.
  public C withValueSets(T... valueSets) {

    // Wrap the varargs array as a List and defer to the List overload.
    return withValueSets(Arrays.asList(valueSets));

   * Returns a new ValueSets instance that includes the given value sets.
   * @param valueSets the value sets to add to the returned collection.
   * @return a new ValueSets instance with the added value sets.
  public C withValueSets(List<T> valueSets) {

    // Convert each value set resource to a Spark row with the row converter.
    // NOTE(review): the start and end of this expression are not visible in this view.
    List<Row> rows =
        .map(valueSet -> valueSetRowConverter.resourceToRow(valueSet))

    // Build a DataFrame from the rows and defer to the Dataset overload.
    // NOTE(review): the remaining createDataFrame arguments are cut off in this view.
    return withValueSets(this.spark.createDataFrame(rows,

   * Reads all value sets from a given directory and adds them to our collection. The directory may
   * be anything readable from a Spark path, including local filesystems, HDFS, S3, or others.
   * @param path a path from which value sets will be loaded
   * @return an instance of ValueSets that includes the contents from that directory.
  public C withValueSetsFromDirectory(String path) {

    // Load the files under the path as value set rows, then defer to the Dataset overload.
    return withValueSets(valueSetDatasetFromDirectory(path));

   * Returns all value sets that are disjoint with value sets stored in the default database and
   * adds them to our collection. The directory may be anything readable from a Spark path,
   * including local filesystems, HDFS, S3, or others.
   * @param path a path from which disjoint value sets will be loaded
   * @return an instance of ValueSets that includes content from that directory that is disjoint
   *         with content already contained in the default database.
  public C withDisjointValueSetsFromDirectory(String path) {

    // Same as the two-argument overload, checked against the default database.
    return withDisjointValueSetsFromDirectory(path, VALUE_SETS_DATABASE);

   * Returns all value sets that are disjoint with value sets stored in the given database and
   * adds them to our collection. The directory may be anything readable from a Spark path,
   * including local filesystems, HDFS, S3, or others.
   * @param path a path from which disjoint value sets will be loaded
   * @param database the database to check value sets against
   * @return an instance of ValueSets that includes content from that directory that is disjoint
   *         with content already contained in the given database.
  public C withDisjointValueSetsFromDirectory(String path, String database) {

    // Read the URL/version pairs already stored in the database's value sets table.
    // NOTE(review): the rest of this chain is not visible in this view.
    Dataset<UrlAndVersion> currentMembers = this.spark.table(database + "." + VALUE_SETS_TABLE)
        .select("url", "version")

    // Join the value sets loaded from the path against the stored members on url.
    // NOTE(review): the join condition and join type are cut off in this view, so how
    // "disjoint" rows are selected cannot be confirmed from here.
    Dataset<Row> valueSets = valueSetDatasetFromDirectory(path)
        .join(currentMembers, col("new.url").equalTo(col("current.url"))

    return withValueSets(valueSets);

  // Function that turns a (file path, file content) tuple into a value set row by parsing the
  // content as XML or JSON according to the path's extension.
  private static class ToValueSet implements Function<Tuple2<String, String>, Row> {

    private FhirVersionEnum fhirVersion;

    // The parsers and the converter are transient and are re-created from fhirVersion in
    // readObject.
    private transient IParser xmlParser;

    private transient IParser jsonParser;

    private transient SparkRowConverter converter;

    ToValueSet(FhirVersionEnum fhirVersion) {
      this.fhirVersion = fhirVersion;

      xmlParser = FhirContexts.contextFor(fhirVersion).newXmlParser();
      jsonParser = FhirContexts.contextFor(fhirVersion).newJsonParser();

      // NOTE(review): the remaining forResource arguments are cut off in this view.
      converter = SparkRowConverter.forResource(FhirContexts.contextFor(fhirVersion),

    // NOTE(review): the stream parameter type and the method body are not visible in this view.
    private void writeObject( stream) throws IOException {


    // NOTE(review): the stream parameter type and the first statements of this method are not
    // visible in this view.
    private void readObject( stream) throws IOException,
        ClassNotFoundException {


      // Rebuild the transient members from the FHIR version.
      xmlParser = FhirContexts.contextFor(fhirVersion).newXmlParser();
      jsonParser = FhirContexts.contextFor(fhirVersion).newJsonParser();

      converter = SparkRowConverter.forResource(FhirContexts.contextFor(fhirVersion),

    public Row call(Tuple2<String, String> fileContentTuple) throws Exception {

      // Lower-case the path so the extension checks below are case-insensitive.
      String filePath = fileContentTuple._1.toLowerCase();

      IBaseResource resource;

      if (filePath.endsWith(".xml")) {

        resource = xmlParser.parseResource(fileContentTuple._2());

      } else if (filePath.endsWith(".json")) {

        resource = jsonParser.parseResource(fileContentTuple._2());

      } else {

        // Any other extension fails the call; the message carries the lower-cased path.
        throw new RuntimeException("Unrecognized file extension for resource: " + filePath);

      return converter.resourceToRow(resource);

  /**
   * Returns a dataset of ValueSet from the content stored at the given directory.
   *
   * @param path the path containing the value sets
   * @return a dataframe of loaded value sets in row form.
   */
  protected Dataset<Row> valueSetDatasetFromDirectory(String path) {

    // Read each file under the path whole, as (file name, content) tuples.
    // NOTE(review): the rest of this chain is not visible in this view.
    JavaRDD<Tuple2<String,String>> fileNamesAndContents = this.spark.sparkContext()
        .wholeTextFiles(path, 1)

    // NOTE(review): this statement is truncated in this view; only the ToValueSet(fhirVersion)
    // argument survives, so how it is applied to the tuples cannot be confirmed from here.
    return this.spark.createDataFrame( ToValueSet(fhirVersion)),

  /**
   * Writes the value sets to the default database "ontologies" using default table names:
   * "values", and "valuesets".
   */
  public void writeToDatabase() {


  /**
   * Writes the value sets to the given database using default table names: "values" and
   * "valuesets".
   *
   * @param database the name of the database to which the value sets are saved
   */
  public void writeToDatabase(String database) {

    // Qualify the two default table names with the database and defer to writeToTables.
    writeToTables(database + "." + VALUES_TABLE,
        database + "." + VALUE_SETS_TABLE);

  /**
   * Writes value sets to the given tables.
   *
   * <p>Warning: these updates are likely <em>not</em> atomic due to the lack of transactional
   * semantics in the underlying data store. Concurrent users may see previous items
   * removed before new ones are added, or see some items appear before others. This is intended
   * for use in a user-specific sandbox or staging environment.
   *
   * @param valuesTable name of the table to which the value records are saved
   * @param valueSetTable name of the table to which the value set metadata is saved
   */
  public void writeToTables(String valuesTable, String valueSetTable) {

    boolean hasExistingValueSets;

    try {

      // Probe for the value set table: if DESCRIBE succeeds, the table already exists.
      spark.sql("DESCRIBE TABLE " + valueSetTable);

      hasExistingValueSets = true;

    } catch (Exception describeException) {

      // Checked exceptions when calling into Scala upset the Java compiler,
      // hence the need for this workaround and re-throw to propagate unexpected
      // failures.
      if (describeException instanceof NoSuchTableException) {

        hasExistingValueSets = false;

      } else {

        throw new RuntimeException(describeException);

    // If the target tables do not exist, we create them. The values table is
    // created explicitly to meet our partitioning system. Only the value set table is probed
    // above; its absence triggers creation of both tables.
    if (!hasExistingValueSets) {

      // A null location makes createValuesTable issue a plain (non-EXTERNAL) CREATE TABLE.
      createValuesTable(spark, valuesTable, null);

      JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());

      // Create a value set table by writing empty data having the proper schema and properties
      spark.createDataFrame(sparkContext.emptyRDD(), valueSetRowConverter.getSchema())
          .withColumn("timestamp", lit(null).cast("timestamp"))
      // NOTE(review): the rest of this call chain (the actual write) is not visible in this view.


    // Check existing value set URIs and Versions for duplicates among the new members
    // NOTE(review): the end of the following statement is not visible in this view.
    Dataset<UrlAndVersion> currentMembers = this.spark.table(valueSetTable)
        .select("url", "version")

    // Refuse to write when any URL/version pair of this instance is already in the table.
    if (hasDuplicateUrlAndVersions(currentMembers)) {

      throw new IllegalArgumentException("The given value sets contains duplicate url and versions "
          + "against value sets already stored in the table, " + valueSetTable);

    writeValuesToTable(this.values, valuesTable);


  /**
   * Creates a table of value records partitioned by valueseturi and valuesetversion.
   *
   * @param spark the spark session
   * @param tableName the name of the values table
   * @param location the location to store the table, or null to create a Hive-managed table
   * @throws IllegalArgumentException if the table name or location is malformed
   */
  private static void createValuesTable(SparkSession spark, String tableName, String location) {

    // Reject table names that do not match TABLE_NAME_PATTERN.
    if (!TABLE_NAME_PATTERN.matcher(tableName).matches()) {
      throw new IllegalArgumentException("Invalid table name: " + tableName);

    // Hive will check for well-formed paths, so we just ensure a user isn't attempting to inject
    // additional SQL into the statement
    // NOTE(review): only ';' is rejected here; a single quote inside location would still end
    // the LOCATION '...' literal opened further down -- confirm whether that needs guarding.
    if (location != null && location.contains(";")) {
      throw new IllegalArgumentException("Invalid path for values table: " + location);

    StringBuilder builder = new StringBuilder();

    // An explicit location yields CREATE EXTERNAL TABLE; a null location yields CREATE TABLE.
    if (location != null) {

      builder.append("CREATE EXTERNAL TABLE IF NOT EXISTS ");

    } else {

      builder.append("CREATE TABLE IF NOT EXISTS ");

    // NOTE(review): no statement appending the table name is visible at this point in this
    // view -- confirm against the full file.

    // Note the partitioned by columns are deliberately lower case here since Spark does not appear
    // to match columns to Hive partitions if they are not
    builder.append("(system STRING, "
        + "version STRING, "
        + "value STRING)\n"
        + "PARTITIONED BY (valueseturi STRING, valuesetversion STRING)\n");

    builder.append("STORED AS PARQUET\n");

    // NOTE(review): the LOCATION clause below is cut off in this view, and the statement that
    // executes the built DDL is not visible.
    if (location != null) {
      builder.append("LOCATION '")


  /**
   * Writes value records to a table. This method ensures the columns and partitions are mapped
   * properly, and is a workaround similar to the problem described <a
   * href="">here</a>.
   *
   * @param values a dataset of value records
   * @param tableName the table to write them to
   */
  private static void writeValuesToTable(Dataset<Value> values, String tableName) {

    // Note the last two columns here must be the partitioned-by columns in order and in lower case
    // for Spark to properly match them to the partitions
    Dataset<Row> orderColumnDataset ="system",
