Java Code Examples for org.apache.spark.api.java.function.FilterFunction

The following examples show how to use org.apache.spark.api.java.function.FilterFunction. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also want to check out the right sidebar, which shows the related API usage.
Example 1
Source Project: bunsen   Source File: Hierarchies.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Loads the ancestors table from the given database and wraps it in a Hierarchies instance.
 *
 * <p>The hierarchy members are derived from the loaded ancestors: rows whose URI starts with
 * HIERARCHY_URI_PREFIX are kept and reduced to their distinct (url, version) pairs.
 *
 * @param spark the spark session
 * @param database name of the database containing the ancestors table
 * @return a Hierarchies instance built from the session, the derived members and the ancestors.
 */
public static Hierarchies getFromDatabase(SparkSession spark, String database) {

  String ancestorsQuery = "SELECT * FROM " + database + "." + ANCESTORS_TABLE;
  Dataset<Ancestor> ancestors = spark.sql(ancestorsQuery).as(ANCESTOR_ENCODER);

  // Only ancestors whose URI carries the hierarchy prefix contribute members.
  FilterFunction<Ancestor> isHierarchyAncestor =
      candidate -> candidate.getUri().startsWith(HIERARCHY_URI_PREFIX);

  Dataset<UrlAndVersion> members = ancestors
      .filter(isHierarchyAncestor)
      .select(col("uri").alias("url"), col("version"))
      .distinct()
      .as(URI_AND_VERSION_ENCODER);

  return new Hierarchies(spark, members, ancestors);
}
 
Example 2
/**
 * Drops the rows of variables that are configured as unused, plus every row whose variable
 * name starts with "_CORRELATION_ID_".
 *
 * <p>The names to drop are the variable configurations for which {@code isUseVariable()} is
 * false. Each configured name that does not occur in the dataset's variable name column is
 * reported with a warning; this does not affect the filtering itself.
 *
 * @param dataset the dataset to filter; the variable name is read from the column
 *                {@code BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME}
 * @param writeStepResultIntoFile if true, the filtered dataset is passed to
 *                {@code writeDatasetToCSV} under the name "variable_filter"
 * @param config the runner configuration used to look up the preprocessing configuration
 * @return the filtered dataset
 */
private Dataset<Row> doFilterVariables(Dataset<Row> dataset, boolean writeStepResultIntoFile, SparkRunnerConfig config) {
    List<String> variablesToFilter = new ArrayList<>();

    Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
    PreprocessingConfiguration preprocessingConfiguration =
            configuration != null ? configuration.getPreprocessingConfiguration() : null;
    if(preprocessingConfiguration != null) {
        for(VariableConfiguration vc : preprocessingConfiguration.getVariableConfiguration()) {
            if(!vc.isUseVariable()) {
                variablesToFilter.add(vc.getVariableName());
                BpmnaiLogger.getInstance().writeInfo("The variable '" + vc.getVariableName() + "' will be filtered out. Comment: " + vc.getComment());
            }
        }
    }

    // Collect the distinct variable names present in the data into a set so the existence
    // check below is a hash lookup instead of a linear scan per configured variable.
    java.util.Set<String> existingVariables = dataset
            .select(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME)
            .distinct()
            .collectAsList()
            .stream()
            .map(r -> r.getString(0))
            .collect(Collectors.toSet());

    // Iterate the list (not a set) so that the warnings keep the configuration order.
    for(String configuredName : variablesToFilter) {
        if(!existingVariables.contains(configuredName)) {
            // log the fact that a variable that should be filtered does not exist
            BpmnaiLogger.getInstance().writeWarn("The variable '" + configuredName + "' is configured to be filtered, but does not exist in the data.");
        }
    }

    // The filter runs once per row, so look names up in a hash set rather than scanning the list.
    java.util.Set<String> namesToDrop = new java.util.HashSet<>(variablesToFilter);

    Dataset<Row> filtered = dataset.filter((FilterFunction<Row>) row -> {
        String variable = row.getAs(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);

        // correlation id variables are always dropped, whether configured or not
        if(variable != null && variable.startsWith("_CORRELATION_ID_")) {
            return false;
        }

        // keep the row unless its variable name is one of the configured names to drop
        return !namesToDrop.contains(variable);
    });

    if(writeStepResultIntoFile) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(filtered, "variable_filter", config);
    }

    return filtered;
}
 
Example 3
Source Project: bunsen   Source File: AbstractConceptMaps.java    License: Apache License 2.0 3 votes vote down vote up
/**
 * Selects the mappings whose concept map URI and version match an entry of the given map.
 *
 * <p>A mapping is kept only if its concept map URI has a non-null version in
 * {@code uriToVersion} and that version equals the mapping's concept map version.
 *
 * @param uriToVersion a map of concept map URI to the version to load
 * @return a dataset of mappings for the given URIs and versions.
 */
public Dataset<Mapping> getMappings(Map<String,String> uriToVersion) {

  // Broadcast the lookup table; the filter reads it through the broadcast handle.
  Broadcast<Map<String,String>> versionsByUri =
      new JavaSparkContext(this.spark.sparkContext()).broadcast(uriToVersion);

  FilterFunction<Mapping> hasRequestedVersion = candidate -> {
    String requested = versionsByUri.getValue().get(candidate.getConceptMapUri());
    if (requested == null) {
      // No version available for this URI, so the mapping is dropped.
      return false;
    }
    return requested.equals(candidate.getConceptMapVersion());
  };

  return this.mappings.filter(hasRequestedVersion);
}
 
Example 4
Source Project: bunsen   Source File: AbstractValueSets.java    License: Apache License 2.0 3 votes vote down vote up
/**
 * Selects the values whose value set URI and version match an entry of the given map.
 *
 * <p>A value is kept only if its value set URI has a non-null version in
 * {@code uriToVersion} and that version equals the value's value set version.
 *
 * @param uriToVersion a map of value set URI to the version to load
 * @return a dataset of values for the given URIs and versions.
 */
public Dataset<Value> getValues(Map<String,String> uriToVersion) {

  // Broadcast the lookup table; the filter reads it through the broadcast handle.
  Broadcast<Map<String,String>> versionsByUri =
      new JavaSparkContext(this.spark.sparkContext()).broadcast(uriToVersion);

  FilterFunction<Value> hasRequestedVersion = candidate -> {
    String requested = versionsByUri.getValue().get(candidate.getValueSetUri());
    if (requested == null) {
      // No version available for this URI, so the value is dropped.
      return false;
    }
    return requested.equals(candidate.getValueSetVersion());
  };

  return this.values.filter(hasRequestedVersion);
}