Java Code Examples for org.apache.spark.sql.Dataset#drop()

The following examples show how to use org.apache.spark.sql.Dataset#drop(). You can go to the original project or source file noted above each example.
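
Before the project examples, here is a minimal, self-contained sketch (the class name, column names, and data are invented for illustration) of the three drop() call styles that recur below: a single column name, the String varargs overload, and a Scala Seq built from a Java list.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;

import scala.collection.JavaConverters;

public class DatasetDropSketch {

  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("dataset-drop-sketch")
        .master("local[*]")
        .getOrCreate();

    // build a small three-column dataset for illustration
    Dataset<Row> df = spark.range(3).toDF("id")
        .withColumn("name", functions.lit("alice"))
        .withColumn("tag", functions.lit("internal"));

    // drop a single column by name; names that do not exist are silently ignored
    Dataset<Row> withoutTag = df.drop("tag");

    // drop several columns at once via the String varargs overload
    Dataset<Row> idOnly = df.drop("name", "tag");

    // drop a dynamic list of columns by converting a Java list to a Scala Seq,
    // as several of the examples below do
    List<String> toDrop = Arrays.asList("name", "tag");
    Dataset<Row> alsoIdOnly = df.drop(
        JavaConverters.collectionAsScalaIterableConverter(toDrop).asScala().toSeq());

    withoutTag.printSchema(); // id, name
    idOnly.printSchema();     // id
    alsoIdOnly.printSchema(); // id

    spark.stop();
  }
}
 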
Example 1
Source File: ParseJSONDeriver.java    From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
  String parsedStructTemporaryFieldName = "__parsed_json";

  Dataset<Row> dependency = dependencies.get(stepName);

  Dataset<Row> parsed = dependency.select(
      functions.from_json(new Column(fieldName), schema, options).as(parsedStructTemporaryFieldName));

  if (asStruct) {
    return parsed.withColumnRenamed(parsedStructTemporaryFieldName, structFieldName);
  }
  else {
    for (StructField parsedField : schema.fields()) {
      parsed = parsed.withColumn(
          parsedField.name(), new Column(parsedStructTemporaryFieldName + "." + parsedField.name()));
    }

    return parsed.drop(parsedStructTemporaryFieldName);
  }
}
 
Example 2
Source File: CreateColumnsFromJsonStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
private Dataset<Row> doFilterJsonVariables(Dataset<Row> dataset, SparkRunnerConfig config) {
    //read all variables to filter again; they also contain variables that resulted from JSON parsing and are not columns, so they can simply be dropped
    List<String> variablesToFilter = new ArrayList<>();

    Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
    if(configuration != null) {
        PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration();
        if(preprocessingConfiguration != null) {
            for(VariableConfiguration vc : preprocessingConfiguration.getVariableConfiguration()) {
                if(!vc.isUseVariable()) {
                    variablesToFilter.add(vc.getVariableName());

                    if(Arrays.asList(dataset.columns()).contains(vc.getVariableName())) {
                        BpmnaiLogger.getInstance().writeInfo("The variable '" + vc.getVariableName() + "' will be filtered out after json processing. Comment: " + vc.getComment());
                    }
                }
            }
        }
    }

    dataset = dataset.drop(BpmnaiUtils.getInstance().asSeq(variablesToFilter));

    return dataset;
}
 
Example 3
Source File: SelectDeriver.java    From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  dependencyCheck(dependencies);
  Dataset<Row> sourceStep = dependencies.get(stepName);
  if (useIncludeFields){
      if (!Arrays.asList(sourceStep.columns()).containsAll(includeFields)){
          throw new RuntimeException("Columns specified in " + INCLUDE_FIELDS + " are not found in input dependency schema \n" +
          "Available columns: " + Arrays.toString(sourceStep.columns()));
      }
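      // Dataset.select(String, String...) takes the first column separately from the
      // remaining varargs, so pop the first include field off the list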
      String firstCol = includeFields.get(0);
      includeFields.remove(0);
      return sourceStep.select(firstCol, includeFields.toArray(new String[0]));
  } else {
      if (!Arrays.asList(sourceStep.columns()).containsAll(excludeFields)){
          throw new RuntimeException("Columns specified in " + EXCLUDE_FIELDS + " are not found in input dependency schema \n" +
          "Available columns: " + Arrays.toString(sourceStep.columns()));
      }
      return sourceStep.drop(JavaConverters.collectionAsScalaIterableConverter(excludeFields).asScala().toSeq());
  }
}
 
Example 4
Source File: PdbjMineDataset.java    From mmtf-spark with Apache License 2.0
/**
 * Fetches data using the PDBj Mine 2 SQL service
 * 
 * @param sqlQuery
 *            query in SQL format
 * @return dataset of query results
 * @throws IOException
 */
public static Dataset<Row> getDataset(String sqlQuery) throws IOException {
	String encodedSQL = URLEncoder.encode(sqlQuery, "UTF-8");

	URL u = new URL(SERVICELOCATION + "?format=csv&q=" + encodedSQL);
	InputStream in = u.openStream();

	// save as a temporary CSV file
	Path tempFile = Files.createTempFile(null, ".csv");
	Files.copy(in, tempFile, StandardCopyOption.REPLACE_EXISTING);
	in.close();

	SparkSession spark = SparkSession.builder().getOrCreate();

	// load temporary CSV file into Spark dataset
	Dataset<Row> ds = spark.read().format("csv").option("header", "true").option("inferSchema", "true")
			// .option("parserLib", "UNIVOCITY")
			.load(tempFile.toString());

	// rename/concatenate columns to assign
	// consistent primary keys to datasets
	List<String> columns = Arrays.asList(ds.columns());

	if (columns.contains("pdbid")) {
		// this project uses upper case pdbids
		ds = ds.withColumn("pdbid", upper(col("pdbid")));

		if (columns.contains("chain")) {
			ds = ds.withColumn("structureChainId", concat(col("pdbid"), lit("."), col("chain")));
			ds = ds.drop("pdbid", "chain");
		} else {
			ds = ds.withColumnRenamed("pdbid", "structureId");
		}
	}

	return ds;
}
 
Example 5
Source File: AggregateActivityInstancesStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {

    //apply first and processState aggregator
    Map<String, String> aggregationMap = new HashMap<>();
    for(String column : dataset.columns()) {
        if(column.equals(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID)) {
            continue;
        } else if(column.equals(BpmnaiVariables.VAR_DURATION) || column.endsWith("_rev")) {
            aggregationMap.put(column, "max");
        } else if(column.equals(BpmnaiVariables.VAR_STATE)) {
            aggregationMap.put(column, "ProcessState");
        } else if(column.equals(BpmnaiVariables.VAR_ACT_INST_ID)) {
            //ignore it, as we aggregate by it
            continue;
        } else {
            aggregationMap.put(column, "AllButEmptyString");
        }
    }

    //first aggregation
    //activity level, take only processInstance and activityInstance rows
    dataset = dataset
            .filter(dataset.col(BpmnaiVariables.VAR_DATA_SOURCE).notEqual(BpmnaiVariables.EVENT_PROCESS_INSTANCE))
            .groupBy(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID, BpmnaiVariables.VAR_ACT_INST_ID)
            .agg(aggregationMap);

    //rename back columns after aggregation
    String pattern = "(max|allbutemptystring|processstate)\\((.+)\\)";
    Pattern r = Pattern.compile(pattern);

    for(String columnName : dataset.columns()) {
        Matcher m = r.matcher(columnName);
        if(m.find()) {
            String newColumnName = m.group(2);
            dataset = dataset.withColumnRenamed(columnName, newColumnName);
        }
    }


    //in case the CSV was added, the first dataset of the join has a name column, so we call drop again to make sure it is gone
    dataset = dataset.drop(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);
    dataset = dataset.drop(BpmnaiVariables.VAR_DATA_SOURCE);

    dataset = dataset.sort(BpmnaiVariables.VAR_START_TIME);

    dataset.cache();
    BpmnaiLogger.getInstance().writeInfo("Found " + dataset.count() + " activity instances.");

    if(config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "agg_of_activity_instances", config);
    }

    //return preprocessed data
    return dataset;
}
 
Example 6
Source File: ColumnRemoveStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataSet, Map<String, Object> parameters, SparkRunnerConfig config) {

    //these columns have to stay in, in order to do the processing
    List<String> columnsToKeep = new ArrayList<>();
    columnsToKeep.add(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID);
    columnsToKeep.add(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);
    columnsToKeep.add(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE);
    columnsToKeep.add(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_REVISION);
    columnsToKeep.add(BpmnaiVariables.VAR_STATE);
    columnsToKeep.add(BpmnaiVariables.VAR_LONG);
    columnsToKeep.add(BpmnaiVariables.VAR_DOUBLE);
    columnsToKeep.add(BpmnaiVariables.VAR_TEXT);
    columnsToKeep.add(BpmnaiVariables.VAR_TEXT2);
    columnsToKeep.add(BpmnaiVariables.VAR_DATA_SOURCE);

    List<String> columnsToRemove = new ArrayList<>();

    Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
    if(configuration != null) {
        PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration();
        if(preprocessingConfiguration != null) {
            for(ColumnConfiguration cc : preprocessingConfiguration.getColumnConfiguration()) {
                if(!cc.isUseColumn()) {
                    if(columnsToKeep.contains(cc.getColumnName())) {
                        BpmnaiLogger.getInstance().writeWarn("The column '" + cc.getColumnName() + "' has to stay in in order to do the processing. It will not be removed. Comment: " + cc.getComment());
                    } else {
                        columnsToRemove.add(cc.getColumnName());
                        BpmnaiLogger.getInstance().writeInfo("The column '" + cc.getColumnName() + "' will be removed. Comment: " + cc.getComment());
                    }
                }
            }
        }
    }

    //check if all variables that should be filtered actually exist, otherwise log a warning
    List<String> existingColumns = new ArrayList<>(Arrays.asList(dataSet.columns()));

    columnsToRemove
            .stream()
            .forEach(new Consumer<String>() {
                @Override
                public void accept(String s) {
                    if(!existingColumns.contains(s)) {
                        // log the fact that a variable that should be filtered does not exist
                        BpmnaiLogger.getInstance().writeWarn("The column '" + s + "' is configured to be filtered, but does not exist in the data.");
                    }
                }
            });

    dataSet = dataSet.drop(BpmnaiUtils.getInstance().asSeq(columnsToRemove));

    return dataSet;
}
 
Example 7
Source File: AddVariableColumnsStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
private Dataset<Row> doAddVariableColumns(Dataset<Row> dataset, boolean writeStepResultIntoFile, String dataLevel, SparkRunnerConfig config) {
    Map<String, String> varMap = (Map<String, String>) SparkBroadcastHelper.getInstance().getBroadcastVariable(SparkBroadcastHelper.BROADCAST_VARIABLE.PROCESS_VARIABLES_ESCALATED);
    Set<String> variables = varMap.keySet();

    for(String v : variables) {
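        // for each variable, build a column that takes its value from the type-specific
        // source column (text, long, double, ...) on rows where the variable name matches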
        dataset = dataset.withColumn(v, when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME).equalTo(v),
                when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("string"), dataset.col(BpmnaiVariables.VAR_TEXT))
                        .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("null"), dataset.col(BpmnaiVariables.VAR_TEXT))
                        .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("boolean"), dataset.col(BpmnaiVariables.VAR_LONG))
                        .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("integer"), dataset.col(BpmnaiVariables.VAR_LONG))
                        .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("long"), dataset.col(BpmnaiVariables.VAR_LONG))
                        .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("double"), dataset.col(BpmnaiVariables.VAR_DOUBLE))
                        .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("date"), dataset.col(BpmnaiVariables.VAR_LONG))
                        .otherwise(dataset.col(BpmnaiVariables.VAR_TEXT2)))
                .otherwise(null));

        //rev count is only relevant on process level
        if(dataLevel.equals(BpmnaiVariables.DATA_LEVEL_PROCESS) && config.isRevCountEnabled()) {
            dataset = dataset.withColumn(v+"_rev",
                    when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME).equalTo(v), dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_REVISION))
                            .otherwise("0"));
        }
    }

    //drop unnecessary columns
    dataset = dataset.drop(
            BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE,
            BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_REVISION,
            BpmnaiVariables.VAR_DOUBLE,
            BpmnaiVariables.VAR_LONG,
            BpmnaiVariables.VAR_TEXT,
            BpmnaiVariables.VAR_TEXT2);

    if(!config.isDevProcessStateColumnWorkaroundEnabled()) {
        dataset = dataset.drop(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);
    }

    if(writeStepResultIntoFile) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "add_var_columns", config);
    }

    //return preprocessed data
    return dataset;
}
 
Example 8
Source File: DataFilterOnActivityStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
/**
 * @param dataSet the incoming dataset for this processing step
 * @param parameters the step parameters; a "query" entry is expected to contain the activity id to filter on
 * @return the filtered DataSet
 */
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataSet, Map<String, Object> parameters, SparkRunnerConfig config) {
    // any parameters set?
    if (parameters == null || parameters.size() == 0) {
        BpmnaiLogger.getInstance().writeWarn("No parameters found for the DataFilterOnActivityStep");
        return dataSet;
    }

    // get query parameter
    String query = (String) parameters.get("query");
    BpmnaiLogger.getInstance().writeInfo("Filtering data with activity instance filter query: " + query + ".");

    // save size of initial dataset for log
    dataSet.cache();
    Long initialDSCount = dataSet.count();

    // repartition by process instance and order by start_time for this operation
    dataSet = dataSet.repartition(dataSet.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID)).sortWithinPartitions(BpmnaiVariables.VAR_START_TIME);

    // we temporarily store variable updates (rows with a var type set) separately.
    Dataset<Row> variables = dataSet.filter(col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).isNotNull());
    //find first occurrence of activity instance
    final Dataset<Row> dsTmp = dataSet.filter(dataSet.col(BpmnaiVariables.VAR_ACT_ID).equalTo(query)).filter(dataSet.col(BpmnaiVariables.VAR_END_TIME).isNull()); //TODO: ENSURING THAT THIS ISN'T A VARIABLE ROW

    // now we look for the first occurrence of the activity id contained in "query". The result is a dataset of the corresponding activity instances.
    final Dataset<Row> dsActivityInstances = dataSet.filter(dataSet.col(BpmnaiVariables.VAR_ACT_ID).like(query)).filter(dataSet.col(BpmnaiVariables.VAR_END_TIME).isNull()); //TODO: ENSURING THAT THIS ISN'T A VARIABLE ROW

    // we slim the resulting dataset down: only the activity instance's process id and its start time are relevant.
    List<Row> activityRows = dsActivityInstances.select(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID, BpmnaiVariables.VAR_START_TIME).collectAsList();
    Map<String, String> activities = activityRows.stream().collect(Collectors.toMap(
            r -> r.getAs(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID), r -> r.getAs(BpmnaiVariables.VAR_START_TIME)));
    // broadcasting the PID - Start time Map to use it in a user defined function
    SparkBroadcastHelper.getInstance().broadcastVariable(SparkBroadcastHelper.BROADCAST_VARIABLE.PROCESS_INSTANCE_TIMESTAMP_MAP, activities);

    // now we have to select, for each process instance in our initial dataset, all events that happened before the first occurrence of our selected activity.
    // We first narrow it down to the process instances in question
    Dataset<Row> selectedProcesses = dataSet.filter(col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID).isin(activities.keySet().toArray()));
    // Then, we mark all events that should be removed
    Dataset<Row> activityDataSet = selectedProcesses.withColumn("data_filter_on_activity",
            callUDF("activityBeforeTimestamp",
                    selectedProcesses.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID),
                    selectedProcesses.col(BpmnaiVariables.VAR_START_TIME)));
    // And we keep the rest
    activityDataSet = activityDataSet.filter(col("data_filter_on_activity").like("TRUE"));
    // Clean up
    activityDataSet = activityDataSet.drop("data_filter_on_activity");

    // However, we lost all variable updates in this approach, so now we add the variables in question to the dataset
    // first, we narrow it down to keep only variables that have a corresponding activity instance
    activityDataSet = activityDataSet.withColumnRenamed(BpmnaiVariables.VAR_ACT_INST_ID, BpmnaiVariables.VAR_ACT_INST_ID+"_RIGHT");

    variables = variables.join(activityDataSet.select(BpmnaiVariables.VAR_ACT_INST_ID+"_RIGHT").distinct(), variables.col(BpmnaiVariables.VAR_ACT_INST_ID).equalTo(activityDataSet.col(BpmnaiVariables.VAR_ACT_INST_ID+"_RIGHT")),"inner");

    activityDataSet = activityDataSet.withColumnRenamed(BpmnaiVariables.VAR_ACT_INST_ID+"_RIGHT", BpmnaiVariables.VAR_ACT_INST_ID);
    variables = variables.drop(BpmnaiVariables.VAR_ACT_INST_ID+"_RIGHT");
    dataSet = activityDataSet.union(variables);

    dataSet.cache();
    BpmnaiLogger.getInstance().writeInfo("DataFilterOnActivityStep: The filtered DataSet contains "+dataSet.count()+" rows, (before: "+ initialDSCount+" rows)");

    if (config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataSet, "data_filter_on_activity_step", config);
    }

    return dataSet;


}
 
Example 9
Source File: PdbToUniProt.java    From mmtf-spark with Apache License 2.0
/**
 * Returns an up-to-date dataset of PDB to UniProt 
 * residue-level mappings for a list of ids.
 * Valid ids are either pdbIds (e.g., 1XYZ) or pdbId.chainIds (e.g., 1XYZ.A).
 * This method reads a cached file and downloads updates.
 * 
 * @param ids list of pdbIds or pdbId.chainIds
 * @return dataset of PDB to UniProt residue-level mappings
 * @throws IOException
 */
public static Dataset<Row> getResidueMappings(List<String> ids) throws IOException {
    SparkSession spark = SparkSession.builder().getOrCreate();
    
    boolean withChainId = ids.size() > 0 && ids.get(0).length() > 4;
    
    // create dataset of ids
    Dataset<Row> df = spark.createDataset(ids, Encoders.STRING()).toDF("id");
    // get cached mappings
    Dataset<Row> mapping = getCachedResidueMappings();  
    
    // dataset for non-cached mappings
    Dataset<Row> notCached = null;
    // dataset with PDB Ids to be downloaded
    Dataset<Row> toDownload = null; 
    
    if (withChainId) {
        // get subset of requested ids from cached dataset
        mapping = mapping.join(df, mapping.col("structureChainId").equalTo(df.col("id"))).drop("id");
        // get ids that are not in the cached dataset
        notCached = df.join(mapping, df.col("id").equalTo(mapping.col("structureChainId")), "left_anti").cache(); 
        // create dataset of PDB Ids to be downloaded
        toDownload = notCached.withColumn("id", col("id").substr(0, 4)).distinct().cache();
    } else {
        // get subset of requested ids from cached dataset
        mapping = mapping.withColumn("pdbId", col("structureChainId").substr(0, 4));
        mapping = mapping.join(df, mapping.col("pdbId").equalTo(df.col("id"))).drop("id");
        // create dataset of PDB Ids to be downloaded
        toDownload = df.join(mapping, df.col("id").equalTo(mapping.col("pdbId")), "left_anti").distinct().cache();
        mapping = mapping.drop("pdbId");
    }
    
    toDownload = toDownload.distinct().cache();
        
    // download data that are not in the cache
    if (toDownload.count() > 0) {
        Dataset<Row> unpData = getChainMappings().select("structureId").distinct();
        toDownload = toDownload.join(unpData, toDownload.col("id").equalTo(unpData.col("structureId"))).drop("structureId").cache();
        System.out.println("Downloading mapping for " + toDownload.count() + " PDB structures.");
        Dataset<Row> downloadedData = downloadData(toDownload);
  
        // since data are downloaded for all chains in structure, make sure to only include the requested chains.
        if (withChainId) {
            downloadedData = downloadedData.join(notCached, downloadedData.col("structureChainId").equalTo(notCached.col("id"))).drop("id");
        }
        mapping = mapping.union(downloadedData);
    }
    
    return mapping;
}
 
Example 10
Source File: HoodieIncrSource.java    From hudi with Apache License 2.0
@Override
public Pair<Option<Dataset<Row>>, String> fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) {

  DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.HOODIE_SRC_BASE_PATH));

  /*
   * DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH,
   * Config.HOODIE_SRC_PARTITION_FIELDS)); List<String> partitionFields =
   * props.getStringList(Config.HOODIE_SRC_PARTITION_FIELDS, ",", new ArrayList<>()); PartitionValueExtractor
   * extractor = DataSourceUtils.createPartitionExtractor(props.getString( Config.HOODIE_SRC_PARTITION_EXTRACTORCLASS,
   * Config.DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS));
   */
  String srcPath = props.getString(Config.HOODIE_SRC_BASE_PATH);
  int numInstantsPerFetch = props.getInteger(Config.NUM_INSTANTS_PER_FETCH, Config.DEFAULT_NUM_INSTANTS_PER_FETCH);
  boolean readLatestOnMissingCkpt = props.getBoolean(Config.READ_LATEST_INSTANT_ON_MISSING_CKPT,
      Config.DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT);

  // Use begin Instant if set and non-empty
  Option<String> beginInstant =
      lastCkptStr.isPresent() ? lastCkptStr.get().isEmpty() ? Option.empty() : lastCkptStr : Option.empty();

  Pair<String, String> instantEndpts = IncrSourceHelper.calculateBeginAndEndInstants(sparkContext, srcPath,
      numInstantsPerFetch, beginInstant, readLatestOnMissingCkpt);

  if (instantEndpts.getKey().equals(instantEndpts.getValue())) {
    LOG.warn("Already caught up. Begin Checkpoint was :" + instantEndpts.getKey());
    return Pair.of(Option.empty(), instantEndpts.getKey());
  }

  // Do Incr pull. Set end instant if available
  DataFrameReader reader = sparkSession.read().format("org.apache.hudi")
      .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
      .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), instantEndpts.getLeft())
      .option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY(), instantEndpts.getRight());

  Dataset<Row> source = reader.load(srcPath);

  /*
   * log.info("Partition Fields are : (" + partitionFields + "). Initial Source Schema :" + source.schema());
   * 
   * StructType newSchema = new StructType(source.schema().fields()); for (String field : partitionFields) { newSchema
   * = newSchema.add(field, DataTypes.StringType, true); }
   * 
   * /** Validates if the commit time is sane and also generates Partition fields from _hoodie_partition_path if
   * configured
   *
   * Dataset<Row> validated = source.map((MapFunction<Row, Row>) (Row row) -> { // _hoodie_instant_time String
   * instantTime = row.getString(0); IncrSourceHelper.validateInstantTime(row, instantTime, instantEndpts.getKey(),
   * instantEndpts.getValue()); if (!partitionFields.isEmpty()) { // _hoodie_partition_path String hoodiePartitionPath
   * = row.getString(3); List<Object> partitionVals =
   * extractor.extractPartitionValuesInPath(hoodiePartitionPath).stream() .map(o -> (Object)
   * o).collect(Collectors.toList()); ValidationUtils.checkArgument(partitionVals.size() == partitionFields.size(),
   * "#partition-fields != #partition-values-extracted"); List<Object> rowObjs = new
   * ArrayList<>(scala.collection.JavaConversions.seqAsJavaList(row.toSeq())); rowObjs.addAll(partitionVals); return
   * RowFactory.create(rowObjs.toArray()); } return row; }, RowEncoder.apply(newSchema));
   * 
   * log.info("Validated Source Schema :" + validated.schema());
   */

  // Remove Hoodie meta columns except partition path from input source
  final Dataset<Row> src = source.drop(HoodieRecord.HOODIE_META_COLUMNS.stream()
      .filter(x -> !x.equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).toArray(String[]::new));
  // log.info("Final Schema from Source is :" + src.schema());
  return Pair.of(Option.of(src), instantEndpts.getRight());
}
 
Example 11
Source File: KuduOutput.java    From envelope with Apache License 2.0
@Override
public void applyBulkMutations(List<Tuple2<MutationType, Dataset<Row>>> planned) {
  KuduContext kc = new KuduContext(
      config.getString(CONNECTION_CONFIG_NAME), Contexts.getSparkSession().sparkContext());

  String tableName = config.getString(TABLE_CONFIG_NAME);

  Set<String> kuduColumns = null;
  if (KuduUtils.ignoreMissingColumns(config)) {
      try {
        KuduTable table = getConnection().getTable(tableName);
        kuduColumns = Sets.newHashSetWithExpectedSize(table.getSchema().getColumns().size());
        for (int i = 0; i < table.getSchema().getColumns().size(); i++) {
          ColumnSchema columnSchema = table.getSchema().getColumns().get(i);
          kuduColumns.add(columnSchema.getName());
        }
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
  }

  for (Tuple2<MutationType, Dataset<Row>> plan : planned) {
    MutationType mutationType = plan._1();
    Dataset<Row> mutation = plan._2();

    if (KuduUtils.ignoreMissingColumns(config) && kuduColumns != null) {
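      // drop any mutation columns that are not present in the Kudu table (ignore-missing-columns behavior)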
      Set<String> mutationFields = Sets.newHashSet(mutation.schema().fieldNames());
      for (String col : Sets.difference(mutationFields, kuduColumns)) {
        mutation = mutation.drop(col);
      }
    }

    KuduWriteOptions kuduWriteOptions = new KuduWriteOptions(
        KuduUtils.doesInsertIgnoreDuplicates(config),
        false
    );

    switch (mutationType) {
      case DELETE:
        kc.deleteRows(mutation, tableName, kuduWriteOptions);
        break;
      case INSERT:
        kc.insertRows(mutation, tableName, kuduWriteOptions);
        break;
      case UPDATE:
        kc.updateRows(mutation, tableName, kuduWriteOptions);
        break;
      case UPSERT:
        kc.upsertRows(mutation, tableName, kuduWriteOptions);
        break;
      default:
        throw new RuntimeException("Kudu bulk output does not support mutation type: " + mutationType);
    }
  }
}
 
Example 12
Source File: MLResults.java    From systemds with Apache License 2.0
/**
 * Obtain an output as a {@code DataFrame} of vectors with no ID column.
 * <p>
 * The following matrix in DML:
 * </p>
 * <code>M = full('1 2 3 4', rows=2, cols=2);
 * </code>
 * <p>
 * is equivalent to the following {@code DataFrame} of vectors:
 * </p>
 * <code>[[1.0,2.0]]
 * <br>[[3.0,4.0]]
 * </code>
 *
 * @param outputName
 *            the name of the output
 * @return the output as a {@code DataFrame} of vectors with no ID column
 */
public Dataset<Row> getDataFrameVectorNoIDColumn(String outputName) {
	if (isFrameObject(outputName)) {
		throw new MLContextException("This method currently supports only matrices");
	}
	MatrixObject mo = getMatrixObject(outputName);
	Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext, true);
	return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}
 
Example 13
Source File: MLResults.java    From systemds with Apache License 2.0
/**
 * Obtain an output as a {@code DataFrame} of doubles with no ID column.
 * <p>
 * The following matrix in DML:
 * </p>
 * <code>M = full('1 2 3 4', rows=2, cols=2);
 * </code>
 * <p>
 * is equivalent to the following {@code DataFrame} of doubles:
 * </p>
 * <code>[1.0,2.0]
 * <br>[3.0,4.0]
 * </code>
 *
 * @param outputName
 *            the name of the output
 * @return the output as a {@code DataFrame} of doubles with no ID column
 */
public Dataset<Row> getDataFrameDoubleNoIDColumn(String outputName) {
	if (isFrameObject(outputName)) {
		throw new MLContextException("This method currently supports only matrices");
	}
	MatrixObject mo = getMatrixObject(outputName);
	Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext, false);
	return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}
 
Example 14
Source File: Matrix.java    From systemds with Apache License 2.0
/**
 * Obtain the matrix as a {@code DataFrame} of doubles with no ID column
 *
 * @return the matrix as a {@code DataFrame} of doubles with no ID column
 */
public Dataset<Row> toDFDoubleNoIDColumn() {
	Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(matrixObject, sparkExecutionContext, false);
	return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}
 
Example 15
Source File: Matrix.java    From systemds with Apache License 2.0
/**
 * Obtain the matrix as a {@code DataFrame} of vectors with no ID column
 *
 * @return the matrix as a {@code DataFrame} of vectors with no ID column
 */
public Dataset<Row> toDFVectorNoIDColumn() {
	Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(matrixObject, sparkExecutionContext, true);
	return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}
 