Java Code Examples for org.apache.spark.sql.Column

The following examples show how to use org.apache.spark.sql.Column. They are extracted from open source projects; the originating project, source file, and license are noted above each example where available.
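As a primer before the project examples, here is a minimal, self-contained sketch (not drawn from any project below) showing the two common ways to build a Column and how Column expressions compose:

import static org.apache.spark.sql.functions.col;

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ColumnBasics {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("column-basics").master("local[*]").getOrCreate();

        Dataset<Row> df = spark.range(10).toDF("id");

        // Both forms create an unresolved column reference that Spark binds
        // against the Dataset during analysis.
        Column byCtor = new Column("id");
        Column byFactory = col("id");

        // Columns compose into expressions lazily, without touching any data.
        df.select(byCtor, byFactory.multiply(2).as("doubled")).show();

        spark.stop();
    }
}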
Example 1
Source Project: iceberg   Source File: RewriteManifestsAction.java    License: Apache License 2.0
private Dataset<Row> buildManifestEntryDF(List<ManifestFile> manifests) {
  Dataset<Row> manifestDF = spark
      .createDataset(Lists.transform(manifests, ManifestFile::path), Encoders.STRING())
      .toDF("manifest");

  String entriesMetadataTable = metadataTableName(MetadataTableType.ENTRIES);
  Dataset<Row> manifestEntryDF = spark.read().format("iceberg")
      .load(entriesMetadataTable)
      .filter("status < 2") // select only live entries
      .selectExpr("input_file_name() as manifest", "snapshot_id", "sequence_number", "data_file");

  Column joinCond = manifestDF.col("manifest").equalTo(manifestEntryDF.col("manifest"));
  return manifestEntryDF
      .join(manifestDF, joinCond, "left_semi")
      .select("snapshot_id", "sequence_number", "data_file");
}
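The "left_semi" join type used here keeps only the rows of the left Dataset that satisfy the join condition, without pulling in any columns from the right side; it is effectively a filter driven by another Dataset. A minimal standalone sketch of the same pattern (hypothetical data, not from Iceberg):

import java.util.Arrays;
import org.apache.spark.sql.*;

public class SemiJoinSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();

        Dataset<Row> entries = spark.createDataset(
                Arrays.asList("m1", "m2", "m3"), Encoders.STRING()).toDF("manifest");
        Dataset<Row> wanted = spark.createDataset(
                Arrays.asList("m2", "m3"), Encoders.STRING()).toDF("manifest");

        Column joinCond = entries.col("manifest").equalTo(wanted.col("manifest"));

        // Keeps only entries whose manifest appears in wanted; the schema of
        // the result is the schema of the left side alone.
        entries.join(wanted, joinCond, "left_semi").show(); // m2, m3

        spark.stop();
    }
}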
 
Example 2
public Dataset<Row> run(Dataset<Row> dataset) {
    // use only the configured prediction variables for the pipeline
    Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(this.sparkRunnerConfig);
    List<String> predictionVars = configuration.getModelPredictionConfiguration().getPredictionVariables();
    List<Column> usedColumns = new ArrayList<>();
    for (String var : predictionVars) {
        usedColumns.add(new Column(var));
    }
    dataset = dataset.select(BpmnaiUtils.getInstance().asSeq(usedColumns));

    // define the processing steps to run
    final PreprocessingRunner preprocessingRunner = new PreprocessingRunner();
    for (PipelineStep ps : pipelineManager.getOrderedPipeline()) {
        preprocessingRunner.addPreprocessorStep(ps);
    }

    // run the processing pipeline
    Dataset<Row> resultDataset = preprocessingRunner.run(dataset, this.sparkRunnerConfig);

    writeConfig();

    return resultDataset;
}
 
Example 3
Source Project: AWS-MIMIC-IIItoOMOP   Source File: Loader.java    License: Apache License 2.0
public void write(String destination, List<String> overflowColumns) throws IOException 
{
    String prefix = configuration.getFileSystem() + "://" + configuration.getDestinationBucket() + "/stage";
    String suffix = destination + ".csv"; 
    
    List<Column> columns = new ArrayList<Column>();
    List<String> strings = new ArrayList<String>();
    
    addMetadata(destination);
    materializeUUID(prefix, suffix);
    
    strings.add("file_location");
    columns.add(col("overlflow_column_uuid_lookup"));
    columns.add(col("file_location"));
    
    for (String column : overflowColumns) {
        columns.add(col(column));
        strings.add(column);
    }

    df.select(JavaConverters.asScalaBufferConverter(columns).asScala())
            .write().format("com.databricks.spark.csv")
            .option("header", "true")
            .option("codec", "org.apache.hadoop.io.compress.GzipCodec")
            .mode("overwrite")
            .save(prefix + "/column_overflow/" + suffix);

    df.drop(JavaConverters.asScalaBufferConverter(strings).asScala())
            .write().format("com.databricks.spark.csv")
            .option("header", "true")
            .option("codec", "org.apache.hadoop.io.compress.GzipCodec")
            .mode("overwrite")
            .save(prefix + "/" + suffix);

    //clean up temp file
    new File(prefix + "/temp/" + suffix).delete();
}
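Dataset.select and Dataset.drop are overloaded to take a Scala Seq of columns (or names), which is why the code above routes its Java lists through JavaConverters. A minimal sketch of that conversion on its own (hypothetical data):

import static org.apache.spark.sql.functions.col;

import java.util.Arrays;
import java.util.List;
import org.apache.spark.sql.*;
import scala.collection.JavaConverters;

public class SeqConversionSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();

        Dataset<Row> df = spark.range(5).toDF("id")
                .withColumn("twice", col("id").multiply(2));

        List<Column> cols = Arrays.asList(col("id"), col("twice"));

        // The varargs select(Column...) exists too, but a dynamically built
        // java.util.List must be converted to a Scala Seq first.
        df.select(JavaConverters.asScalaBufferConverter(cols).asScala()).show();

        spark.stop();
    }
}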
 
Example 4
Source Project: DataVec   Source File: Normalization.java    License: Apache License 2.0
/**
 * Scale each column to the range [min, max].
 *
 * @param dataFrame   the dataframe to scale
 * @param min         the minimum value of the target range
 * @param max         the maximum value of the target range
 * @param skipColumns the columns to leave unscaled
 * @return the dataframe, normalized column by column
 */
public static DataRowsFacade normalize(DataRowsFacade dataFrame, double min, double max, List<String> skipColumns) {
    List<String> columnsList = DataFrames.toList(dataFrame.get().columns());
    columnsList.removeAll(skipColumns);
    String[] columnNames = DataFrames.toArray(columnsList);
    // row 0 holds the per-column minima, row 1 the per-column maxima
    List<Row> minMax = minMaxColumns(dataFrame, columnNames);
    for (int i = 0; i < columnNames.length; i++) {
        String columnName = columnNames[i];
        double dMin = ((Number) minMax.get(0).get(i)).doubleValue();
        double dMax = ((Number) minMax.get(1).get(i)).doubleValue();
        double maxSubMin = (dMax - dMin);
        if (maxSubMin == 0)
            maxSubMin = 1;

        Column newCol = dataFrame.get().col(columnName).minus(dMin).divide(maxSubMin).multiply(max - min).plus(min);
        dataFrame = dataRows(dataFrame.get().withColumn(columnName, newCol));
    }


    return dataFrame;
}
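The Column arithmetic above implements ordinary min-max scaling, x' = (x - dMin) / (dMax - dMin) * (max - min) + min, entirely as a lazy expression. A standalone sketch of the same formula on hypothetical data:

import static org.apache.spark.sql.functions.col;

import java.util.Arrays;
import org.apache.spark.sql.*;

public class MinMaxSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();

        Dataset<Row> df = spark.createDataset(
                Arrays.asList(10.0, 20.0, 30.0), Encoders.DOUBLE()).toDF("x");

        double dMin = 10.0, dMax = 30.0; // normally computed from the data
        double min = 0.0, max = 1.0;     // the target range

        Column scaled = col("x").minus(dMin).divide(dMax - dMin)
                .multiply(max - min).plus(min);

        df.withColumn("x", scaled).show(); // 0.0, 0.5, 1.0

        spark.stop();
    }
}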
 
Example 5
@Override
public Column getColumnExpression(Dataset<Row> leftDF,
                                  Dataset<Row> rightDF,
                                  Function<String, DataType> convertStringToDataTypeFunction) throws UnsupportedOperationException {
    Column leftExpr  = getLeftChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);
    Column rightExpr = getRightChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);

    if (opKind == PLUS)
        return leftExpr.plus(rightExpr);
    else if (opKind == MINUS)
        return leftExpr.minus(rightExpr);
    else if (opKind == TIMES)
        return leftExpr.multiply(rightExpr);
    else if (opKind == DIVIDE)
        return leftExpr.divide(rightExpr);
    else
        throw new UnsupportedOperationException();
}
 
Example 6
Source Project: hudi   Source File: HoodieSnapshotExporter.java    License: Apache License 2.0
private void exportAsNonHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) {
  Partitioner defaultPartitioner = dataset -> {
    Dataset<Row> hoodieDroppedDataset = dataset.drop(JavaConversions.asScalaIterator(HoodieRecord.HOODIE_META_COLUMNS.iterator()).toSeq());
    return StringUtils.isNullOrEmpty(cfg.outputPartitionField)
        ? hoodieDroppedDataset.write()
        : hoodieDroppedDataset.repartition(new Column(cfg.outputPartitionField)).write().partitionBy(cfg.outputPartitionField);
  };

  Partitioner partitioner = StringUtils.isNullOrEmpty(cfg.outputPartitioner)
      ? defaultPartitioner
      : ReflectionUtils.loadClass(cfg.outputPartitioner);

  final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
  Iterator<String> exportingFilePaths = jsc
      .parallelize(partitions, partitions.size())
      .flatMap(partition -> fsView
          .getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp)
          .map(HoodieBaseFile::getPath).iterator())
      .toLocalIterator();

  Dataset<Row> sourceDataset = new SQLContext(jsc).read().parquet(JavaConversions.asScalaIterator(exportingFilePaths).toSeq());
  partitioner.partition(sourceDataset)
      .format(cfg.outputFormat)
      .mode(SaveMode.Overwrite)
      .save(cfg.targetOutputPath);
}
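Repartitioning on the same Column that is later passed to partitionBy, as the default partitioner above does, clusters each partition value into a single task and so keeps the number of output files per value down. A minimal sketch of the pattern (hypothetical column and output path):

import org.apache.spark.sql.*;

public class PartitionedWriteSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();

        Dataset<Row> df = spark.range(100).toDF("id")
                .withColumn("bucket", new Column("id").mod(4));

        // Rows with the same bucket value land in the same task, so each
        // bucket=N directory is written by (roughly) one writer.
        df.repartition(new Column("bucket"))
                .write()
                .mode(SaveMode.Overwrite)
                .partitionBy("bucket")
                .parquet("/tmp/partitioned-write-sketch"); // hypothetical path

        spark.stop();
    }
}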
 
Example 7
Source Project: envelope   Source File: ParseJSONDeriver.java    License: Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
  String parsedStructTemporaryFieldName = "__parsed_json";

  Dataset<Row> dependency = dependencies.get(stepName);

  Dataset<Row> parsed = dependency.select(
      functions.from_json(new Column(fieldName), schema, options).as(parsedStructTemporaryFieldName));

  if (asStruct) {
    return parsed.withColumnRenamed(parsedStructTemporaryFieldName, structFieldName);
  }
  else {
    for (StructField parsedField : schema.fields()) {
      parsed = parsed.withColumn(
          parsedField.name(), new Column(parsedStructTemporaryFieldName + "." + parsedField.name()));
    }

    return parsed.drop(parsedStructTemporaryFieldName);
  }
}
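functions.from_json parses a string Column into a struct Column; once the struct exists, its fields can be addressed with dotted names, which is how the loop above flattens it. A minimal standalone sketch (hypothetical schema and data):

import static org.apache.spark.sql.functions.*;

import java.util.Arrays;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class FromJsonSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();

        Dataset<Row> df = spark.createDataset(
                Arrays.asList("{\"name\":\"a\",\"n\":1}"), Encoders.STRING()).toDF("json");

        StructType schema = new StructType()
                .add("name", DataTypes.StringType)
                .add("n", DataTypes.IntegerType);

        // Parse into a temporary struct column, then flatten with dotted names.
        Dataset<Row> parsed = df.select(from_json(col("json"), schema).as("parsed"));
        parsed.select(new Column("parsed.name"), new Column("parsed.n")).show();

        spark.stop();
    }
}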
 
Example 8
Source Project: deeplearning4j   Source File: Normalization.java    License: Apache License 2.0
/**
 * Scale each column to the range [min, max].
 *
 * @param dataFrame   the dataframe to scale
 * @param min         the minimum value of the target range
 * @param max         the maximum value of the target range
 * @param skipColumns the columns to leave unscaled
 * @return the dataframe, normalized column by column
 */
public static Dataset<Row> normalize(Dataset<Row> dataFrame, double min, double max, List<String> skipColumns) {
    List<String> columnsList = DataFrames.toList(dataFrame.columns());
    columnsList.removeAll(skipColumns);
    String[] columnNames = DataFrames.toArray(columnsList);
    // row 0 holds the per-column minima, row 1 the per-column maxima
    List<Row> minMax = minMaxColumns(dataFrame, columnNames);
    for (int i = 0; i < columnNames.length; i++) {
        String columnName = columnNames[i];
        double dMin = ((Number) minMax.get(0).get(i)).doubleValue();
        double dMax = ((Number) minMax.get(1).get(i)).doubleValue();
        double maxSubMin = (dMax - dMin);
        if (maxSubMin == 0)
            maxSubMin = 1;

        Column newCol = dataFrame.col(columnName).minus(dMin).divide(maxSubMin).multiply(max - min).plus(min);
        dataFrame = dataFrame.withColumn(columnName, newCol);
    }


    return dataFrame;
}
 
Example 9
@Override
public Pair<Dataset<Row>, OperationContext> nativeTransformation(Dataset<Row> input, OperationContext context) {
    ProjectRestrictOperation op = (ProjectRestrictOperation) operationContext.getOperation();
    Dataset<Row> df = null;
    // TODO: Enable the commented try-catch block after regression testing.
    //       This would be a safeguard against unanticipated exceptions:
    //           org.apache.spark.sql.catalyst.parser.ParseException
    //           org.apache.spark.sql.AnalysisException
    //       ... which may occur if the Splice parser fails to detect a
    //       SQL expression which SparkSQL does not support.
    if (op.hasExpressions()) {
//      try {
        df = input.selectExpr(op.getExpressions());
        return Pair.newPair(df, context);
//      }
//      catch (Exception e) {
//      }
    }
    int[] mapping = op.projectMapping;
    Column[] columns = new Column[mapping.length];
    for (int i = 0; i < mapping.length; ++i) {
        columns[i] = input.col("c" + (mapping[i] - 1));
    }
    df = input.select(columns);
    return Pair.newPair(df, context);
}
 
Example 10
@Override
public Column getColumnExpression(Dataset<Row> leftDF,
                                  Dataset<Row> rightDF,
                                  Function<String, DataType> convertStringToDataTypeFunction) throws UnsupportedOperationException {
    Column leftExpr  = getLeftChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);
    Column rightExpr = getRightChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);

    if (relOpKind == EQUALS_RELOP)
        return leftExpr.equalTo(rightExpr);
    else if (relOpKind == NOT_EQUALS_RELOP)
        return leftExpr.notEqual(rightExpr);
    else if (relOpKind == GREATER_THAN_RELOP)
        return leftExpr.gt(rightExpr);
    else if (relOpKind == GREATER_EQUALS_RELOP)
        return leftExpr.geq(rightExpr);
    else if (relOpKind == LESS_THAN_RELOP)
        return leftExpr.lt(rightExpr);
    else if (relOpKind == LESS_EQUALS_RELOP)
        return leftExpr.leq(rightExpr);
    else if (relOpKind == IS_NULL_RELOP)
        return leftExpr.isNull();
    else if (relOpKind == IS_NOT_NULL_RELOP)
        return leftExpr.isNotNull();
    else
        throw new UnsupportedOperationException();
}
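Each of the relational methods above (equalTo, notEqual, gt, geq, lt, leq, isNull, isNotNull) returns a new boolean Column rather than evaluating anything; a filter or join later evaluates it per row. A small usage sketch:

import static org.apache.spark.sql.functions.col;

import org.apache.spark.sql.*;

public class RelationalOpsSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();

        Dataset<Row> df = spark.range(10).toDF("id");

        // Each call builds a boolean expression; filter() keeps matching rows.
        df.filter(col("id").geq(7)).show();       // 7, 8, 9
        df.filter(col("id").notEqual(0)).show();  // 1 through 9
        df.filter(col("id").isNotNull()).show();  // all rows

        spark.stop();
    }
}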
 
Example 11
Source Project: kylin-on-parquet-v2   Source File: NSparkCubingUtil.java    License: Apache License 2.0
public static Column[] getColumns(Set<Integer> indices) {
    Column[] ret = new Column[indices.size()];
    int index = 0;
    for (Integer i : indices) {
        ret[index] = new Column(String.valueOf(i));
        index++;
    }
    return ret;
}
 
Example 12
Source Project: kylin-on-parquet-v2   Source File: NSparkCubingUtil.java    License: Apache License 2.0
public static Column[] getColumns(List<Integer> indices) {
    Column[] ret = new Column[indices.size()];
    int index = 0;
    for (Integer i : indices) {
        ret[index] = new Column(String.valueOf(i));
        index++;
    }
    return ret;
}
 
Example 13
Source Project: kylin-on-parquet-v2   Source File: CubeMergeJob.java    License: Apache License 2.0
private void mergeSegments(String cubeId, String segmentId) throws IOException {
    CubeManager mgr = CubeManager.getInstance(config);
    CubeInstance cube = mgr.getCubeByUuid(cubeId);
    CubeSegment mergedSeg = cube.getSegmentById(segmentId);
    SegmentInfo mergedSegInfo = ManagerHub.getSegmentInfo(config, getParam(MetadataConstants.P_CUBE_ID), mergedSeg.getUuid());

    Map<Long, DFLayoutMergeAssist> mergeCuboidsAssist = generateMergeAssist(mergingSegInfos, ss);
    for (DFLayoutMergeAssist assist : mergeCuboidsAssist.values()) {
        SpanningTree spanningTree = new ForestSpanningTree(JavaConversions.asJavaCollection(mergedSegInfo.toBuildLayouts()));
        Dataset<Row> afterMerge = assist.merge(config, cube.getName());
        LayoutEntity layout = assist.getLayout();

        Dataset<Row> afterSort;
        if (layout.isTableIndex()) {
            afterSort = afterMerge.sortWithinPartitions(NSparkCubingUtil.getColumns(layout.getOrderedDimensions().keySet()));
        } else {
            Column[] dimsCols = NSparkCubingUtil.getColumns(layout.getOrderedDimensions().keySet());
            Dataset<Row> afterAgg = CuboidAggregator.agg(ss, afterMerge, layout.getOrderedDimensions().keySet(),
                    layout.getOrderedMeasures(), spanningTree, false);
            afterSort = afterAgg.sortWithinPartitions(dimsCols);
        }
        buildLayoutWithUpdate.submit(new BuildLayoutWithUpdate.JobEntity() {
            @Override
            public String getName() {
                return "merge-layout-" + layout.getId();
            }

            @Override
            public LayoutEntity build() throws IOException {
                return saveAndUpdateCuboid(afterSort, mergedSegInfo, layout, assist);
            }
        }, config);

        buildLayoutWithUpdate.updateLayout(mergedSegInfo, config);
    }
}
 
Example 14
Source Project: kylin-on-parquet-v2   Source File: CsvSourceTest.java    License: Apache License 2.0
@Test
public void testGetFlatTable() throws IOException {
    System.out.println(getTestConfig().getMetadataUrl());
    CubeManager cubeMgr = CubeManager.getInstance(getTestConfig());
    CubeInstance cube = cubeMgr.getCube(CUBE_NAME);
    cleanupSegments(CUBE_NAME);
    DataModelDesc model = cube.getModel();
    CubeSegment segment = cubeMgr.appendSegment(cube, new SegmentRange.TSRange(dateToLong("2010-01-01"), dateToLong("2013-01-01")));
    Dataset<Row> ds = initFlatTable(segment);
    ds.show(10);
    StructType schema = ds.schema();

    SegmentInfo segmentInfo = MetadataConverter.getSegmentInfo(segment.getCubeInstance(), segment.getUuid(),
            segment.getName(), segment.getStorageLocationIdentifier());
    scala.collection.immutable.Map<String, String> map = BuildUtils.getColumnIndexMap(segmentInfo);
    for (StructField field : schema.fields()) {
        Assert.assertNotNull(model.findColumn(map.apply(field.name())));
    }

    for (LayoutEntity layoutEntity : MetadataConverter.extractEntityList2JavaList(cube)) {
        Set<Integer> dims = layoutEntity.getOrderedDimensions().keySet();
        Column[] modelCols = new Column[dims.size()];
        int index = 0;
        for (int id : dims) {
            modelCols[index] = new Column(String.valueOf(id));
            index++;
        }
        ds.select(modelCols).show(10);
    }
}
 
Example 15
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<Row> agg(final Column expr, final Column... exprs) {
  final boolean userTriggered = initializeFunction(expr, exprs);
  final Dataset<Row> result = from(super.agg(expr, exprs));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 16
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<Row> agg(final Column expr, final scala.collection.Seq<Column> exprs) {
  final boolean userTriggered = initializeFunction(expr, exprs);
  final Dataset<Row> result = from(super.agg(expr, exprs));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 17
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<Row> drop(final Column col) {
  final boolean userTriggered = initializeFunction(col);
  final Dataset<Row> result = from(super.drop(col));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 18
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<T> filter(final Column condition) {
  final boolean userTriggered = initializeFunction(condition);
  final Dataset<T> result = from(super.filter(condition));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 19
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right, final Column joinExprs) {
  final boolean userTriggered = initializeFunction(right, joinExprs);
  final Dataset<Row> result = from(super.join(right, joinExprs));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 20
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right, final Column joinExprs, final String joinType) {
  final boolean userTriggered = initializeFunction(right, joinExprs, joinType);
  final Dataset<Row> result = from(super.join(right, joinExprs, joinType));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 21
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<T> orderBy(final Column... sortExprs) {
  final boolean userTriggered = initializeFunction(sortExprs);
  final Dataset<T> result = from(super.orderBy(sortExprs));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 22
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<T> orderBy(final scala.collection.Seq<Column> sortExprs) {
  final boolean userTriggered = initializeFunction(sortExprs);
  final Dataset<T> result = from(super.orderBy(sortExprs));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 23
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<T> repartition(final Column... partitionExprs) {
  final boolean userTriggered = initializeFunction(partitionExprs);
  final Dataset<T> result = from(super.repartition(partitionExprs));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 24
@Override
public Column getColumnExpression(Dataset<Row> leftDF,
                                  Dataset<Row> rightDF,
                                  Function<String, DataType> convertStringToDataTypeFunction) throws UnsupportedOperationException {
    Column leftExpr  = getLeftChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);
    Column rightExpr = getRightChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);

    if (logicalOpKind == AND)
        return leftExpr.and(rightExpr);
    else if (logicalOpKind == OR)
        return leftExpr.or(rightExpr);
    else
        throw new UnsupportedOperationException();
}
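Column.and and Column.or compose the boolean Columns from the previous examples into larger predicates, again without evaluating anything until an action runs. A short sketch:

import static org.apache.spark.sql.functions.col;

import org.apache.spark.sql.*;

public class LogicalOpsSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();

        Dataset<Row> df = spark.range(10).toDF("id");

        // (2 < id AND id < 6) OR id == 9
        df.filter(col("id").gt(2).and(col("id").lt(6))
                .or(col("id").equalTo(9))).show(); // 3, 4, 5, 9

        spark.stop();
    }
}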
 
Example 25
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<T> repartition(final int numPartitions, final scala.collection.Seq<Column> partitionExprs) {
  final boolean userTriggered = initializeFunction(numPartitions, partitionExprs);
  final Dataset<T> result = from(super.repartition(numPartitions, partitionExprs));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 26
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<T> repartition(final scala.collection.Seq<Column> partitionExprs) {
  final boolean userTriggered = initializeFunction(partitionExprs);
  final Dataset<T> result = from(super.repartition(partitionExprs));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 27
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<Row> select(final Column... cols) {
  final boolean userTriggered = initializeFunction(cols);
  final Dataset<Row> result = from(super.select(cols));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 28
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<Row> select(final scala.collection.Seq<Column> cols) {
  final boolean userTriggered = initializeFunction(cols);
  final Dataset<Row> result = from(super.select(cols));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 29
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<T> sort(final Column... sortExprs) {
  final boolean userTriggered = initializeFunction(sortExprs);
  final Dataset<T> result = from(super.sort(sortExprs));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 30
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<T> sort(final scala.collection.Seq<Column> sortExprs) {
  final boolean userTriggered = initializeFunction(sortExprs);
  final Dataset<T> result = from(super.sort(sortExprs));
  this.setIsUserTriggered(userTriggered);
  return result;
}