Java Code Examples for org.apache.spark.sql.Dataset#columns()

The following examples show how to use org.apache.spark.sql.Dataset#columns(). Each example comes from an open-source project; the source file, project, and license are listed above each snippet.
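Before the project examples, here is a minimal sketch of the call itself: columns() returns the column names of a Dataset<Row> as a String[] in schema order, which is useful for counting, inspecting, or renaming columns. The class name and input path below are made up for illustration; any local Spark setup with a DataFrame behaves the same way.

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ColumnsExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("Dataset#columns() demo")
        .master("local[*]")
        .getOrCreate();

    // Hypothetical input file; the header row provides the column names.
    Dataset<Row> df = spark.read()
        .option("header", "true")
        .csv("data/example.csv");

    // columns() returns the column names in schema order as a String[].
    String[] columns = df.columns();
    System.out.println("Column count: " + columns.length);
    System.out.println("Columns: " + Arrays.toString(columns));

    // Typical use: iterate over the names, e.g. to normalize them.
    for (String name : columns) {
      df = df.withColumnRenamed(name, name.trim().toLowerCase());
    }
    df.printSchema();

    spark.stop();
  }
}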
Example 1
Source File: SparkCubingJobTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private Integer convertOutSchema(Dataset<Row> layoutDs, String fieldName,
        org.apache.spark.sql.types.DataType dataType) {
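    // Rebuild the layout schema, retyping the field that matches fieldName (case-insensitively) to the given
    // dataType and marking it non-nullable; store the result in OUT_SCHEMA and return the field's index.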
    StructField[] structFieldList = layoutDs.schema().fields();
    String[] columns = layoutDs.columns();

    int index = 0;
    StructField[] outStructFieldList = new StructField[structFieldList.length];
    for (int i = 0; i < structFieldList.length; i++) {
        if (columns[i].equalsIgnoreCase(fieldName)) {
            index = i;
            StructField structField = structFieldList[i];
            outStructFieldList[i] = new StructField(structField.name(), dataType, false, structField.metadata());
        } else {
            outStructFieldList[i] = structFieldList[i];
        }
    }

    OUT_SCHEMA = new StructType(outStructFieldList);

    return index;
}
 
Example 2
Source File: NManualBuildAndQueryCuboidTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private Integer convertOutSchema(Dataset<Row> layoutDs, String fieldName,
                                 org.apache.spark.sql.types.DataType dataType) {
    StructField[] structFieldList = layoutDs.schema().fields();
    String[] columns = layoutDs.columns();

    int index = 0;
    StructField[] outStructFieldList = new StructField[structFieldList.length];
    for (int i = 0; i < structFieldList.length; i++) {
        if (columns[i].equalsIgnoreCase(fieldName)) {
            index = i;
            StructField structField = structFieldList[i];
            outStructFieldList[i] = new StructField(structField.name(), dataType, false, structField.metadata());
        } else {
            outStructFieldList[i] = structFieldList[i];
        }
    }

    OUT_SCHEMA = new StructType(outStructFieldList);

    return index;
}
 
Example 3
Source File: CsvToDatasetCompatibleWithSparkv1x.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("CSV to Dataset")
      .master("local")
      .getOrCreate();

  String filename = "data/tuple-data-file.csv";
  Dataset<Row> df = spark.read().format("csv")
      .option("inferSchema", "true")
      .option("header", "false")
      .load(filename);
  df.show();

  // To ensure compatibility between Spark 2.0.0 and Spark 1.6.x
  int count = df.columns().length;
  for (int i = 0; i < count; i++) {
    String oldColName = "_c" + i;
    String newColName = "C" + i;
    df = df.withColumn(newColName, df.col(oldColName)).drop(oldColName);
  }
  df.show();
}
 
Example 4
Source File: WriteToDiscStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
	
    // remove spaces from column names as parquet does not support them
    for(String columnName : dataset.columns()) {
        if(columnName.contains(" ")) {
            String newColumnName = columnName.replace(' ', '_');
            dataset = dataset.withColumnRenamed(columnName, newColumnName);
        }
    }

    dataset.cache();
    BpmnaiUtils.getInstance().writeDatasetToParquet(dataset, "result", config);

    if(config.isGenerateResultPreview()) {
        dataset.limit(config.getResultPreviewLineCount()).write().mode(SaveMode.Overwrite).saveAsTable(BpmnaiVariables.RESULT_PREVIEW_TEMP_TABLE);
    }

    return dataset;
}
 
Example 5
Source File: RDDConverterUtils.java    From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc,
	Dataset<Row> df, DataCharacteristics mc, boolean containsID, boolean isVector)
{
	//determine unknown dimensions and sparsity if required
	if( !mc.dimsKnown(true) ) {
		LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
		JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, containsID, isVector));
		long rlen = tmp.count();
		long clen = !isVector ? df.columns().length - (containsID?1:0) : 
				((Vector) tmp.first().get(containsID?1:0)).size();
		long nnz = UtilFunctions.toLong(aNnz.value());
		mc.set(rlen, clen, mc.getBlocksize(), nnz);
	}
	
	//ensure valid blocksizes
	if( mc.getBlocksize()<=1 )
		mc.setBlocksize(ConfigurationManager.getBlocksize());
	
	//construct or reuse row ids
	JavaPairRDD<Row, Long> prepinput = containsID ?
			df.javaRDD().mapToPair(new DataFrameExtractIDFunction(
				df.schema().fieldIndex(DF_ID_COLUMN))) :
			df.javaRDD().zipWithIndex(); //zip row index
	
	//convert csv rdd to binary block rdd (w/ partial blocks)
	boolean sparse = requiresSparseAllocation(prepinput, mc);
	JavaPairRDD<MatrixIndexes, MatrixBlock> out = 
			prepinput.mapPartitionsToPair(
				new DataFrameToBinaryBlockFunction(mc, sparse, containsID, isVector));
	
	//aggregate partial matrix blocks (w/ preferred number of output 
	//partitions as the data is likely smaller in binary block format,
	//but also to bound the size of partitions for compressed inputs)
	int parts = SparkUtils.getNumPreferredPartitions(mc, out);
	return RDDAggregateUtils.mergeByKey(out, parts, false); 
}
 
Example 6
Source File: SparkCubingJobTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private Dataset<Row> dsConvertToOriginal(Dataset<Row> layoutDs, LayoutEntity entity) {
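    // For measures whose return type is "bitmap" (precise count-distinct), deserialize the stored bitmap
    // and replace it with its cardinality, re-encoding the rows with the schema built by convertOutSchema().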
    Map<Integer, FunctionDesc> orderedMeasures = entity.getOrderedMeasures();

    for (final Map.Entry<Integer, FunctionDesc> entry : orderedMeasures.entrySet()) {
        FunctionDesc functionDesc = entry.getValue();
        if (functionDesc != null) {
            final String[] columns = layoutDs.columns();
            String functionName = functionDesc.returnType().dataType();

            if ("bitmap".equals(functionName)) {
                final int finalIndex = convertOutSchema(layoutDs, entry.getKey().toString(), DataTypes.LongType);
                PreciseCountDistinct preciseCountDistinct = new PreciseCountDistinct(null);
                layoutDs = layoutDs.map((MapFunction<Row, Row>) value -> {
                    Object[] ret = new Object[value.size()];
                    for (int i = 0; i < columns.length; i++) {
                        if (i == finalIndex) {
                            byte[] bytes = (byte[]) value.get(i);
                            Roaring64NavigableMap bitmapCounter = preciseCountDistinct.deserialize(bytes);
                            ret[i] = bitmapCounter.getLongCardinality();
                        } else {
                            ret[i] = value.get(i);
                        }
                    }
                    return RowFactory.create(ret);
                }, RowEncoder.apply(OUT_SCHEMA));
            }
        }
    }
    return layoutDs;
}
 
Example 7
Source File: DataframeUtils.java    From net.jgp.labs.spark with Apache License 2.0
public static Dataset<Row> addMetadata(Dataset<Row> df, String key,
    String value) {
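  // Applies the same metadata key/value pair to every column by delegating to the per-column overload.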
  for (String colName : df.columns()) {
    df = addMetadata(df, colName, key, value);
  }
  return df;
}
 
Example 8
Source File: DrugBankDataset.java    From mmtf-spark with Apache License 2.0
/**
 * Removes spaces from column names to ensure compatibility with parquet
 * files.
 *
 * @param original
 *            dataset
 * @return dataset with columns renamed
 */
private static Dataset<Row> removeSpacesFromColumnNames(Dataset<Row> original) {

    for (String existingName : original.columns()) {
        String newName = existingName.replaceAll(" ", "");
        original = original.withColumnRenamed(existingName, newName);
    }

    return original;
}
 
Example 9
Source File: G2SDataset.java    From mmtf-spark with Apache License 2.0
/**
 * Downloads PDB residue mappings for a list of genomic variations.
 * @param variationIds genomic variation ids (e.g. chr7:g.140449103A>C)
 * @param structureId specific PDB structure used for mapping
 * @param chainId specific chain used for mapping
 * @return dataset with PDB mapping information
 * @throws IOException
 */
private static Dataset<Row> getDataset(List<String> variationIds, String structureId, String chainId) throws IOException {
    // get a spark context
    SparkSession spark = SparkSession.builder().getOrCreate();    
    @SuppressWarnings("resource") // sc will be closed elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // download data in parallel
    JavaRDD<String> data = sc.parallelize(variationIds).flatMap(m -> getData(m, structureId, chainId));

    // convert from JavaRDD to Dataset
    Dataset<String> jsonData = spark.createDataset(JavaRDD.toRDD(data), Encoders.STRING()); 

    // parse json strings and return as a dataset
    Dataset<Row> dataset = spark.read().json(jsonData); 
    dataset.show();
    
    // return null if dataset is empty
    if (dataset.columns().length == 0) {
        System.out.println("G2SDataset: no matches found");
        return null;
    }   
       
    dataset = standardizeData(dataset);
    
    return flattenDataset(dataset);
}
 
Example 10
Source File: NManualBuildAndQueryCuboidTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private Dataset<Row> dsConvertToOriginal(Dataset<Row> layoutDs, LayoutEntity entity) {
    Map<Integer, FunctionDesc> orderedMeasures = entity.getOrderedMeasures();

    for (final Map.Entry<Integer, FunctionDesc> entry : orderedMeasures.entrySet()) {
        FunctionDesc functionDesc = entry.getValue();
        if (functionDesc != null) {
            final String[] columns = layoutDs.columns();
            String functionName = functionDesc.returnType().dataType();

            if ("bitmap".equals(functionName)) {
                final int finalIndex = convertOutSchema(layoutDs, entry.getKey().toString(), DataTypes.LongType);
                PreciseCountDistinct preciseCountDistinct = new PreciseCountDistinct(null);
                layoutDs = layoutDs.map((MapFunction<Row, Row>) value -> {
                    Object[] ret = new Object[value.size()];
                    for (int i = 0; i < columns.length; i++) {
                        if (i == finalIndex) {
                            byte[] bytes = (byte[]) value.get(i);
                            Roaring64NavigableMap bitmapCounter = preciseCountDistinct.deserialize(bytes);
                            ret[i] = bitmapCounter.getLongCardinality();
                        } else {
                            ret[i] = value.get(i);
                        }
                    }
                    return RowFactory.create(ret);
                }, RowEncoder.apply(OUT_SCHEMA));
            }
        }
    }
    return layoutDs;
}
 
Example 11
Source File: SparkRegressor.java    From mmtf-spark with Apache License 2.0
/**
 * Dataset must at least contain the following two columns:
 * label: the class labels
 * features: feature vector
 * @param data
 * @return map with metrics
 */
public Map<String,String> fit(Dataset<Row> data) {

	// Split the data into training and test sets (30% held out for testing)
	Dataset<Row>[] splits = data.randomSplit(new double[] {1.0-testFraction, testFraction}, seed);
	Dataset<Row> trainingData = splits[0];
	Dataset<Row> testData = splits[1];

	// Configure the predictor's label and feature columns.
	predictor
	  .setLabelCol(label)
	  .setFeaturesCol("features");

	// Wrap the predictor in a single-stage Pipeline
	Pipeline pipeline = new Pipeline()
	  .setStages(new PipelineStage[] {predictor});

	// Train the model.
	PipelineModel model = pipeline.fit(trainingData);

	// Make predictions.
	Dataset<Row> predictions = model.transform(testData);

	// Display some sample predictions
	System.out.println("Sample predictions: " + predictor.getClass().getSimpleName());
	String primaryKey = predictions.columns()[0];
	predictions.select(primaryKey, label, "prediction").sample(false, 0.1, seed).show(50);
	
	Map<String,String> metrics = new LinkedHashMap<>();

	metrics.put("Method", predictor.getClass().getSimpleName());

	// Select (prediction, true label) and compute test error
	RegressionEvaluator evaluator = new RegressionEvaluator()
		.setLabelCol(label)
		.setPredictionCol("prediction")
		.setMetricName("rmse");

	metrics.put("rmse", Double.toString(evaluator.evaluate(predictions)));

	return metrics;
}
 
Example 12
Source File: AggregateActivityInstancesStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {

    //build the aggregation map: max for durations and revision counts, ProcessState for the state column, AllButEmptyString for everything else
    Map<String, String> aggregationMap = new HashMap<>();
    for(String column : dataset.columns()) {
        if(column.equals(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID)) {
            continue;
        } else if(column.equals(BpmnaiVariables.VAR_DURATION) || column.endsWith("_rev")) {
            aggregationMap.put(column, "max");
        } else if(column.equals(BpmnaiVariables.VAR_STATE)) {
            aggregationMap.put(column, "ProcessState");
        } else if(column.equals(BpmnaiVariables.VAR_ACT_INST_ID)) {
            //ignore it, as we aggregate by it
            continue;
        } else {
            aggregationMap.put(column, "AllButEmptyString");
        }
    }

    //first aggregation
    //activity level, take only processInstance and activityInstance rows
    dataset = dataset
            .filter(dataset.col(BpmnaiVariables.VAR_DATA_SOURCE).notEqual(BpmnaiVariables.EVENT_PROCESS_INSTANCE))
            .groupBy(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID, BpmnaiVariables.VAR_ACT_INST_ID)
            .agg(aggregationMap);

    //rename back columns after aggregation
    String pattern = "(max|allbutemptystring|processstate)\\((.+)\\)";
    Pattern r = Pattern.compile(pattern);

    for(String columnName : dataset.columns()) {
        Matcher m = r.matcher(columnName);
        if(m.find()) {
            String newColumnName = m.group(2);
            dataset = dataset.withColumnRenamed(columnName, newColumnName);
        }
    }


    //in case the CSV source was added, the first dataset of the join contains a name column, so drop it again to make sure it is gone
    dataset = dataset.drop(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);
    dataset = dataset.drop(BpmnaiVariables.VAR_DATA_SOURCE);

    dataset = dataset.sort(BpmnaiVariables.VAR_START_TIME);

    dataset.cache();
    BpmnaiLogger.getInstance().writeInfo("Found " + dataset.count() + " activity instances.");

    if(config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "agg_of_activity_instances", config);
    }

    //return preprocessed data
    return dataset;
}
 
Example 13
Source File: InListDeriver.java    From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {

  Dataset<Row> target = getStepDataFrame(dependencies);
  if (target.columns().length < 1) {
    throw new RuntimeException("Targeted step, '" + stepName + ",' has no columns");
  }

  try {
    String targetField = fieldName == null ? target.columns()[0] : fieldName;
    Column targetColumn = target.col(targetField);

    LOGGER.debug("Targeting '{}[{}]'", stepName, targetField);

    // If the IN list is inline, there is no batch
    if (inList != null) {
      LOGGER.debug("IN list is inline");
      return target.filter(targetColumn.isin(inList.toArray()));
    }

    // Otherwise, collect the values from the reference, executed within the batch
    else {
      LOGGER.trace("IN list is a reference");
      Dataset<Row> reference = dependencies.get(refStepName);
      String referenceField = refFieldName == null ? reference.columns()[0] : refFieldName;

      LOGGER.debug("Referencing using {}[{}]", refStepName, referenceField);
      Column referenceColumn = reference.col(referenceField);

      Iterator<Row> referenceIterator = reference.select(referenceColumn).distinct().toLocalIterator();
      this.inList = new ArrayList<>();
      long counter = 0;

      // Set up the batch collector
      JavaRDD<Row> unionRDD = new JavaSparkContext(Contexts.getSparkSession().sparkContext()).emptyRDD();
      Dataset<Row> union = Contexts.getSparkSession().createDataFrame(unionRDD, target.schema());

      while (referenceIterator.hasNext()) {
        // Flush the batch
        if (counter == batchSize) {
          LOGGER.trace("Flushing batch");
          union = union.union(target.filter(targetColumn.isin(inList.toArray())));
          inList.clear();
          counter = 0L;
        }

        // Gather the elements of the IN list from the reference
        inList.add(referenceIterator.next().get(0));
        counter++;
      }

      // If the selection is under the batch threshold
      if (union.rdd().isEmpty()) {
        return target.filter(targetColumn.isin(inList.toArray()));
      }

      // Flush any remaining IN list values
      else {
        return union.union(target.filter(targetColumn.isin(inList.toArray())));
      }
    }
  } catch (Throwable ae) {
    throw new RuntimeException("Error executing IN list filtering", ae);
  }

}
 
Example 14
Source File: TypeCastStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {

    // get variables
    Map<String, String> varMap = (Map<String, String>) SparkBroadcastHelper.getInstance().getBroadcastVariable(SparkBroadcastHelper.BROADCAST_VARIABLE.PROCESS_VARIABLES_ESCALATED);

    List<StructField> datasetFields = Arrays.asList(dataset.schema().fields());

    List<ColumnConfiguration> columnConfigurations = null;
    List<VariableConfiguration> variableConfigurations = null;

    Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
    if(configuration != null) {
        PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration();
        columnConfigurations = preprocessingConfiguration.getColumnConfiguration();
        variableConfigurations = preprocessingConfiguration.getVariableConfiguration();
    }

    Map<String, ColumnConfiguration> columnTypeConfigMap = new HashMap<>();
    Map<String, VariableConfiguration> variableTypeConfigMap = new HashMap<>();

    if(columnConfigurations != null) {
        for(ColumnConfiguration cc : columnConfigurations) {
            columnTypeConfigMap.put(cc.getColumnName(), cc);
        }
    }

    if(variableConfigurations != null) {
        for(VariableConfiguration vc : variableConfigurations) {
            variableTypeConfigMap.put(vc.getVariableName(), vc);
        }
    }

    for(String column : dataset.columns()) {

        // skip revision columns as they are handled for each variable column
        if(column.endsWith("_rev")) {
            continue;
        }

        DataType newDataType = null;
        boolean isVariableColumn  = false;
        String configurationDataType = null;
        String configurationParseFormat = null;

        if(variableTypeConfigMap.keySet().contains(column)) {
            // was initially a variable
            configurationDataType = variableTypeConfigMap.get(column).getVariableType();
            configurationParseFormat = variableTypeConfigMap.get(column).getParseFormat();
            if (config.getPipelineMode().equals(BpmnaiVariables.PIPELINE_MODE_LEARN)) {
                isVariableColumn = varMap.keySet().contains(column);
            } else {
                isVariableColumn = true;
            }
        } else if(columnTypeConfigMap.keySet().contains(column)){
            // was initially a column
            configurationDataType = columnTypeConfigMap.get(column).getColumnType();
            configurationParseFormat = columnTypeConfigMap.get(column).getParseFormat();
        }

        newDataType = mapDataType(datasetFields, column, configurationDataType);

        // only check for cast errors if the dev feature is enabled and the data type has actually been changed
        if(config.isDevTypeCastCheckEnabled() && !newDataType.equals(getCurrentDataType(datasetFields, column))) {
            // add a column with casted value to be able to check the cast results
            dataset = castColumn(dataset, column, column+"_casted", newDataType, configurationParseFormat);

            // add a column for cast results and write CAST_ERROR? in it if there might be a cast error
            dataset = dataset.withColumn(column+"_castresult",
                    when(dataset.col(column).isNotNull().and(dataset.col(column).notEqual(lit(""))),
                            when(dataset.col(column+"_casted").isNull(), lit("CAST_ERROR?"))
                                    .otherwise(lit(""))
                    ).otherwise(lit(""))
            );
            dataset.cache();

            // check for cast errors and write warning to application log
            if(dataset.filter(column+"_castresult == 'CAST_ERROR?'").count() > 0) {
                BpmnaiLogger.getInstance().writeWarn("Column '" + column + "' seems to have cast errors. Please check the data type (is defined as '" + configurationDataType + "')");
            } else {
                // drop the helper columns, as there are no cast errors for this column, and rename the casted column back to the actual column name
                dataset = dataset.drop(column, column+"_castresult").withColumnRenamed(column+"_casted", column);
            }
        } else {
            // cast without checking the cast result; entries are null if Spark can't cast them
            dataset = castColumn(dataset, column, column, newDataType, configurationParseFormat);
        }

        // cast revision columns for former variables; revision columns only exist on process level
        if(config.getDataLevel().equals(BpmnaiVariables.DATA_LEVEL_PROCESS) && config.isRevCountEnabled() && isVariableColumn) {
            dataset = dataset.withColumn(column+"_rev", dataset.col(column+"_rev").cast("integer"));
        }
    }

    if(config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "type_cast_columns", config);
    }

    //return preprocessed data
    return dataset;
}
 
Example 15
Source File: Spark3Shims.java    From zeppelin with Apache License 2.0
@Override
public String showDataFrame(Object obj, int maxResult, InterpreterContext context) {
  if (obj instanceof Dataset) {
    Dataset<Row> df = ((Dataset) obj).toDF();
    String[] columns = df.columns();
    // DDL statements produce an empty DataFrame (no columns)
    if (columns.length == 0) {
      return "";
    }
    // fetch maxResult+1 rows so that we can check whether it is larger than zeppelin.spark.maxResult
    List<Row> rows = df.takeAsList(maxResult + 1);
    String template = context.getLocalProperties().get("template");
    if (!StringUtils.isBlank(template)) {
      if (rows.size() >= 1) {
        return new SingleRowInterpreterResult(sparkRowToList(rows.get(0)), template, context).toHtml();
      } else {
        return "";
      }
    }

    StringBuilder msg = new StringBuilder();
    msg.append("%table ");
    msg.append(StringUtils.join(TableDataUtils.normalizeColumns(columns), "\t"));
    msg.append("\n");
    boolean isLargerThanMaxResult = rows.size() > maxResult;
    if (isLargerThanMaxResult) {
      rows = rows.subList(0, maxResult);
    }
    for (Row row : rows) {
      for (int i = 0; i < row.size(); ++i) {
        msg.append(TableDataUtils.normalizeColumn(row.get(i)));
        if (i != row.size() -1) {
          msg.append("\t");
        }
      }
      msg.append("\n");
    }

    if (isLargerThanMaxResult) {
      msg.append("\n");
      msg.append(ResultMessages.getExceedsLimitRowsMessage(maxResult, "zeppelin.spark.maxResult"));
    }
    // append %text at the end, otherwise the following output will be put in table as well.
    msg.append("\n%text ");
    return msg.toString();
  } else {
    return obj.toString();
  }
}
 
Example 16
Source File: Spark2Shims.java    From zeppelin with Apache License 2.0
@Override
public String showDataFrame(Object obj, int maxResult, InterpreterContext context) {
  if (obj instanceof Dataset) {
    Dataset<Row> df = ((Dataset) obj).toDF();
    String[] columns = df.columns();
    // DDL statements produce an empty DataFrame (no columns)
    if (columns.length == 0) {
      return "";
    }
    // fetch maxResult+1 rows so that we can check whether it is larger than zeppelin.spark.maxResult
    List<Row> rows = df.takeAsList(maxResult + 1);
    String template = context.getLocalProperties().get("template");
    if (!StringUtils.isBlank(template)) {
      if (rows.size() >= 1) {
        return new SingleRowInterpreterResult(sparkRowToList(rows.get(0)), template, context).toHtml();
      } else {
        return "";
      }
    }

    StringBuilder msg = new StringBuilder();
    msg.append("\n%table ");
    msg.append(StringUtils.join(TableDataUtils.normalizeColumns(columns), "\t"));
    msg.append("\n");
    boolean isLargerThanMaxResult = rows.size() > maxResult;
    if (isLargerThanMaxResult) {
      rows = rows.subList(0, maxResult);
    }
    for (Row row : rows) {
      for (int i = 0; i < row.size(); ++i) {
        msg.append(TableDataUtils.normalizeColumn(row.get(i)));
        if (i != row.size() -1) {
          msg.append("\t");
        }
      }
      msg.append("\n");
    }

    if (isLargerThanMaxResult) {
      msg.append("\n");
      msg.append(ResultMessages.getExceedsLimitRowsMessage(maxResult, "zeppelin.spark.maxResult"));
    }
    // append %text at the end, otherwise the following output will be put in table as well.
    msg.append("\n%text ");
    return msg.toString();
  } else {
    return obj.toString();
  }
}