Java Code Examples for org.apache.spark.sql.Dataset#repartition()

The following examples show how to use org.apache.spark.sql.Dataset#repartition(). Each example is taken from an open-source project; the source file, project name, and license are noted above the code.
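Dataset#repartition() is overloaded: it can take a target partition count, one or more partitioning columns, or both, and it always returns a new, fully shuffled Dataset rather than modifying the one it is called on. The short sketch below (not taken from any of the projects above; the local-mode session, schema, and column names are illustrative assumptions) shows the three overloads side by side.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import static org.apache.spark.sql.functions.col;

public class RepartitionSketch {
  public static void main(String[] args) {
    // Local-mode session for illustration only
    SparkSession spark = SparkSession.builder()
      .appName("RepartitionSketch")
      .master("local[*]")
      .getOrCreate();

    // Hypothetical two-column schema and a handful of rows
    StructType schema = new StructType(new StructField[]{
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("category", DataTypes.StringType, false, Metadata.empty())
    });
    List<Row> data = Arrays.asList(
      RowFactory.create(0, "a"),
      RowFactory.create(1, "b"),
      RowFactory.create(2, "a"));
    Dataset<Row> df = spark.createDataFrame(data, schema);

    // repartition(int): full shuffle into exactly the requested number of partitions
    Dataset<Row> byCount = df.repartition(4);

    // repartition(Column...): hash-partition by the given expressions;
    // the partition count defaults to spark.sql.shuffle.partitions
    Dataset<Row> byColumn = df.repartition(col("category"));

    // repartition(int, Column...): both a target count and partitioning expressions
    Dataset<Row> byBoth = df.repartition(2, col("category"));

    System.out.println(byCount.rdd().getNumPartitions());   // 4
    System.out.println(byColumn.rdd().getNumPartitions());
    System.out.println(byBoth.rdd().getNumPartitions());    // 2

    spark.stop();
  }
}

When the goal is only to reduce the number of partitions, coalesce() (as used in Example 3) avoids a full shuffle and is usually cheaper.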
Example 1
Source File: StreamingStep.java    From envelope with Apache License 2.0
@SuppressWarnings({"unchecked","rawtypes"})
public Dataset<Row> translate(JavaRDD raw) {
  StreamInput streamInput = (StreamInput)getInput(true);
  TranslateFunction translateFunction = getTranslateFunction(config, true);

  // Encode the raw messages as rows (i.e. the raw value plus associated metadata fields)
  JavaRDD<Row> encoded = raw.map(streamInput.getMessageEncoderFunction());

  // Translate raw message rows to structured rows
  TranslationResults translationResults = new TranslationResults(
      encoded.flatMap(translateFunction),
      translateFunction.getProvidingSchema(),
      streamInput.getProvidingSchema());

  BatchStep errored = createErrorStep(getName() + DEFAULT_ERROR_DATAFRAME_SUFFIX,
      translationResults.getErrored());
  addNewBatchStep(errored);

  // Provide translated rows and errors
  Dataset<Row> translated = translationResults.getTranslated();
  if (doesRepartition()) {
    translated = translated.repartition(config.getInt(REPARTITION_NUM_PARTITIONS_PROPERTY));
  }
  return translated;
}
 
Example 2
Source File: JavaQuantileDiscretizerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaQuantileDiscretizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, 18.0),
    RowFactory.create(1, 19.0),
    RowFactory.create(2, 8.0),
    RowFactory.create(3, 5.0),
    RowFactory.create(4, 2.2)
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("hour", DataTypes.DoubleType, false, Metadata.empty())
  });

  Dataset<Row> df = spark.createDataFrame(data, schema);
  // $example off$
  // Output of QuantileDiscretizer for such small datasets can depend on the number of
  // partitions. Here we force a single partition to ensure consistent results.
  // Note this is not necessary for normal use cases
  df = df.repartition(1);
  // $example on$
  QuantileDiscretizer discretizer = new QuantileDiscretizer()
    .setInputCol("hour")
    .setOutputCol("result")
    .setNumBuckets(3);

  Dataset<Row> result = discretizer.fit(df).transform(df);
  result.show();
  // $example off$
  spark.stop();
}
 
Example 3
Source File: BatchStep.java    From envelope with Apache License 2.0
private Dataset<Row> repartition(Dataset<Row> data) {
  int numPartitions = 0;
  List<String> colPartitions = null;

  // Read the optional repartition count and columns from the step configuration
  if (config.hasPath(REPARTITION_NUM_PARTITIONS_PROPERTY)) {
    numPartitions = config.getInt(REPARTITION_NUM_PARTITIONS_PROPERTY);
  }

  if (config.hasPath(REPARTITION_COLUMNS_PROPERTY)) {
    colPartitions = config.getStringList(REPARTITION_COLUMNS_PROPERTY);
  }

  // Repartition by count and columns, by count only, or by columns only
  if (numPartitions > 0 && null != colPartitions) {
    data = data.repartition(numPartitions, RowUtils.toColumnArray(colPartitions));
  }
  else if (numPartitions > 0) {
    data = data.repartition(numPartitions);
  }
  else if (null != colPartitions) {
    data = data.repartition(RowUtils.toColumnArray(colPartitions));
  }

  // Optionally coalesce afterwards to reduce the partition count without a full shuffle
  if (config.hasPath(COALESCE_NUM_PARTITIONS_PROPERTY)) {
    numPartitions = config.getInt(COALESCE_NUM_PARTITIONS_PROPERTY);
    data = data.coalesce(numPartitions);
  }

  return data;
}
 
Example 4
Source File: DataFrameMatrixConversionTest.java    From systemds with Apache License 2.0
private void testDataFrameConversion(boolean vector, int cols, boolean dense, boolean unknownDims) {
	boolean oldConfig = DMLScript.USE_LOCAL_SPARK_CONFIG; 
	ExecMode oldPlatform = DMLScript.getGlobalExecMode();

	try
	{
		DMLScript.USE_LOCAL_SPARK_CONFIG = true;
		DMLScript.setGlobalExecMode(ExecMode.HYBRID);
		
		//generate input data and setup metadata
		int rows = (cols == cols3) ? rows3 : rows1;
		double sparsity = dense ? sparsity1 : sparsity2; 
		double[][] A = getRandomMatrix(rows, cols, -10, 10, sparsity, 2373); 
		MatrixBlock mbA = DataConverter.convertToMatrixBlock(A); 
		int blksz = ConfigurationManager.getBlocksize();
		MatrixCharacteristics mc1 = new MatrixCharacteristics(rows, cols, blksz, mbA.getNonZeros());
		MatrixCharacteristics mc2 = unknownDims ? new MatrixCharacteristics() : new MatrixCharacteristics(mc1);

		//get binary block input rdd
		JavaPairRDD<MatrixIndexes,MatrixBlock> in = SparkExecutionContext.toMatrixJavaPairRDD(sc, mbA, blksz);
		
		//matrix - dataframe - matrix conversion
		Dataset<Row> df = RDDConverterUtils.binaryBlockToDataFrame(spark, in, mc1, vector);
		df = ( rows==rows3 ) ? df.repartition(rows) : df;
		JavaPairRDD<MatrixIndexes,MatrixBlock> out = RDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc2, true, vector);
		
		//get output matrix block
		MatrixBlock mbB = SparkExecutionContext.toMatrixBlock(out, rows, cols, blksz, -1);
		
		//compare matrix blocks
		double[][] B = DataConverter.convertToDoubleMatrix(mbB);
		TestUtils.compareMatrices(A, B, rows, cols, eps);
	}
	catch( Exception ex ) {
		throw new RuntimeException(ex);
	}
	finally {
		DMLScript.USE_LOCAL_SPARK_CONFIG = oldConfig;
		DMLScript.setGlobalExecMode(oldPlatform);
	}
}
 
Example 5
Source File: Runner.java    From stocator with Apache License 2.0
private static void executeTestSuite(NameGenerator nameGenerator,
    SparkSession spark) throws Exception {
  TestSuite testSuite = new TestSuite(dataCreate, flatListing);

  System.out.println("*********************************");
  System.out.println("*** Create dataframe from the local CSV file ***");
  Dataset<Row> schemaFlights = testSuite.getFlights(spark, csvLocalPath);
  
  nameGenerator.generateObjectNames();
  if (dataCreate) {
      System.out.println("Data cleanup (start) for " + nameGenerator.getContainerPath() + "*");
      System.out.println("*********************************");
      testSuite.deleteData(nameGenerator.getContainerPath(), spark.sparkContext().hadoopConfiguration(), false);
      System.out.println("*********************************");
  }

  testSuite.test1(spark, schemaFlights, nameGenerator.getCsvPath2());
  testSuite.test2(spark, schemaFlights, nameGenerator.getParquetPath(), Constants.PARQUET_TYPE);
  testSuite.test2(spark, schemaFlights, nameGenerator.getJsonPath(), Constants.JSON_TYPE);
  testSuite.test3(spark, schemaFlights, nameGenerator.getCsvPath1());
  testSuite.test4(spark, nameGenerator.getTxtPath());
  testSuite.test8(spark, nameGenerator.getTxtPath(), isTimeOutTest );

  if (isSwift) {
    nameGenerator.generateNewContainer("list");
    System.out.println("Data cleanup for " + nameGenerator.getContainerPath() + "*");
    System.out.println("*********************************");
    testSuite.deleteData(nameGenerator.getContainerPath(), spark.sparkContext().hadoopConfiguration(), dataCreate);
    System.out.println("*********************************");
  }
  testSuite.test6(spark, schemaFlights, nameGenerator.getContainerPath(), nameGenerator.getCsvName());
  if (isSwift) {
    nameGenerator.generateNewContainer(false);
    System.out.println("Data cleanup for " + nameGenerator.getContainerPath() + "*");
    System.out.println("*********************************");
    testSuite.deleteData(nameGenerator.getContainerPath(), spark.sparkContext().hadoopConfiguration(), dataCreate);
    System.out.println("*********************************");
  }
  
  testSuite.test7(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.TEXT_TYPE);
  testSuite.test7(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.JSON_TYPE);
  testSuite.test7(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test71(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.TEXT_TYPE);
  testSuite.test71(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.JSON_TYPE);
  testSuite.test71(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test10(spark, nameGenerator.getDataResPath() + "/dfp");
  testSuite.test11(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test12(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test9(spark, nameGenerator.getDataResPath());
  testSuite.test13(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.CSV_TYPE);
  testSuite.test14(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.JSON_TYPE);
  testSuite.test14(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test15(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.JSON_TYPE);
  testSuite.test15(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test16(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.JSON_TYPE);
  testSuite.test16(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  
  if (csvLocalLargePath != null) {
    System.out.println("*********************************");
    System.out.println("Large file test!");
    Dataset<Row> largeSchemaFlights = testSuite.getFlights(spark, csvLocalLargePath);
    if (isSwift) {
      nameGenerator.generateNewContainer(true);
    }
    testSuite.test1(spark, largeSchemaFlights, nameGenerator.getCsvPath2());
    testSuite.test2(spark, largeSchemaFlights, nameGenerator.getParquetPath(), Constants.PARQUET_TYPE);
    testSuite.test2(spark, largeSchemaFlights, nameGenerator.getJsonPath(), Constants.JSON_TYPE);
    System.out.println("***** Repartition to 1");
    largeSchemaFlights.repartition(1);
    if (isSwift) {
      nameGenerator.generateNewContainer(true);
    }
    testSuite.test2(spark, largeSchemaFlights, nameGenerator.getParquetPath(), Constants.PARQUET_TYPE);
    testSuite.test2(spark, largeSchemaFlights, nameGenerator.getJsonPath(), Constants.JSON_TYPE);
  } else {
    System.out.println("*********************************");
    System.out.println("No large file test to be executed");
  }

}
 
Example 6
Source File: SparkDataSet.java    From spliceengine with GNU Affero General Public License v3.0
@SuppressWarnings({ "unchecked", "rawtypes" })
public DataSet<ExecRow> writeAvroFile(DataSetProcessor dsp,
                                      int[] partitionBy,
                                      String location,
                                      String compression,
                                      OperationContext context) throws StandardException
{
    compression = SparkDataSet.getAvroCompression(compression);

    StructType dataSchema = null;
    StructType tableSchema = generateTableSchema(context);

    // what is this? why is this so different from parquet/orc ?
    // actually very close to NativeSparkDataSet.writeFile
    dataSchema = ExternalTableUtils.getDataSchema(dsp, tableSchema, partitionBy, location, "a");

    if (dataSchema == null)
        dataSchema = tableSchema;

    Dataset<Row> insertDF = SpliceSpark.getSession().createDataFrame(
            rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context))).map(new LocatedRowToRowAvroFunction()),
            dataSchema);


    // We duplicate the code in NativeSparkDataset.writeAvroFile here to avoid calling  ExternalTableUtils.getDataSchema() twice
    List<String> partitionByCols = new ArrayList();
    for (int i = 0; i < partitionBy.length; i++) {
        partitionByCols.add(dataSchema.fields()[partitionBy[i]].name());
    }
    if (partitionBy.length > 0) {
        List<Column> repartitionCols = new ArrayList();
        for (int i = 0; i < partitionBy.length; i++) {
            repartitionCols.add(new Column(dataSchema.fields()[partitionBy[i]].name()));
        }
        insertDF = insertDF.repartition(scala.collection.JavaConversions.asScalaBuffer(repartitionCols).toList());
    }
    if (compression.equals("none")) {
        compression = "uncompressed";
    }
    insertDF.write().option(SPARK_COMPRESSION_OPTION,compression).partitionBy(partitionByCols.toArray(new String[partitionByCols.size()]))
            .mode(SaveMode.Append).format("com.databricks.spark.avro").save(location);
    ValueRow valueRow=new ValueRow(1);
    valueRow.setColumn(1,new SQLLongint(context.getRecordsWritten()));
    return new SparkDataSet<>(SpliceSpark.getContext().parallelize(Collections.singletonList(valueRow), 1));
}