Java Code Examples for org.apache.spark.sql.Row

The following examples show how to use org.apache.spark.sql.Row. They are extracted from open source projects; the source project, source file, and license are noted above each example.
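Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the class and column names are illustrative) showing the two operations most of the examples build on: creating rows with RowFactory against an explicit schema and reading typed fields back out of a Dataset<Row>.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class RowBasicsExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("RowBasicsExample")
        .master("local[*]") // local master only for this sketch
        .getOrCreate();

    // Schema describing the columns each Row carries
    StructType schema = new StructType(new StructField[] {
        new StructField("name", DataTypes.StringType, false, Metadata.empty()),
        new StructField("age", DataTypes.IntegerType, true, Metadata.empty())
    });

    // Rows are positional; values must line up with the schema
    List<Row> rows = Arrays.asList(
        RowFactory.create("Alice", 34),
        RowFactory.create("Bob", null));

    Dataset<Row> df = spark.createDataFrame(rows, schema);

    for (Row row : df.collectAsList()) {
      String name = row.getString(0);                        // access by position
      Integer age = row.isNullAt(1) ? null : row.getInt(1);  // guard primitive getters against null
      Object ageByName = row.getAs("age");                   // or access by field name
      System.out.println(name + " / " + age + " / " + ageByName);
    }

    spark.stop();
  }
}

Rows are untyped at compile time, so the getString/getInt/getAs accessors must agree with the declared schema; the examples below apply the same pattern to real data sources.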
Example 1
Source Project: DDF   Source File: SparkDataSourceManager.java    License: Apache License 2.0
@Override
public DDF loadFromJDBC(JDBCDataSourceDescriptor dataSource) throws DDFException {
    SparkDDFManager sparkDDFManager = (SparkDDFManager)mDDFManager;
    HiveContext sqlContext = sparkDDFManager.getHiveContext();

    JDBCDataSourceCredentials cred = (JDBCDataSourceCredentials)dataSource.getDataSourceCredentials();
    String fullURL = dataSource.getDataSourceUri().getUri().toString();
    if (cred.getUsername() != null &&  !cred.getUsername().equals("")) {
        fullURL += String.format("?user=%s&password=%s", cred.getUsername(), cred.getPassword());
    }

    Map<String, String> options = new HashMap<String, String>();
    options.put("url", fullURL);
    options.put("dbtable", dataSource.getDbTable());
    DataFrame df = sqlContext.load("jdbc", options);

    DDF ddf = sparkDDFManager.newDDF(sparkDDFManager, df, new Class<?>[]{DataFrame.class},
        null, SparkUtils.schemaFromDataFrame(df));
    // TODO?
    ddf.getRepresentationHandler().get(RDD.class, Row.class);
    ddf.getMetaDataHandler().setDataSourceDescriptor(dataSource);
    return ddf;
}
 
Example 2
/**
 * Update: called once per value within a group, passing the group's field
 * values in one at a time and implementing the concatenation logic.
 */
@Override
public void update(MutableAggregationBuffer buffer, Row input) {
	// city-info string already concatenated in the buffer
	String bufferCityInfo = buffer.getString(0);
	// city info just passed in for the current row
	String cityInfo = input.getString(0);

	// de-duplication: only if this city info has not been concatenated
	// before do we go on and append the new city info
	if(!bufferCityInfo.contains(cityInfo)) {
		if("".equals(bufferCityInfo)) {
			bufferCityInfo += cityInfo;
		} else {
			// e.g. "1:Beijing" becomes
			// "1:Beijing,2:Shanghai"
			bufferCityInfo += "," + cityInfo;
		}

		buffer.update(0, bufferCityInfo);
	}
}
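Example 2 shows only the update callback of a Spark SQL UserDefinedAggregateFunction; the enclosing class is not part of the snippet. The skeleton below is a hedged sketch of how such a concatenate-distinct aggregate could be declared (the class name and schemas are assumptions for illustration, not taken from the original project):

import org.apache.spark.sql.Row;
import org.apache.spark.sql.expressions.MutableAggregationBuffer;
import org.apache.spark.sql.expressions.UserDefinedAggregateFunction;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Illustrative skeleton: aggregates city-info strings into one
// comma-separated, de-duplicated string per group.
public class GroupConcatDistinctUDAF extends UserDefinedAggregateFunction {

  @Override
  public StructType inputSchema() {
    return new StructType().add("cityInfo", DataTypes.StringType);
  }

  @Override
  public StructType bufferSchema() {
    return new StructType().add("bufferCityInfo", DataTypes.StringType);
  }

  @Override
  public DataType dataType() {
    return DataTypes.StringType;
  }

  @Override
  public boolean deterministic() {
    return true;
  }

  @Override
  public void initialize(MutableAggregationBuffer buffer) {
    buffer.update(0, "");
  }

  @Override
  public void update(MutableAggregationBuffer buffer, Row input) {
    // see Example 2 above for the per-row concatenation logic
  }

  @Override
  public void merge(MutableAggregationBuffer buffer1, Row buffer2) {
    // combine two partial buffers, keeping each city only once
    String merged = buffer1.getString(0);
    for (String cityInfo : buffer2.getString(0).split(",")) {
      if (!cityInfo.isEmpty() && !merged.contains(cityInfo)) {
        merged = merged.isEmpty() ? cityInfo : merged + "," + cityInfo;
      }
    }
    buffer1.update(0, merged);
  }

  @Override
  public Object evaluate(Row buffer) {
    return buffer.getString(0);
  }
}

In Spark 2.x such a class could then be registered with spark.udf().register("group_concat_distinct", new GroupConcatDistinctUDAF()) and used from SQL, e.g. SELECT area, group_concat_distinct(city_info) FROM ... GROUP BY area.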
 
Example 3
Source Project: kylin   Source File: SparkUtil.java    License: Apache License 2.0
private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
    SparkSession sparkSession = SparkSession.builder().sparkContext(HiveUtils.withHiveExternalCatalog(sc.sc()))
            .config(sc.getConf()).enableHiveSupport().getOrCreate();
    final Dataset<Row> intermediateTable = sparkSession.table(hiveTable);
    return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
        @Override
        public String[] call(Row row) throws Exception {
            String[] result = new String[row.size()];
            for (int i = 0; i < row.size(); i++) {
                final Object o = row.get(i);
                if (o != null) {
                    result[i] = o.toString();
                } else {
                    result[i] = null;
                }
            }
            return result;
        }
    });
}
 
Example 4
Source Project: envelope   Source File: AvroTranslator.java    License: Apache License 2.0
private Row rowForRecord(GenericRecord record) {
  List<Object> values = Lists.newArrayList();

  for (Field field : record.getSchema().getFields()) {
    Object value = record.get(field.name());

    Type fieldType = field.schema().getType();
    if (fieldType.equals(Type.UNION)) {
      fieldType = field.schema().getTypes().get(1).getType();
    }
    // Avro returns Utf8s for strings, which Spark SQL doesn't know how to use
    if (fieldType.equals(Type.STRING) && value != null) {
      value = value.toString();
    }
    // Avro returns binary as a ByteBuffer, but Spark SQL wants a byte[]
    if (fieldType.equals(Type.BYTES) && value != null) {
      value = ((ByteBuffer)value).array();
    }

    values.add(value);
  }

  return new RowWithSchema(schema, values.toArray());
}
 
Example 5
Source Project: envelope   Source File: TestRangeRowRule.java    License: Apache License 2.0
public void testDontIgnoreNulls() {
  StructType schema = new StructType(new StructField[] {
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
      new StructField("age", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
  });

  Map<String, Object> configMap = new HashMap<>();
  configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("age"));
  configMap.put(RangeRowRule.FIELD_TYPE_CONFIG, "int");
  configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList(0,105));
  Config config = ConfigFactory.parseMap(configMap);

  RangeRowRule rule = new RangeRowRule();
  assertNoValidationFailures(rule, config);
  rule.configure(config);
  rule.configureName("agerange");

  Row row1 = new RowWithSchema(schema, "Ian", "Ian", null, new BigDecimal("0.00"));
  assertFalse("Row should not pass rule", rule.check(row1));
}
 
Example 6
Source Project: deep-spark   Source File: DeepSparkContextTest.java    License: Apache License 2.0
@Test
public void createJavaSchemaRDDTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = PowerMockito.mock(SQLContext.class);
    ExtractorConfig config = createDeepJobConfig();
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    PowerMockito.doReturn(singleRdd).when(deepSparkContextSpy).createJavaRDD(config);
    JavaRDD<Row> rowRDD = mock(JavaRDD.class);
    mockStatic(DeepSparkContext.class);
    when(DeepSparkContext.createJavaRowRDD(singleRdd)).thenReturn(rowRDD);
    Cells cells = mock(Cells.class);
    when(singleRdd.first()).thenReturn(cells);
    StructType schema = mock(StructType.class);
    mockStatic(CellsUtils.class);
    when(CellsUtils.getStructTypeFromCells(cells)).thenReturn(schema);

    deepSparkContextSpy.createJavaSchemaRDD(config);

    verify(sqlContext).applySchema(rowRDD, schema);
}
 
Example 7
@Override
public RDD<Row> buildScan() {
  log.debug("-> buildScan()");

  // I have isolated the work to a method to keep the plumbing code as
  // simple as possible.
  List<List<Integer>> table = collectData();

  @SuppressWarnings("resource") // cannot be closed here, done elsewhere
  JavaSparkContext sparkContext = new JavaSparkContext(sqlContext
      .sparkContext());
  JavaRDD<Row> rowRDD = sparkContext.parallelize(table)
      .map(row -> RowFactory.create(row.toArray()));

  return rowRDD.rdd();
}
 
Example 8
Source Project: envelope   Source File: TestPassthroughDeriver.java    License: Apache License 2.0
@Test
public void testPassthrough() throws Exception {
  StructType schema = DataTypes.createStructType(Lists.<StructField>newArrayList(
      DataTypes.createStructField("col1", DataTypes.StringType, false)));
  Dataset<Row> dep1 = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create("a")), schema);
  Dataset<Row> dep2 = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create("b")), schema);
  Map<String, Dataset<Row>> dependencies = Maps.newHashMap();
  dependencies.put("dep1", dep1);
  dependencies.put("dep2", dep2);

  Deriver deriver = new PassthroughDeriver();

  List<Row> result = deriver.derive(dependencies).collectAsList();

  assertTrue(result.contains(RowFactory.create("a")));
  assertTrue(result.contains(RowFactory.create("b")));
  assertEquals(2, result.size());
}
 
Example 9
Source Project: envelope   Source File: SelectDeriver.java    License: Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  dependencyCheck(dependencies);
  Dataset<Row> sourceStep = dependencies.get(stepName);
  if (useIncludeFields){
      if (!Arrays.asList(sourceStep.columns()).containsAll(includeFields)){
          throw new RuntimeException("Columns specified in " + INCLUDE_FIELDS + " are not found in input dependency schema \n" +
          "Available columns: " + Arrays.toString(sourceStep.columns()));
      }
      String firstCol = includeFields.get(0);
      includeFields.remove(0);
      return sourceStep.select(firstCol, includeFields.toArray(new String[0]));
  } else {
      if (!Arrays.asList(sourceStep.columns()).containsAll(excludeFields)){
          throw new RuntimeException("Columns specified in " + EXCLUDE_FIELDS + " are not found in input dependency schema \n" +
          "Available columns: " + Arrays.toString(sourceStep.columns()));
      }
      return sourceStep.drop(JavaConverters.collectionAsScalaIterableConverter(excludeFields).asScala().toSeq());
  }
}
 
Example 10
Source Project: bunsen   Source File: BundlesTest.java    License: Apache License 2.0
@Test
public void testXmlBundleStrings() {

  JavaRDD<String> xmlBundlesRdd = spark.sparkContext()
      .wholeTextFiles("src/test/resources/xml/bundles", 1)
      .toJavaRDD()
      .map(tuple -> tuple._2());

  Dataset<String> xmlBundles = spark.createDataset(xmlBundlesRdd.rdd(),
      Encoders.STRING());

  xmlBundles.write().saveAsTable("xml_bundle_table");

  JavaRDD<BundleContainer> bundles = BundlesTest.bundles.fromXml(
      spark.sql("select value from xml_bundle_table"), "value");

  Dataset<Row> patients = BundlesTest.bundles.extractEntry(spark,
      bundles,
      Patient.class);

  checkPatients(patients);
}
 
Example 11
Source Project: ExecDashboard   Source File: PerformanceCollector.java    License: Apache License 2.0
private void updateCollectorItemMetricDetail(CollectorItemMetricDetail collectorItemMetricDetail, Row row) {
    Date timeWindowDt = row.getAs(STR_TIMEWINDOW);
    List<String> performanceMetricList = Arrays.asList(STR_AVG_RESPONSE_TIME,STR_CALLSPER_MINUTE,STR_ERROR_RATE);
    GenericRowWithSchema pefMetrics = row.getAs("metrics");

    for(String perfMetric :performanceMetricList){
        double value;
        try {
            Long valueStr = pefMetrics.getAs(perfMetric);
            value = valueStr.doubleValue();
        }catch (IllegalArgumentException exception){
            value = 0.0;
        }

        MetricCount mc = getMetricCount("", value, perfMetric);
        if (!mc.getLabel().isEmpty()) {
            collectorItemMetricDetail.setStrategy(getCollectionStrategy());
            collectorItemMetricDetail.addCollectorItemMetricCount(timeWindowDt, mc);
            collectorItemMetricDetail.setLastScanDate(timeWindowDt);
        }
    }
}
 
Example 12
Source Project: systemds   Source File: RDDConverterUtils.java    License: Apache License 2.0
public static Dataset<Row> binaryBlockToDataFrame(SparkSession sparkSession,
                                                  JavaPairRDD<MatrixIndexes, MatrixBlock> in, DataCharacteristics mc, boolean toVector)
{
	if( !mc.colsKnown() )
		throw new RuntimeException("Number of columns needed to convert binary block to data frame.");
	
	//slice blocks into rows, align and convert into data frame rows
	JavaRDD<Row> rowsRDD = in
		.flatMapToPair(new SliceBinaryBlockToRowsFunction(mc.getBlocksize()))
		.groupByKey().map(new ConvertRowBlocksToRows((int)mc.getCols(), mc.getBlocksize(), toVector));
	
	//create data frame schema
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(DF_ID_COLUMN, DataTypes.DoubleType, false));
	if( toVector )
		fields.add(DataTypes.createStructField("C1", new VectorUDT(), false));
	else { // row
		for(int i = 1; i <= mc.getCols(); i++)
			fields.add(DataTypes.createStructField("C"+i, DataTypes.DoubleType, false));
	}
	
	//rdd to data frame conversion
	return sparkSession.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields));
}
 
Example 13
private Integer convertOutSchema(Dataset<Row> layoutDs, String fieldName,
                                 org.apache.spark.sql.types.DataType dataType) {
    StructField[] structFieldList = layoutDs.schema().fields();
    String[] columns = layoutDs.columns();

    int index = 0;
    StructField[] outStructFieldList = new StructField[structFieldList.length];
    for (int i = 0; i < structFieldList.length; i++) {
        if (columns[i].equalsIgnoreCase(fieldName)) {
            index = i;
            StructField structField = structFieldList[i];
            outStructFieldList[i] = new StructField(structField.name(), dataType, false, structField.metadata());
        } else {
            outStructFieldList[i] = structFieldList[i];
        }
    }

    OUT_SCHEMA = new StructType(outStructFieldList);

    return index;
}
 
Example 14
Source Project: mmtf-spark   Source File: StructureToInteractingResidues.java    License: Apache License 2.0
@Override
public Iterator<Row> call(Tuple2<String, StructureDataInterface> t) throws Exception {
	String structureId = t._1;
	StructureDataInterface structure = t._2;
	
	List<Integer> groupIndices = new ArrayList<>();
	List<String> groupNames = new ArrayList<>();		
	getGroupIndices(structure, groupIndices, groupNames);		

	List<Row> neighbors = new ArrayList<>();
	for (int i = 0; i < groupNames.size(); i++) {
		if (groupNames.get(i).equals(groupName)) {
			List<Integer> matches = new ArrayList<>();
			float[] boundingBox = calcBoundingBox(structure, groupIndices, i, cutoffDistance);
			matches.addAll(findNeighbors(structure, i, boundingBox, groupIndices));
			neighbors.addAll(getDistanceProfile(structureId, matches, i, groupIndices, groupNames, structure));
		}
	}
	
	return neighbors.iterator();
}
 
Example 15
Source Project: SparkDemo   Source File: JavaVectorAssemblerExample.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaVectorAssemblerExample")
    .getOrCreate();

  // $example on$
  StructType schema = createStructType(new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("hour", IntegerType, false),
    createStructField("mobile", DoubleType, false),
    createStructField("userFeatures", new VectorUDT(), false),
    createStructField("clicked", DoubleType, false)
  });
  Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
  Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema);

  VectorAssembler assembler = new VectorAssembler()
    .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
    .setOutputCol("features");

  Dataset<Row> output = assembler.transform(dataset);
  System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " +
      "'features'");
  output.select("features", "clicked").show(false);
  // $example off$

  spark.stop();
}
 
Example 16
Source Project: nemo   Source File: DataFrameReader.java    License: Apache License 2.0
@Override
public Dataset<Row> json(final RDD<String> jsonRDD) {
  final boolean userTriggered = initializeFunction(jsonRDD);
  final Dataset<Row> result = Dataset.from(super.json(jsonRDD));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 17
Source Project: geowave   Source File: GeoWaveSparkSQLIT.java    License: Apache License 2.0
@Test
public void testSpatialJoin() throws Exception {

  // Set up Spark
  final SparkSession session = SparkTestEnvironment.getInstance().getDefaultSession();

  final SqlQueryRunner queryRunner = new SqlQueryRunner();
  queryRunner.setSparkSession(session);

  // ingest test points
  TestUtils.testLocalIngest(dataStore, DimensionalityType.SPATIAL, HAIL_SHAPEFILE_FILE, 1);

  TestUtils.testLocalIngest(
      dataStore,
      DimensionalityType.SPATIAL,
      TORNADO_TRACKS_SHAPEFILE_FILE,
      1);

  try {
    // Run a valid SQL query that should do an optimized join
    queryRunner.addInputStore(dataStore, "hail", "hail");
    queryRunner.addInputStore(dataStore, "tornado_tracks", "tornado");
    queryRunner.setSql(
        "select hail.* from hail, tornado where GeomIntersects(hail.geom, tornado.geom)");
    final Dataset<Row> results = queryRunner.run();
    LOGGER.warn("Indexed intersect from sql returns: " + results.count() + " results.");
  } catch (final Exception e) {
    e.printStackTrace();
    TestUtils.deleteAll(dataStore);
    Assert.fail(
        "Error occurred while attempting optimized join from sql query runner: '"
            + e.getLocalizedMessage()
            + "'");
  }

  // Clean up
  TestUtils.deleteAll(dataStore);
}
 
Example 18
Source Project: envelope   Source File: MorphlineDeriver.java    License: Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  if (!dependencies.containsKey(stepName)) {
    throw new RuntimeException("Step not found in the dependencies list");
  }

  Dataset<Row> sourceStep = dependencies.get(stepName);

  // For each partition in the DataFrame / RDD
  JavaRDD<Row> outputRDD = sourceStep.toJavaRDD().flatMap(
      MorphlineUtils.morphlineMapper(this.morphlineFile, this.morphlineId, getSchema(), errorOnEmpty));

  // Convert all the Rows into a new DataFrame
  return Contexts.getSparkSession().createDataFrame(outputRDD, getSchema());
}
 
Example 19
Source Project: systemds   Source File: RemoteDPParForSpark.java    License: Apache License 2.0
@Override
public Tuple2<Long, Writable> call(Tuple2<Row, Long> arg0) 
	throws Exception 
{
	long rowix = arg0._2() + 1;
	
	//process row data
	int off = _containsID ? 1: 0;
	Object obj = _isVector ? arg0._1().get(off) : arg0._1();
	boolean sparse = (obj instanceof SparseVector);
	MatrixBlock mb = new MatrixBlock(1, (int)_clen, sparse);
	
	if( _isVector ) {
		Vector vect = (Vector) obj;
		if( vect instanceof SparseVector ) {
			SparseVector svect = (SparseVector) vect;
			int lnnz = svect.numNonzeros();
			for( int k=0; k<lnnz; k++ )
				mb.appendValue(0, svect.indices()[k], svect.values()[k]);
		}
		else { //dense
			for( int j=0; j<_clen; j++ )
				mb.appendValue(0, j, vect.apply(j));	
		}
	}
	else { //row
		Row row = (Row) obj;
		for( int j=off; j<off+_clen; j++ )
			mb.appendValue(0, j-off, UtilFunctions.getDouble(row.get(j)));
	}
	mb.examSparsity();
	return new Tuple2<>(rowix, new PairWritableBlock(new MatrixIndexes(1,1),mb));
}
 
Example 20
@Test(expected=RuntimeException.class)
public void testStringIndexerForUnseenValues() {

    //prepare data
    StructType schema = createStructType(new StructField[]{
            createStructField("id", IntegerType, false),
            createStructField("label", DoubleType, false)
    });
    List<Row> trainingData = Arrays.asList(
            cr(0, 1.0), cr(1, 2.0), cr(2, 3.0), cr(3, 1.0), cr(4, 1.0), cr(5, 3.0));
    DataFrame dataset = sqlContext.createDataFrame(trainingData, schema);

    //train model in spark
    StringIndexerModel model = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex").fit(dataset);

    //Export this model
    byte[] exportedModel = ModelExporter.export(model, dataset);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //unseen value
    Map<String, Object> data = new HashMap<String, Object>();
    data.put(model.getInputCol(), 7.0);
    transformer.transform(data);
}
 
Example 21
Source Project: mmtf-spark   Source File: AtomInteraction.java    License: Apache License 2.0
/**
 * Returns interactions and geometric information in a single row.
 * 
 * @return row of interactions and geometric information
 */
public Row getMultipleInteractionsAsRow(int maxInteractions) {
	// pad interaction centers and distances with nulls, if necessary,
	// since each row must be of fixed length
	while (getNumInteractions() < maxInteractions) {
		neighbors.add(new InteractionCenter());
	}

	int length = InteractionCenter.getLength();

	Object[] data = new Object[getNumColumns(maxInteractions)];

	int index = 0;
	data[index++] = structureId;
	data[index++] = getNumberOfPolymerChains();
	
	calcCoordinationGeometry(maxInteractions);
	data[index++] = q3;
	data[index++] = q4;
	data[index++] = q5;
	data[index++] = q6;
	

	// copy data for query atom
	System.arraycopy(center.getAsObject(), 0, data, index, length);
	index += length;

	// copy data for interacting atoms
	for (int i = 0; i < neighbors.size(); i++) {
		System.arraycopy(neighbors.get(i).getAsObject(), 0, data, index, length);
		index += length;
		data[index++] = distances[i];
	}

	// copy angles
	System.arraycopy(angles, 0, data, index, angles.length);
	index += angles.length;

	return RowFactory.create(data);
}
 
Example 22
Source Project: net.jgp.labs.spark   Source File: S3CsvToDataset2.java    License: Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("CSV on S3 to Dataset<Row>")
      .master("spark://10.0.100.81:7077")
      .config("spark.executor.memory", "1g")
      .config("spark.executor.cores", "1")
      .config("spark.cores.max", "2")
      .config("spark.driver.host", "10.0.100.182")
      .config("spark.executor.extraClassPath",
          "/home/jgp/net.jgp.labs.spark/target/labs-spark-2.2.0-jar-with-dependencies.jar")
      .getOrCreate();

  spark.sparkContext().hadoopConfiguration().set("fs.s3a.access.key",
      "xxx");
  spark.sparkContext().hadoopConfiguration().set("fs.s3a.secret.key",
      "xxx");
  // spark.sparkContext().hadoopConfiguration().set("fs.s3n.endpoint",
  // "us-east-2");
  String bucket = "bucket_name";
  String key = "key";

  String filename = "s3a://" + bucket + "/" + key;

  Dataset<Row> df = spark.read()
      .format("csv")
      .option("inferSchema", "true")
      .option("header", "false")
      .option("sep", "|")
      .load(filename);
  df.show();
  df.printSchema();
}
 
Example 23
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {

    /*
    TODO: Not working yet
    // if output format is set to "csv" create both: csv and parquet
    if(SparkImporterKafkaImportArguments.getInstance().getOutputFormat().equals(SparkImporterVariables.OUTPUT_FORMAT_CSV)) {
        dataset
            .write()
            .option("header", "true")
            .option("delimiter", ";")
            .option("ignoreLeadingWhiteSpace", "false")
            .option("ignoreTrailingWhiteSpace", "false")
            .mode(SparkImporterVariables.getSaveMode())
            .csv(SparkImporterVariables.getTargetFolder());
    }
    */

    dataset
            // repartition the data by process instance, which allows Spark to better distribute the data between workers, as the operations are related to a process instance
            .repartition(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID))
            .write()
            .mode(SaveMode.Append)
            .save(config.getTargetFolder());

    return dataset;
}
 
Example 24
Source Project: stocator   Source File: TestSuite.java    License: Apache License 2.0
private void countAndCompare(Dataset<Row> inSpark, long readRecords, String msg) throws Exception {
  long totalInSpark = inSpark.count();
  if (totalInSpark != readRecords) {
    System.out.println("*********************************");
    System.out.println(msg + ": Records that were written into object store doesn't match");
    System.out.println(msg + ": Readed from object store: " + readRecords + ", expected: " + totalInSpark);
    throw new Exception(msg + ": Readed from object store: " + readRecords + ", expected: " + totalInSpark);
  } else {
    System.out.println(
        msg + " Completed successfully. Readed from object store: " + readRecords + ", expected: " + totalInSpark);
  }
}
 
Example 25
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataSet, Map<String, Object> parameters, SparkRunnerConfig config) {

    //check if all variables that should be hashed actually exist, otherwise log a warning
    List<String> existingColumns = new ArrayList<>(Arrays.asList(dataSet.columns()));

    Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
    if(configuration != null) {
        PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration();
        if(preprocessingConfiguration != null) {
            for(ColumnHashConfiguration chc : preprocessingConfiguration.getColumnHashConfiguration()) {
                if(chc.isHashColumn()) {
                    if(!existingColumns.contains(chc.getColumnName())) {
                        // log the fact that a column that should be hashed does not exist
                        BpmnaiLogger.getInstance().writeWarn("The column '" + chc.getColumnName() + "' is configured to be hashed, but does not exist in the data.");
                    } else {
                        dataSet = dataSet.withColumn(chc.getColumnName(), sha1(dataSet.col(chc.getColumnName())));
                        BpmnaiLogger.getInstance().writeInfo("The column '" + chc.getColumnName() + "' is being hashed.");
                    }
                }

            }
        }
    }

    if(config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataSet, "column_hash_step", config);
    }

    return dataSet;
}
 
Example 26
Source Project: incubator-nemo   Source File: Dataset.java    License: Apache License 2.0
@Override
public Dataset<Row> withColumn(final String colName, final Column col) {
  final boolean userTriggered = initializeFunction(colName, col);
  final Dataset<Row> result = from(super.withColumn(colName, col));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 27
Source Project: toolbox   Source File: DataSparkFromRDD.java    License: Apache License 2.0
@Override
public DataFrame getDataFrame(SQLContext sql) {

    // Obtain the schema
    StructType schema = SchemaConverter.getSchema(attributes);

    // Transform the RDD
    JavaRDD<Row> rowRDD = DataFrameOps.toRowRDD(amidstRDD, attributes);

    // Create the DataFrame
    return sql.createDataFrame(rowRDD, schema);
}
 
Example 28
Source Project: nemo   Source File: DataFrameReader.java    License: Apache License 2.0
@Override
public Dataset<Row> csv(final org.apache.spark.sql.Dataset<String> csvDataset) {
  final boolean userTriggered = initializeFunction(csvDataset);
  final Dataset<Row> result = Dataset.from(super.csv(csvDataset));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example 29
Source Project: bunsen   Source File: AbstractConceptMaps.java    License: Apache License 2.0
protected AbstractConceptMaps(SparkSession spark,
    FhirVersionEnum fhirVersion,
    Dataset<UrlAndVersion> members,
    Dataset<Row> conceptMaps,
    Dataset<Mapping> mappings,
    SparkRowConverter conceptMapRowConverter) {

  this.spark = spark;
  this.fhirVersion = fhirVersion;
  this.members = members;
  this.conceptMaps = conceptMaps;
  this.mappings = mappings;
  this.conceptMapRowConverter = conceptMapRowConverter;
}
 
Example 30
Source Project: systemds   Source File: MLContextTest.java    License: Apache License 2.0
@Test
public void testOutputDataFrameVectorsWithIDColumnFromMatrixDML() {
	System.out.println("MLContextTest - output DataFrame of vectors with ID column from matrix DML");

	String s = "M = matrix('1 2 3 4', rows=1, cols=4);";
	Script script = dml(s).out("M");
	Dataset<Row> df = ml.execute(script).getMatrix("M").toDFVectorWithIDColumn();
	List<Row> list = df.collectAsList();

	Row row = list.get(0);
	Assert.assertEquals(1.0, row.getDouble(0), 0.0);
	Assert.assertArrayEquals(new double[] { 1.0, 2.0, 3.0, 4.0 }, ((Vector) row.get(1)).toArray(), 0.0);
}