org.apache.spark.sql.Row Java Examples

The following examples show how to use org.apache.spark.sql.Row. You can go to the original project or source file by following the links above each example.
Example #1
Source File: SparkDataSourceManager.java    From DDF with Apache License 2.0
@Override
public DDF loadFromJDBC(JDBCDataSourceDescriptor dataSource) throws DDFException {
    SparkDDFManager sparkDDFManager = (SparkDDFManager)mDDFManager;
    HiveContext sqlContext = sparkDDFManager.getHiveContext();

    JDBCDataSourceCredentials cred = (JDBCDataSourceCredentials)dataSource.getDataSourceCredentials();
    String fullURL = dataSource.getDataSourceUri().getUri().toString();
    if (cred.getUsername() != null &&  !cred.getUsername().equals("")) {
        fullURL += String.format("?user=%s&password=%s", cred.getUsername(), cred.getPassword());
    }

    Map<String, String> options = new HashMap<String, String>();
    options.put("url", fullURL);
    options.put("dbtable", dataSource.getDbTable());
    DataFrame df = sqlContext.load("jdbc", options);

    DDF ddf = sparkDDFManager.newDDF(sparkDDFManager, df, new Class<?>[]{DataFrame.class},
        null, SparkUtils.schemaFromDataFrame(df));
    // TODO?
    ddf.getRepresentationHandler().get(RDD.class, Row.class);
    ddf.getMetaDataHandler().setDataSourceDescriptor(dataSource);
    return ddf;
}
 
Example #2
Source File: GroupConcatDistinctUDAF.java    From BigDataPlatform with GNU General Public License v3.0
/**
 * Update: the values of the grouped column are passed in one at a time,
 * and this method implements the concatenation logic.
 */
@Override
public void update(MutableAggregationBuffer buffer, Row input) {
	// the city-info string that has already been concatenated in the buffer
	String bufferCityInfo = buffer.getString(0);
	// the city info that was just passed in
	String cityInfo = input.getString(0);
	
	// de-duplication logic:
	// append the new city info only if it has not been concatenated before
	if(!bufferCityInfo.contains(cityInfo)) {
		if("".equals(bufferCityInfo)) {
			bufferCityInfo += cityInfo;
		} else {
			// e.g. "1:Beijing"
			// becomes "1:Beijing,2:Shanghai"
			bufferCityInfo += "," + cityInfo;
		}
		
		buffer.update(0, bufferCityInfo);  
	}
}
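For context, a minimal sketch of how such a UDAF could be registered and called from Spark SQL follows. The function name, table, and column names below are illustrative assumptions, not taken from the project:

// Hypothetical registration and use of the UDAF; all names here are illustrative only.
sqlContext.udf().register("group_concat_distinct", new GroupConcatDistinctUDAF());
DataFrame cityInfos = sqlContext.sql(
    "SELECT area, group_concat_distinct(city_info) AS city_infos "
        + "FROM tmp_area_city_info GROUP BY area");
cityInfos.show();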
 
Example #3
Source File: SparkUtil.java    From kylin with Apache License 2.0
private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
    SparkSession sparkSession = SparkSession.builder().sparkContext(HiveUtils.withHiveExternalCatalog(sc.sc()))
            .config(sc.getConf()).enableHiveSupport().getOrCreate();
    final Dataset<Row> intermediateTable = sparkSession.table(hiveTable);
    return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
        @Override
        public String[] call(Row row) throws Exception {
            String[] result = new String[row.size()];
            for (int i = 0; i < row.size(); i++) {
                final Object o = row.get(i);
                if (o != null) {
                    result[i] = o.toString();
                } else {
                    result[i] = null;
                }
            }
            return result;
        }
    });
}
 
Example #4
Source File: AvroTranslator.java    From envelope with Apache License 2.0
private Row rowForRecord(GenericRecord record) {
  List<Object> values = Lists.newArrayList();

  for (Field field : record.getSchema().getFields()) {
    Object value = record.get(field.name());

    Type fieldType = field.schema().getType();
    if (fieldType.equals(Type.UNION)) {
      fieldType = field.schema().getTypes().get(1).getType();
    }
    // Avro returns Utf8s for strings, which Spark SQL doesn't know how to use
    if (fieldType.equals(Type.STRING) && value != null) {
      value = value.toString();
    }
    // Avro returns binary as a ByteBuffer, but Spark SQL wants a byte[]
    if (fieldType.equals(Type.BYTES) && value != null) {
      value = ((ByteBuffer)value).array();
    }

    values.add(value);
  }

  return new RowWithSchema(schema, values.toArray());
}
 
Example #5
Source File: TestRangeRowRule.java    From envelope with Apache License 2.0
@Test
public void testDontIgnoreNulls() {
  StructType schema = new StructType(new StructField[] {
      new StructField("name", DataTypes.StringType, false, Metadata.empty()),
      new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
      new StructField("age", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
  });

  Map<String, Object> configMap = new HashMap<>();
  configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("age"));
  configMap.put(RangeRowRule.FIELD_TYPE_CONFIG, "int");
  configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList(0,105));
  Config config = ConfigFactory.parseMap(configMap);

  RangeRowRule rule = new RangeRowRule();
  assertNoValidationFailures(rule, config);
  rule.configure(config);
  rule.configureName("agerange");

  Row row1 = new RowWithSchema(schema, "Ian", "Ian", null, new BigDecimal("0.00"));
  assertFalse("Row should not pass rule", rule.check(row1));
}
 
Example #6
Source File: DeepSparkContextTest.java    From deep-spark with Apache License 2.0
@Test
public void createJavaSchemaRDDTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = PowerMockito.mock(SQLContext.class);
    ExtractorConfig config = createDeepJobConfig();
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    PowerMockito.doReturn(singleRdd).when(deepSparkContextSpy).createJavaRDD(config);
    JavaRDD<Row> rowRDD = mock(JavaRDD.class);
    mockStatic(DeepSparkContext.class);
    when(DeepSparkContext.createJavaRowRDD(singleRdd)).thenReturn(rowRDD);
    Cells cells = mock(Cells.class);
    when(singleRdd.first()).thenReturn(cells);
    StructType schema = mock(StructType.class);
    mockStatic(CellsUtils.class);
    when(CellsUtils.getStructTypeFromCells(cells)).thenReturn(schema);

    deepSparkContextSpy.createJavaSchemaRDD(config);

    verify(sqlContext).applySchema(rowRDD, schema);
}
 
Example #7
Source File: SubStringCounterRelation.java    From net.jgp.labs.spark with Apache License 2.0
@Override
public RDD<Row> buildScan() {
  log.debug("-> buildScan()");

  // I have isolated the work to a method to keep the plumbing code as simple as possible.
  List<List<Integer>> table = collectData();

  @SuppressWarnings("resource") // cannot be closed here, done elsewhere
  JavaSparkContext sparkContext = new JavaSparkContext(sqlContext
      .sparkContext());
  JavaRDD<Row> rowRDD = sparkContext.parallelize(table)
      .map(row -> RowFactory.create(row.toArray()));

  return rowRDD.rdd();
}
 
Example #8
Source File: TestPassthroughDeriver.java    From envelope with Apache License 2.0
@Test
public void testPassthrough() throws Exception {
  StructType schema = DataTypes.createStructType(Lists.<StructField>newArrayList(
      DataTypes.createStructField("col1", DataTypes.StringType, false)));
  Dataset<Row> dep1 = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create("a")), schema);
  Dataset<Row> dep2 = Contexts.getSparkSession().createDataFrame(
      Lists.newArrayList(RowFactory.create("b")), schema);
  Map<String, Dataset<Row>> dependencies = Maps.newHashMap();
  dependencies.put("dep1", dep1);
  dependencies.put("dep2", dep2);

  Deriver deriver = new PassthroughDeriver();

  List<Row> result = deriver.derive(dependencies).collectAsList();

  assertTrue(result.contains(RowFactory.create("a")));
  assertTrue(result.contains(RowFactory.create("b")));
  assertEquals(2, result.size());
}
 
Example #9
Source File: BundlesTest.java    From bunsen with Apache License 2.0
@Test
public void testXmlBundleStrings() {

  JavaRDD<String> xmlBundlesRdd = spark.sparkContext()
      .wholeTextFiles("src/test/resources/xml/bundles", 1)
      .toJavaRDD()
      .map(tuple -> tuple._2());

  Dataset<String> xmlBundles = spark.createDataset(xmlBundlesRdd.rdd(),
      Encoders.STRING());

  xmlBundles.write().saveAsTable("xml_bundle_table");

  JavaRDD<BundleContainer> bundles = BundlesTest.bundles.fromXml(
      spark.sql("select value from xml_bundle_table"), "value");

  Dataset<Row> patients = BundlesTest.bundles.extractEntry(spark,
      bundles,
      Patient.class);

  checkPatients(patients);
}
 
Example #10
Source File: PerformanceCollector.java    From ExecDashboard with Apache License 2.0
private void updateCollectorItemMetricDetail(CollectorItemMetricDetail collectorItemMetricDetail, Row row) {
    Date timeWindowDt = row.getAs(STR_TIMEWINDOW);
    List<String> performanceMetricList = Arrays.asList(STR_AVG_RESPONSE_TIME, STR_CALLSPER_MINUTE, STR_ERROR_RATE);
    GenericRowWithSchema perfMetrics = row.getAs("metrics");

    for (String perfMetric : performanceMetricList) {
        double value;
        try {
            Long metricValue = perfMetrics.getAs(perfMetric);
            value = metricValue.doubleValue();
        } catch (IllegalArgumentException exception) {
            value = 0.0;
        }

        MetricCount mc = getMetricCount("", value, perfMetric);
        if (!mc.getLabel().isEmpty()) {
            collectorItemMetricDetail.setStrategy(getCollectionStrategy());
            collectorItemMetricDetail.addCollectorItemMetricCount(timeWindowDt, mc);
            collectorItemMetricDetail.setLastScanDate(timeWindowDt);
        }
    }
}
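The row.getAs("metrics") call above shows that a nested struct column comes back as a Row (here a GenericRowWithSchema). A minimal sketch of that access pattern, with field names that are assumptions rather than taken from the project:

// Illustrative only: reading a field of a nested struct column via getAs.
// The "metrics" struct and "avgResponseTime" field names are assumed.
Row metrics = row.getAs("metrics");
Long avgResponseTime = metrics.getAs("avgResponseTime");
double value = (avgResponseTime != null) ? avgResponseTime.doubleValue() : 0.0;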
 
Example #11
Source File: RDDConverterUtils.java    From systemds with Apache License 2.0
public static Dataset<Row> binaryBlockToDataFrame(SparkSession sparkSession,
                                                  JavaPairRDD<MatrixIndexes, MatrixBlock> in, DataCharacteristics mc, boolean toVector)
{
	if( !mc.colsKnown() )
		throw new RuntimeException("Number of columns needed to convert binary block to data frame.");
	
	//slice blocks into rows, align and convert into data frame rows
	JavaRDD<Row> rowsRDD = in
		.flatMapToPair(new SliceBinaryBlockToRowsFunction(mc.getBlocksize()))
		.groupByKey().map(new ConvertRowBlocksToRows((int)mc.getCols(), mc.getBlocksize(), toVector));
	
	//create data frame schema
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField(DF_ID_COLUMN, DataTypes.DoubleType, false));
	if( toVector )
		fields.add(DataTypes.createStructField("C1", new VectorUDT(), false));
	else { // row
		for(int i = 1; i <= mc.getCols(); i++)
			fields.add(DataTypes.createStructField("C"+i, DataTypes.DoubleType, false));
	}
	
	//rdd to data frame conversion
	return sparkSession.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields));
}
 
Example #12
Source File: NManualBuildAndQueryCuboidTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private Integer convertOutSchema(Dataset<Row> layoutDs, String fieldName,
                                 org.apache.spark.sql.types.DataType dataType) {
    StructField[] structFieldList = layoutDs.schema().fields();
    String[] columns = layoutDs.columns();

    int index = 0;
    StructField[] outStructFieldList = new StructField[structFieldList.length];
    for (int i = 0; i < structFieldList.length; i++) {
        if (columns[i].equalsIgnoreCase(fieldName)) {
            index = i;
            StructField structField = structFieldList[i];
            outStructFieldList[i] = new StructField(structField.name(), dataType, false, structField.metadata());
        } else {
            outStructFieldList[i] = structFieldList[i];
        }
    }

    OUT_SCHEMA = new StructType(outStructFieldList);

    return index;
}
 
Example #13
Source File: StructureToInteractingResidues.java    From mmtf-spark with Apache License 2.0
@Override
public Iterator<Row> call(Tuple2<String, StructureDataInterface> t) throws Exception {
	String structureId = t._1;
	StructureDataInterface structure = t._2;
	
	List<Integer> groupIndices = new ArrayList<>();
	List<String> groupNames = new ArrayList<>();		
	getGroupIndices(structure, groupIndices, groupNames);		

	List<Row> neighbors = new ArrayList<>();
	for (int i = 0; i < groupNames.size(); i++) {
		if (groupNames.get(i).equals(groupName)) {
			List<Integer> matches = new ArrayList<>();
			float[] boundingBox = calcBoundingBox(structure, groupIndices, i, cutoffDistance);
			matches.addAll(findNeighbors(structure, i, boundingBox, groupIndices));
			neighbors.addAll(getDistanceProfile(structureId, matches, i, groupIndices, groupNames, structure));
		}
	}
	
	return neighbors.iterator();
}
 
Example #14
Source File: JavaVectorAssemblerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaVectorAssemblerExample")
    .getOrCreate();

  // $example on$
  StructType schema = createStructType(new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("hour", IntegerType, false),
    createStructField("mobile", DoubleType, false),
    createStructField("userFeatures", new VectorUDT(), false),
    createStructField("clicked", DoubleType, false)
  });
  Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
  Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema);

  VectorAssembler assembler = new VectorAssembler()
    .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
    .setOutputCol("features");

  Dataset<Row> output = assembler.transform(dataset);
  System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " +
      "'features'");
  output.select("features", "clicked").show(false);
  // $example off$

  spark.stop();
}
 
Example #15
Source File: DataFrameReader.java    From nemo with Apache License 2.0
@Override
public Dataset<Row> json(final RDD<String> jsonRDD) {
  final boolean userTriggered = initializeFunction(jsonRDD);
  final Dataset<Row> result = Dataset.from(super.json(jsonRDD));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example #16
Source File: GeoWaveSparkSQLIT.java    From geowave with Apache License 2.0
@Test
public void testSpatialJoin() throws Exception {

  // Set up Spark
  final SparkSession session = SparkTestEnvironment.getInstance().getDefaultSession();

  final SqlQueryRunner queryRunner = new SqlQueryRunner();
  queryRunner.setSparkSession(session);

  // ingest test points
  TestUtils.testLocalIngest(dataStore, DimensionalityType.SPATIAL, HAIL_SHAPEFILE_FILE, 1);

  TestUtils.testLocalIngest(
      dataStore,
      DimensionalityType.SPATIAL,
      TORNADO_TRACKS_SHAPEFILE_FILE,
      1);

  try {
    // Run a valid sql query that should do an optimized join
    queryRunner.addInputStore(dataStore, "hail", "hail");
    queryRunner.addInputStore(dataStore, "tornado_tracks", "tornado");
    queryRunner.setSql(
        "select hail.* from hail, tornado where GeomIntersects(hail.geom, tornado.geom)");
    final Dataset<Row> results = queryRunner.run();
    LOGGER.warn("Indexed intersect from sql returns: " + results.count() + " results.");
  } catch (final Exception e) {
    e.printStackTrace();
    TestUtils.deleteAll(dataStore);
    Assert.fail(
        "Error occurred while attempting optimized join from sql query runner: '"
            + e.getLocalizedMessage()
            + "'");
  }

  // Clean up
  TestUtils.deleteAll(dataStore);
}
 
Example #17
Source File: MorphlineDeriver.java    From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  if (!dependencies.containsKey(stepName)) {
    throw new RuntimeException("Step not found in the dependencies list");
  }

  Dataset<Row> sourceStep = dependencies.get(stepName);

  // For each partition in the DataFrame / RDD
  JavaRDD<Row> outputRDD = sourceStep.toJavaRDD().flatMap(
      MorphlineUtils.morphlineMapper(this.morphlineFile, this.morphlineId, getSchema(), errorOnEmpty));

  // Convert all the Rows into a new DataFrame
  return Contexts.getSparkSession().createDataFrame(outputRDD, getSchema());
}
 
Example #18
Source File: RemoteDPParForSpark.java    From systemds with Apache License 2.0
@Override
public Tuple2<Long, Writable> call(Tuple2<Row, Long> arg0) 
	throws Exception 
{
	long rowix = arg0._2() + 1;
	
	//process row data
	int off = _containsID ? 1: 0;
	Object obj = _isVector ? arg0._1().get(off) : arg0._1();
	boolean sparse = (obj instanceof SparseVector);
	MatrixBlock mb = new MatrixBlock(1, (int)_clen, sparse);
	
	if( _isVector ) {
		Vector vect = (Vector) obj;
		if( vect instanceof SparseVector ) {
			SparseVector svect = (SparseVector) vect;
			int lnnz = svect.numNonzeros();
			for( int k=0; k<lnnz; k++ )
				mb.appendValue(0, svect.indices()[k], svect.values()[k]);
		}
		else { //dense
			for( int j=0; j<_clen; j++ )
				mb.appendValue(0, j, vect.apply(j));	
		}
	}
	else { //row
		Row row = (Row) obj;
		for( int j=off; j<off+_clen; j++ )
			mb.appendValue(0, j-off, UtilFunctions.getDouble(row.get(j)));
	}
	mb.examSparsity();
	return new Tuple2<>(rowix, new PairWritableBlock(new MatrixIndexes(1,1),mb));
}
 
Example #19
Source File: StringIndexerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test(expected=RuntimeException.class)
public void testStringIndexerForUnseenValues() {

    //prepare data
    StructType schema = createStructType(new StructField[]{
            createStructField("id", IntegerType, false),
            createStructField("label", DoubleType, false)
    });
    List<Row> trainingData = Arrays.asList(
            cr(0, 1.0), cr(1, 2.0), cr(2, 3.0), cr(3, 1.0), cr(4, 1.0), cr(5, 3.0));
    DataFrame dataset = sqlContext.createDataFrame(trainingData, schema);

    //train model in spark
    StringIndexerModel model = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex").fit(dataset);

    //Export this model
    byte[] exportedModel = ModelExporter.export(model, dataset);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //unseen value
    Map<String, Object> data = new HashMap<String, Object>();
    data.put(model.getInputCol(), 7.0);
    transformer.transform(data);
}
 
Example #20
Source File: AtomInteraction.java    From mmtf-spark with Apache License 2.0
/**
 * Returns interactions and geometric information in a single row.
 * 
 * @return row of interactions and geometric information
 */
public Row getMultipleInteractionsAsRow(int maxInteractions) {
	// pad interaction centers and distances with nulls, if necessary,
	// since each row must be of fixed length
	while (getNumInteractions() < maxInteractions) {
		neighbors.add(new InteractionCenter());
	}

	int length = InteractionCenter.getLength();

	Object[] data = new Object[getNumColumns(maxInteractions)];

	int index = 0;
	data[index++] = structureId;
	data[index++] = getNumberOfPolymerChains();
	
	calcCoordinationGeometry(maxInteractions);
	data[index++] = q3;
	data[index++] = q4;
	data[index++] = q5;
	data[index++] = q6;
	

	// copy data for query atom
	System.arraycopy(center.getAsObject(), 0, data, index, length);
	index += length;

	// copy data for interacting atoms
	for (int i = 0; i < neighbors.size(); i++) {
		System.arraycopy(neighbors.get(i).getAsObject(), 0, data, index, length);
		index += length;
		data[index++] = distances[i];
	}

	// copy angles
	System.arraycopy(angles, 0, data, index, angles.length);
	index += angles.length;

	return RowFactory.create(data);
}
 
Example #21
Source File: S3CsvToDataset2.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("CSV on S3 to Dataset<Row>")
      .master("spark://10.0.100.81:7077")
      .config("spark.executor.memory", "1g")
      .config("spark.executor.cores", "1")
      .config("spark.cores.max", "2")
      .config("spark.driver.host", "10.0.100.182")
      .config("spark.executor.extraClassPath",
          "/home/jgp/net.jgp.labs.spark/target/labs-spark-2.2.0-jar-with-dependencies.jar")
      .getOrCreate();

  spark.sparkContext().hadoopConfiguration().set("fs.s3a.access.key",
      "xxx");
  spark.sparkContext().hadoopConfiguration().set("fs.s3a.secret.key",
      "xxx");
  // spark.sparkContext().hadoopConfiguration().set("fs.s3n.endpoint",
  // "us-east-2");
  String bucket = "bucket_name";
  String key = "key";

  String filename = "s3a://" + bucket + "/" + key;

  Dataset<Row> df = spark.read()
      .format("csv")
      .option("inferSchema", "true")
      .option("header", "false")
      .option("sep", "|")
      .load(filename);
  df.show();
  df.printSchema();
}
 
Example #22
Source File: WriteToDataSinkStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {

    /*
    TODO: Not working yet
	// if output format is set to "csv" create both: csv and parquet 
	if(SparkImporterKafkaImportArguments.getInstance().getOutputFormat().equals(SparkImporterVariables.OUTPUT_FORMAT_CSV)) {
		dataset
        .write()
        .option("header", "true")
        .option("delimiter", ";")
        .option("ignoreLeadingWhiteSpace", "false")
        .option("ignoreTrailingWhiteSpace", "false")
        .mode(SparkImporterVariables.getSaveMode())
        .csv(SparkImporterVariables.getTargetFolder());
	}
	*/
  
	dataset
            //we repartition the data by process instances, which allows spark to better distribute the data between workers as the operations are related to a process instance
            .repartition(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID))
            .write()
            .mode(SaveMode.Append)
            .save(config.getTargetFolder());

    return dataset;
}
 
Example #23
Source File: TestSuite.java    From stocator with Apache License 2.0
private void countAndCompare(Dataset<Row> inSpark, long readRecords, String msg) throws Exception {
  long totalInSpark = inSpark.count();
  if (totalInSpark != readRecords) {
    System.out.println("*********************************");
    System.out.println(msg + ": Records that were written into the object store don't match");
    System.out.println(msg + ": Read from object store: " + readRecords + ", expected: " + totalInSpark);
    throw new Exception(msg + ": Read from object store: " + readRecords + ", expected: " + totalInSpark);
  } else {
    System.out.println(
        msg + " completed successfully. Read from object store: " + readRecords + ", expected: " + totalInSpark);
  }
}
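A hypothetical call site for this helper, assuming a Parquet output path and a known record count to compare against (both are made up for illustration):

// Illustrative only: verify that a dataset read back from the object store
// has the expected number of records. Path and count are assumptions.
Dataset<Row> readBack = spark.read().parquet("/tmp/test-output.parquet");
countAndCompare(readBack, 100L, "Parquet round trip");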
 
Example #24
Source File: ColumnHashStep.java    From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataSet, Map<String, Object> parameters, SparkRunnerConfig config) {

    //check if all variables that should be hashed actually exist, otherwise log a warning
    List<String> existingColumns = new ArrayList<>(Arrays.asList(dataSet.columns()));

    Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
    if(configuration != null) {
        PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration();
        if(preprocessingConfiguration != null) {
            for(ColumnHashConfiguration chc : preprocessingConfiguration.getColumnHashConfiguration()) {
                if(chc.isHashColumn()) {
                    if(!existingColumns.contains(chc.getColumnName())) {
                        // log the fact that a column that should be hashed does not exist
                        BpmnaiLogger.getInstance().writeWarn("The column '" + chc.getColumnName() + "' is configured to be hashed, but does not exist in the data.");
                    } else {
                        dataSet = dataSet.withColumn(chc.getColumnName(), sha1(dataSet.col(chc.getColumnName())));
                        BpmnaiLogger.getInstance().writeInfo("The column '" + chc.getColumnName() + "' is being hashed.");
                    }
                }

            }
        }
    }

    if(config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataSet, "column_hash_step", config);
    }

    return dataSet;
}
 
Example #25
Source File: Dataset.java    From incubator-nemo with Apache License 2.0
@Override
public Dataset<Row> withColumn(final String colName, final Column col) {
  final boolean userTriggered = initializeFunction(colName, col);
  final Dataset<Row> result = from(super.withColumn(colName, col));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example #26
Source File: DataSparkFromRDD.java    From toolbox with Apache License 2.0
@Override
public DataFrame getDataFrame(SQLContext sql) {

    // Obtain the schema
    StructType schema = SchemaConverter.getSchema(attributes);

    // Transform the RDD
    JavaRDD<Row> rowRDD = DataFrameOps.toRowRDD(amidstRDD, attributes);

    // Create the DataFrame
    return sql.createDataFrame(rowRDD, schema);
}
 
Example #27
Source File: DataFrameReader.java    From nemo with Apache License 2.0
@Override
public Dataset<Row> csv(final org.apache.spark.sql.Dataset<String> csvDataset) {
  final boolean userTriggered = initializeFunction(csvDataset);
  final Dataset<Row> result = Dataset.from(super.csv(csvDataset));
  this.setIsUserTriggered(userTriggered);
  return result;
}
 
Example #28
Source File: AbstractConceptMaps.java    From bunsen with Apache License 2.0
protected AbstractConceptMaps(SparkSession spark,
    FhirVersionEnum fhirVersion,
    Dataset<UrlAndVersion> members,
    Dataset<Row> conceptMaps,
    Dataset<Mapping> mappings,
    SparkRowConverter conceptMapRowConverter) {

  this.spark = spark;
  this.fhirVersion = fhirVersion;
  this.members = members;
  this.conceptMaps = conceptMaps;
  this.mappings = mappings;
  this.conceptMapRowConverter = conceptMapRowConverter;
}
 
Example #29
Source File: MLContextTest.java    From systemds with Apache License 2.0
@Test
public void testOutputDataFrameVectorsWithIDColumnFromMatrixDML() {
	System.out.println("MLContextTest - output DataFrame of vectors with ID column from matrix DML");

	String s = "M = matrix('1 2 3 4', rows=1, cols=4);";
	Script script = dml(s).out("M");
	Dataset<Row> df = ml.execute(script).getMatrix("M").toDFVectorWithIDColumn();
	List<Row> list = df.collectAsList();

	Row row = list.get(0);
	Assert.assertEquals(1.0, row.getDouble(0), 0.0);
	Assert.assertArrayEquals(new double[] { 1.0, 2.0, 3.0, 4.0 }, ((Vector) row.get(1)).toArray(), 0.0);
}
 
Example #30
Source File: TestPivotDeriver.java    From envelope with Apache License 2.0
@Test
public void testMultipleFieldEntityKeyPivot() throws Exception {
  List<Row> sourceList = Lists.newArrayList(
      RowFactory.create("A", "AA", "AAA", "hello", "1"),
      RowFactory.create("A", "AA", "AAA", "world", "2"),
      RowFactory.create("B", "BB", "BBB", "hello", "3"),
      RowFactory.create("C", "CC", "CCC", "world", "4"));
  StructType schema = DataTypes.createStructType(Lists.newArrayList(
    DataTypes.createStructField("entity_id1", DataTypes.StringType, true),
    DataTypes.createStructField("entity_id2", DataTypes.StringType, true),
    DataTypes.createStructField("entity_id3", DataTypes.StringType, true),
    DataTypes.createStructField("key", DataTypes.StringType, true),
    DataTypes.createStructField("value", DataTypes.StringType, true)
  ));
  Dataset<Row> source = Contexts.getSparkSession().createDataFrame(sourceList, schema);

  Map<String, Dataset<Row>> dependencies = Maps.newHashMap();
  dependencies.put("source", source);
  
  Config config = ConfigFactory.empty()
      .withValue(PivotDeriver.STEP_NAME_CONFIG, ConfigValueFactory.fromAnyRef("source"))
      .withValue(PivotDeriver.ENTITY_KEY_FIELD_NAMES_CONFIG, ConfigValueFactory.fromAnyRef(
          Lists.newArrayList("entity_id1", "entity_id2", "entity_id3")))
      .withValue(PivotDeriver.PIVOT_KEY_FIELD_NAME_CONFIG, ConfigValueFactory.fromAnyRef("key"))
      .withValue(PivotDeriver.PIVOT_VALUE_FIELD_NAME_CONFIG, ConfigValueFactory.fromAnyRef("value"));

  PivotDeriver d = new PivotDeriver();
  assertNoValidationFailures(d, config);
  d.configure(config);
  
  List<Row> results = d.derive(dependencies).collectAsList();
  
  assertEquals(3, results.size());
  assertTrue(results.contains(RowFactory.create("A", "AA", "AAA", "1", "2")));
  assertTrue(results.contains(RowFactory.create("B", "BB", "BBB", "3", null)));
  assertTrue(results.contains(RowFactory.create("C", "CC", "CCC", null, "4")));
}