org.apache.spark.sql.Row Java Examples
The following examples show how to use org.apache.spark.sql.Row.
Each example is taken from the open-source project named in its "Source File" line.
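Before the project-specific examples, here is a minimal, self-contained sketch of the basic Row API. The class name RowBasics and the sample values are illustrative; the calls themselves (RowFactory.create, the positional getters, and size) are standard Spark SQL API.

import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

public class RowBasics {  // illustrative class name, not from any of the projects below
    public static void main(String[] args) {
        // Build a schema-less Row positionally; fields are read back by index.
        Row row = RowFactory.create("Alice", 42, 3.14);

        String name = row.getString(0);  // typed positional getter
        int count = row.getInt(1);
        Object last = row.get(2);        // untyped access
        System.out.println(name + ", " + count + ", " + last + ", size=" + row.size());
    }
}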
Example #1
Source File: SparkDataSourceManager.java From DDF with Apache License 2.0
@Override
public DDF loadFromJDBC(JDBCDataSourceDescriptor dataSource) throws DDFException {
    SparkDDFManager sparkDDFManager = (SparkDDFManager) mDDFManager;
    HiveContext sqlContext = sparkDDFManager.getHiveContext();

    JDBCDataSourceCredentials cred = (JDBCDataSourceCredentials) dataSource.getDataSourceCredentials();
    String fullURL = dataSource.getDataSourceUri().getUri().toString();
    if (cred.getUsername() != null && !cred.getUsername().equals("")) {
        fullURL += String.format("?user=%s&password=%s", cred.getUsername(), cred.getPassword());
    }

    Map<String, String> options = new HashMap<String, String>();
    options.put("url", fullURL);
    options.put("dbtable", dataSource.getDbTable());

    DataFrame df = sqlContext.load("jdbc", options);
    DDF ddf = sparkDDFManager.newDDF(sparkDDFManager, df, new Class<?>[]{DataFrame.class},
        null, SparkUtils.schemaFromDataFrame(df));

    // TODO?
    ddf.getRepresentationHandler().get(RDD.class, Row.class);
    ddf.getMetaDataHandler().setDataSourceDescriptor(dataSource);
    return ddf;
}
Example #2
Source File: GroupConcatDistinctUDAF.java From BigDataPlatform with GNU General Public License v3.0
/**
 * Update: the field values within a group are passed in one at a time;
 * this method implements the concatenation logic.
 */
@Override
public void update(MutableAggregationBuffer buffer, Row input) {
    // City info already concatenated in the buffer
    String bufferCityInfo = buffer.getString(0);
    // City info that was just passed in
    String cityInfo = input.getString(0);

    // Deduplication: only append the new city info
    // if it has not been concatenated before
    if (!bufferCityInfo.contains(cityInfo)) {
        if ("".equals(bufferCityInfo)) {
            bufferCityInfo += cityInfo;
        } else {
            // e.g. 1:Beijing
            // 1:Beijing,2:Shanghai
            bufferCityInfo += "," + cityInfo;
        }
        buffer.update(0, bufferCityInfo);
    }
}
Example #3
Source File: SparkUtil.java From kylin with Apache License 2.0
private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
    SparkSession sparkSession = SparkSession.builder()
            .sparkContext(HiveUtils.withHiveExternalCatalog(sc.sc()))
            .config(sc.getConf())
            .enableHiveSupport()
            .getOrCreate();
    final Dataset intermediateTable = sparkSession.table(hiveTable);
    return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
        @Override
        public String[] call(Row row) throws Exception {
            String[] result = new String[row.size()];
            for (int i = 0; i < row.size(); i++) {
                final Object o = row.get(i);
                if (o != null) {
                    result[i] = o.toString();
                } else {
                    result[i] = null;
                }
            }
            return result;
        }
    });
}
Example #4
Source File: AvroTranslator.java From envelope with Apache License 2.0
private Row rowForRecord(GenericRecord record) {
    List<Object> values = Lists.newArrayList();

    for (Field field : record.getSchema().getFields()) {
        Object value = record.get(field.name());

        Type fieldType = field.schema().getType();
        if (fieldType.equals(Type.UNION)) {
            fieldType = field.schema().getTypes().get(1).getType();
        }
        // Avro returns Utf8s for strings, which Spark SQL doesn't know how to use
        if (fieldType.equals(Type.STRING) && value != null) {
            value = value.toString();
        }
        // Avro returns binary as a ByteBuffer, but Spark SQL wants a byte[]
        if (fieldType.equals(Type.BYTES) && value != null) {
            value = ((ByteBuffer) value).array();
        }

        values.add(value);
    }

    return new RowWithSchema(schema, values.toArray());
}
Example #5
Source File: TestRangeRowRule.java From envelope with Apache License 2.0
@Test
public void testDontIgnoreNulls() {
    StructType schema = new StructType(new StructField[] {
        new StructField("name", DataTypes.StringType, false, Metadata.empty()),
        new StructField("nickname", DataTypes.StringType, false, Metadata.empty()),
        new StructField("age", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("candycrushscore", DataTypes.createDecimalType(), false, Metadata.empty())
    });

    Map<String, Object> configMap = new HashMap<>();
    configMap.put(RangeRowRule.FIELDS_CONFIG, Lists.newArrayList("age"));
    configMap.put(RangeRowRule.FIELD_TYPE_CONFIG, "int");
    configMap.put(RangeRowRule.RANGE_CONFIG, Lists.newArrayList(0, 105));
    Config config = ConfigFactory.parseMap(configMap);

    RangeRowRule rule = new RangeRowRule();
    assertNoValidationFailures(rule, config);
    rule.configure(config);
    rule.configureName("agerange");

    Row row1 = new RowWithSchema(schema, "Ian", "Ian", null, new BigDecimal("0.00"));
    assertFalse("Row should not pass rule", rule.check(row1));
}
Example #6
Source File: DeepSparkContextTest.java From deep-spark with Apache License 2.0
@Test
public void createJavaSchemaRDDTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = PowerMockito.mock(SQLContext.class);
    ExtractorConfig config = createDeepJobConfig();
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    PowerMockito.doReturn(singleRdd).when(deepSparkContextSpy).createJavaRDD(config);
    JavaRDD<Row> rowRDD = mock(JavaRDD.class);
    mockStatic(DeepSparkContext.class);
    when(DeepSparkContext.createJavaRowRDD(singleRdd)).thenReturn(rowRDD);
    Cells cells = mock(Cells.class);
    when(singleRdd.first()).thenReturn(cells);
    StructType schema = mock(StructType.class);
    mockStatic(CellsUtils.class);
    when(CellsUtils.getStructTypeFromCells(cells)).thenReturn(schema);

    deepSparkContextSpy.createJavaSchemaRDD(config);

    verify(sqlContext).applySchema(rowRDD, schema);
}
Example #7
Source File: SubStringCounterRelation.java From net.jgp.labs.spark with Apache License 2.0
@Override
public RDD<Row> buildScan() {
    log.debug("-> buildScan()");

    // I have isolated the work to a method to keep the plumbing code as simple
    // as possible.
    List<List<Integer>> table = collectData();

    @SuppressWarnings("resource") // cannot be closed here, done elsewhere
    JavaSparkContext sparkContext = new JavaSparkContext(sqlContext.sparkContext());
    JavaRDD<Row> rowRDD = sparkContext.parallelize(table)
        .map(row -> RowFactory.create(row.toArray()));

    return rowRDD.rdd();
}
Example #8
Source File: TestPassthroughDeriver.java From envelope with Apache License 2.0
@Test
public void testPassthrough() throws Exception {
    StructType schema = DataTypes.createStructType(Lists.<StructField>newArrayList(
        DataTypes.createStructField("col1", DataTypes.StringType, false)));
    Dataset<Row> dep1 = Contexts.getSparkSession().createDataFrame(
        Lists.newArrayList(RowFactory.create("a")), schema);
    Dataset<Row> dep2 = Contexts.getSparkSession().createDataFrame(
        Lists.newArrayList(RowFactory.create("b")), schema);
    Map<String, Dataset<Row>> dependencies = Maps.newHashMap();
    dependencies.put("dep1", dep1);
    dependencies.put("dep2", dep2);

    Deriver deriver = new PassthroughDeriver();

    List<Row> result = deriver.derive(dependencies).collectAsList();

    assertTrue(result.contains(RowFactory.create("a")));
    assertTrue(result.contains(RowFactory.create("b")));
    assertEquals(2, result.size());
}
Example #9
Source File: BundlesTest.java From bunsen with Apache License 2.0
@Test
public void testXmlBundleStrings() {
    JavaRDD<String> xmlBundlesRdd = spark.sparkContext()
        .wholeTextFiles("src/test/resources/xml/bundles", 1)
        .toJavaRDD()
        .map(tuple -> tuple._2());

    Dataset<String> xmlBundles = spark.createDataset(xmlBundlesRdd.rdd(), Encoders.STRING());
    xmlBundles.write().saveAsTable("xml_bundle_table");

    JavaRDD<BundleContainer> bundles = BundlesTest.bundles.fromXml(
        spark.sql("select value from xml_bundle_table"), "value");

    Dataset<Row> patients = BundlesTest.bundles.extractEntry(spark, bundles, Patient.class);

    checkPatients(patients);
}
Example #10
Source File: PerformanceCollector.java From ExecDashboard with Apache License 2.0
private void updateCollectorItemMetricDetail(CollectorItemMetricDetail collectorItemMetricDetail, Row row) {
    Date timeWindowDt = row.getAs(STR_TIMEWINDOW);
    List<String> performanceMetricList = Arrays.asList(STR_AVG_RESPONSE_TIME, STR_CALLSPER_MINUTE, STR_ERROR_RATE);
    GenericRowWithSchema pefMetrics = row.getAs("metrics");

    for (String perfMetric : performanceMetricList) {
        double value;
        try {
            Long valueStr = pefMetrics.getAs(perfMetric);
            value = valueStr.doubleValue();
        } catch (IllegalArgumentException exception) {
            value = 0.0;
        }
        MetricCount mc = getMetricCount("", value, perfMetric);
        if (!mc.getLabel().isEmpty()) {
            collectorItemMetricDetail.setStrategy(getCollectionStrategy());
            collectorItemMetricDetail.addCollectorItemMetricCount(timeWindowDt, mc);
            collectorItemMetricDetail.setLastScanDate(timeWindowDt);
        }
    }
}
Example #11
Source File: RDDConverterUtils.java From systemds with Apache License 2.0
public static Dataset<Row> binaryBlockToDataFrame(SparkSession sparkSession,
    JavaPairRDD<MatrixIndexes, MatrixBlock> in, DataCharacteristics mc, boolean toVector)
{
    if( !mc.colsKnown() )
        throw new RuntimeException("Number of columns needed to convert binary block to data frame.");

    //slice blocks into rows, align and convert into data frame rows
    JavaRDD<Row> rowsRDD = in
        .flatMapToPair(new SliceBinaryBlockToRowsFunction(mc.getBlocksize()))
        .groupByKey().map(new ConvertRowBlocksToRows((int)mc.getCols(), mc.getBlocksize(), toVector));

    //create data frame schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(DF_ID_COLUMN, DataTypes.DoubleType, false));
    if( toVector )
        fields.add(DataTypes.createStructField("C1", new VectorUDT(), false));
    else { // row
        for(int i = 1; i <= mc.getCols(); i++)
            fields.add(DataTypes.createStructField("C"+i, DataTypes.DoubleType, false));
    }

    //rdd to data frame conversion
    return sparkSession.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields));
}
Example #12
Source File: NManualBuildAndQueryCuboidTest.java From kylin-on-parquet-v2 with Apache License 2.0
private Integer convertOutSchema(Dataset<Row> layoutDs, String fieldName,
        org.apache.spark.sql.types.DataType dataType) {
    StructField[] structFieldList = layoutDs.schema().fields();
    String[] columns = layoutDs.columns();

    int index = 0;
    StructField[] outStructFieldList = new StructField[structFieldList.length];

    for (int i = 0; i < structFieldList.length; i++) {
        if (columns[i].equalsIgnoreCase(fieldName)) {
            index = i;
            StructField structField = structFieldList[i];
            outStructFieldList[i] = new StructField(structField.name(), dataType, false, structField.metadata());
        } else {
            outStructFieldList[i] = structFieldList[i];
        }
    }

    OUT_SCHEMA = new StructType(outStructFieldList);

    return index;
}
Example #13
Source File: StructureToInteractingResidues.java From mmtf-spark with Apache License 2.0
@Override
public Iterator<Row> call(Tuple2<String, StructureDataInterface> t) throws Exception {
    String structureId = t._1;
    StructureDataInterface structure = t._2;

    List<Integer> groupIndices = new ArrayList<>();
    List<String> groupNames = new ArrayList<>();
    getGroupIndices(structure, groupIndices, groupNames);

    List<Row> neighbors = new ArrayList<>();
    for (int i = 0; i < groupNames.size(); i++) {
        if (groupNames.get(i).equals(groupName)) {
            List<Integer> matches = new ArrayList<>();
            float[] boundingBox = calcBoundingBox(structure, groupIndices, i, cutoffDistance);
            matches.addAll(findNeighbors(structure, i, boundingBox, groupIndices));
            neighbors.addAll(getDistanceProfile(structureId, matches, i, groupIndices, groupNames, structure));
        }
    }

    return neighbors.iterator();
}
Example #14
Source File: JavaVectorAssemblerExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaVectorAssemblerExample")
        .getOrCreate();

    // $example on$
    StructType schema = createStructType(new StructField[]{
        createStructField("id", IntegerType, false),
        createStructField("hour", IntegerType, false),
        createStructField("mobile", DoubleType, false),
        createStructField("userFeatures", new VectorUDT(), false),
        createStructField("clicked", DoubleType, false)
    });
    Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
    Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema);

    VectorAssembler assembler = new VectorAssembler()
        .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
        .setOutputCol("features");

    Dataset<Row> output = assembler.transform(dataset);
    System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column "
        + "'features'");
    output.select("features", "clicked").show(false);
    // $example off$

    spark.stop();
}
Example #15
Source File: DataFrameReader.java From nemo with Apache License 2.0
@Override
public Dataset<Row> json(final RDD<String> jsonRDD) {
    final boolean userTriggered = initializeFunction(jsonRDD);
    final Dataset<Row> result = Dataset.from(super.json(jsonRDD));
    this.setIsUserTriggered(userTriggered);
    return result;
}
Example #16
Source File: GeoWaveSparkSQLIT.java From geowave with Apache License 2.0
@Test
public void testSpatialJoin() throws Exception {
    // Set up Spark
    final SparkSession session = SparkTestEnvironment.getInstance().getDefaultSession();
    final SqlQueryRunner queryRunner = new SqlQueryRunner();
    queryRunner.setSparkSession(session);

    // ingest test points
    TestUtils.testLocalIngest(dataStore, DimensionalityType.SPATIAL, HAIL_SHAPEFILE_FILE, 1);
    TestUtils.testLocalIngest(dataStore, DimensionalityType.SPATIAL, TORNADO_TRACKS_SHAPEFILE_FILE, 1);

    try {
        // Run a valid sql query that should do an optimized join
        queryRunner.addInputStore(dataStore, "hail", "hail");
        queryRunner.addInputStore(dataStore, "tornado_tracks", "tornado");
        queryRunner.setSql(
            "select hail.* from hail, tornado where GeomIntersects(hail.geom, tornado.geom)");
        final Dataset<Row> results = queryRunner.run();
        LOGGER.warn("Indexed intersect from sql returns: " + results.count() + " results.");
    } catch (final Exception e) {
        e.printStackTrace();
        TestUtils.deleteAll(dataStore);
        Assert.fail(
            "Error occurred while attempting optimized join from sql query runner: '"
                + e.getLocalizedMessage() + "'");
    }

    // Clean up
    TestUtils.deleteAll(dataStore);
}
Example #17
Source File: MorphlineDeriver.java From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
    if (!dependencies.containsKey(stepName)) {
        throw new RuntimeException("Step not found in the dependencies list");
    }
    Dataset<Row> sourceStep = dependencies.get(stepName);

    // For each partition in the DataFrame / RDD
    JavaRDD<Row> outputRDD = sourceStep.toJavaRDD().flatMap(
        MorphlineUtils.morphlineMapper(this.morphlineFile, this.morphlineId, getSchema(), errorOnEmpty));

    // Convert all the Rows into a new DataFrame
    return Contexts.getSparkSession().createDataFrame(outputRDD, getSchema());
}
Example #18
Source File: RemoteDPParForSpark.java From systemds with Apache License 2.0
@Override
public Tuple2<Long, Writable> call(Tuple2<Row, Long> arg0) throws Exception {
    long rowix = arg0._2() + 1;

    //process row data
    int off = _containsID ? 1 : 0;
    Object obj = _isVector ? arg0._1().get(off) : arg0._1();
    boolean sparse = (obj instanceof SparseVector);
    MatrixBlock mb = new MatrixBlock(1, (int)_clen, sparse);

    if( _isVector ) {
        Vector vect = (Vector) obj;
        if( vect instanceof SparseVector ) {
            SparseVector svect = (SparseVector) vect;
            int lnnz = svect.numNonzeros();
            for( int k=0; k<lnnz; k++ )
                mb.appendValue(0, svect.indices()[k], svect.values()[k]);
        }
        else { //dense
            for( int j=0; j<_clen; j++ )
                mb.appendValue(0, j, vect.apply(j));
        }
    }
    else { //row
        Row row = (Row) obj;
        for( int j=off; j<off+_clen; j++ )
            mb.appendValue(0, j-off, UtilFunctions.getDouble(row.get(j)));
    }

    mb.examSparsity();
    return new Tuple2<>(rowix, new PairWritableBlock(new MatrixIndexes(1,1), mb));
}
Example #19
Source File: StringIndexerBridgeTest.java From spark-transformers with Apache License 2.0
@Test(expected = RuntimeException.class)
public void testStringIndexerForUnseenValues() {
    //prepare data
    StructType schema = createStructType(new StructField[]{
        createStructField("id", IntegerType, false),
        createStructField("label", DoubleType, false)
    });
    List<Row> trainingData = Arrays.asList(
        cr(0, 1.0), cr(1, 2.0), cr(2, 3.0),
        cr(3, 1.0), cr(4, 1.0), cr(5, 3.0));
    DataFrame dataset = sqlContext.createDataFrame(trainingData, schema);

    //train model in spark
    StringIndexerModel model = new StringIndexer()
        .setInputCol("label")
        .setOutputCol("labelIndex")
        .fit(dataset);

    //Export this model
    byte[] exportedModel = ModelExporter.export(model, dataset);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //unseen value
    Map<String, Object> data = new HashMap<String, Object>();
    data.put(model.getInputCol(), 7.0);

    transformer.transform(data);
}
Example #20
Source File: AtomInteraction.java From mmtf-spark with Apache License 2.0
/**
 * Returns interactions and geometric information in a single row.
 *
 * @return row of interactions and geometric information
 */
public Row getMultipleInteractionsAsRow(int maxInteractions) {
    // pad interaction centers and distances with nulls, if necessary,
    // since each row must be of fixed length
    while (getNumInteractions() < maxInteractions) {
        neighbors.add(new InteractionCenter());
    }

    int length = InteractionCenter.getLength();

    Object[] data = new Object[getNumColumns(maxInteractions)];
    int index = 0;

    data[index++] = structureId;
    data[index++] = getNumberOfPolymerChains();

    calcCoordinationGeometry(maxInteractions);
    data[index++] = q3;
    data[index++] = q4;
    data[index++] = q5;
    data[index++] = q6;

    // copy data for query atom
    System.arraycopy(center.getAsObject(), 0, data, index, length);
    index += length;

    // copy data for interacting atoms
    for (int i = 0; i < neighbors.size(); i++) {
        System.arraycopy(neighbors.get(i).getAsObject(), 0, data, index, length);
        index += length;
        data[index++] = distances[i];
    }

    // copy angles
    System.arraycopy(angles, 0, data, index, angles.length);
    index += length;

    return RowFactory.create(data);
}
Example #21
Source File: S3CsvToDataset2.java From net.jgp.labs.spark with Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("CSV on S3 to Dataset<Row>")
        .master("spark://10.0.100.81:7077")
        .config("spark.executor.memory", "1g")
        .config("spark.executor.cores", "1")
        .config("spark.cores.max", "2")
        .config("spark.driver.host", "10.0.100.182")
        .config("spark.executor.extraClassPath",
            "/home/jgp/net.jgp.labs.spark/target/labs-spark-2.2.0-jar-with-dependencies.jar")
        .getOrCreate();

    spark.sparkContext().hadoopConfiguration().set("fs.s3a.access.key", "xxx");
    spark.sparkContext().hadoopConfiguration().set("fs.s3a.secret.key", "xxx");
    // spark.sparkContext().hadoopConfiguration().set("fs.s3n.endpoint", "us-east-2");

    String bucket = "bucket_name";
    String key = "key";
    String filename = "s3a://" + bucket + "/" + key;

    Dataset<Row> df = spark.read()
        .format("csv")
        .option("inferSchema", "true")
        .option("header", "false")
        .option("sep", "|")
        .load(filename);
    df.show();
    df.printSchema();
}
Example #22
Source File: WriteToDataSinkStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {

    /* TODO: Not working yet
    // if output format is set to "csv" create both: csv and parquet
    if(SparkImporterKafkaImportArguments.getInstance().getOutputFormat().equals(SparkImporterVariables.OUTPUT_FORMAT_CSV)) {
        dataset
            .write()
            .option("header", "true")
            .option("delimiter", ";")
            .option("ignoreLeadingWhiteSpace", "false")
            .option("ignoreTrailingWhiteSpace", "false")
            .mode(SparkImporterVariables.getSaveMode())
            .csv(SparkImporterVariables.getTargetFolder());
    }
    */

    dataset
        // we repartition the data by process instances, which allows spark to better distribute
        // the data between workers as the operations are related to a process instance
        .repartition(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID))
        .write()
        .mode(SaveMode.Append)
        .save(config.getTargetFolder());

    return dataset;
}
Example #23
Source File: TestSuite.java From stocator with Apache License 2.0
private void countAndCompare(Dataset<Row> inSpark, long readRecords, String msg) throws Exception {
    long totalInSpark = inSpark.count();
    if (totalInSpark != readRecords) {
        System.out.println("*********************************");
        System.out.println(msg + ": Records that were written into object store doesn't match");
        System.out.println(msg + ": Readed from object store: " + readRecords
            + ", expected: " + totalInSpark);
        throw new Exception(msg + ": Readed from object store: " + readRecords
            + ", expected: " + totalInSpark);
    } else {
        System.out.println(msg + " Completed successfully. Readed from object store: " + readRecords
            + ", expected: " + totalInSpark);
    }
}
Example #24
Source File: ColumnHashStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataSet, Map<String, Object> parameters, SparkRunnerConfig config) {

    //check if all variables that should be hashed actually exist, otherwise log a warning
    List<String> existingColumns = new ArrayList<>(Arrays.asList(dataSet.columns()));

    Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
    if(configuration != null) {
        PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration();
        if(preprocessingConfiguration != null) {
            for(ColumnHashConfiguration chc : preprocessingConfiguration.getColumnHashConfiguration()) {
                if(chc.isHashColumn()) {
                    if(!existingColumns.contains(chc.getColumnName())) {
                        // log the fact that a column that should be hashed does not exist
                        BpmnaiLogger.getInstance().writeWarn("The column '" + chc.getColumnName()
                            + "' is configured to be hashed, but does not exist in the data.");
                    } else {
                        dataSet = dataSet.withColumn(chc.getColumnName(), sha1(dataSet.col(chc.getColumnName())));
                        BpmnaiLogger.getInstance().writeInfo("The column '" + chc.getColumnName() + "' is being hashed.");
                    }
                }
            }
        }
    }

    if(config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataSet, "column_hash_step", config);
    }

    return dataSet;
}
Example #25
Source File: Dataset.java From incubator-nemo with Apache License 2.0
@Override
public Dataset<Row> withColumn(final String colName, final Column col) {
    final boolean userTriggered = initializeFunction(colName, col);
    final Dataset<Row> result = from(super.withColumn(colName, col));
    this.setIsUserTriggered(userTriggered);
    return result;
}
Example #26
Source File: DataSparkFromRDD.java From toolbox with Apache License 2.0
@Override
public DataFrame getDataFrame(SQLContext sql) {
    // Obtain the schema
    StructType schema = SchemaConverter.getSchema(attributes);

    // Transform the RDD
    JavaRDD<Row> rowRDD = DataFrameOps.toRowRDD(amidstRDD, attributes);

    // Create the DataFrame
    return sql.createDataFrame(rowRDD, schema);
}
Example #27
Source File: DataFrameReader.java From nemo with Apache License 2.0
@Override
public Dataset<Row> csv(final org.apache.spark.sql.Dataset<String> csvDataset) {
    final boolean userTriggered = initializeFunction(csvDataset);
    final Dataset<Row> result = Dataset.from(super.csv(csvDataset));
    this.setIsUserTriggered(userTriggered);
    return result;
}
Example #28
Source File: AbstractConceptMaps.java From bunsen with Apache License 2.0
protected AbstractConceptMaps(SparkSession spark,
    FhirVersionEnum fhirVersion,
    Dataset<UrlAndVersion> members,
    Dataset<Row> conceptMaps,
    Dataset<Mapping> mappings,
    SparkRowConverter conceptMapRowConverter) {

    this.spark = spark;
    this.fhirVersion = fhirVersion;
    this.members = members;
    this.conceptMaps = conceptMaps;
    this.mappings = mappings;
    this.conceptMapRowConverter = conceptMapRowConverter;
}
Example #29
Source File: MLContextTest.java From systemds with Apache License 2.0
@Test
public void testOutputDataFrameVectorsWithIDColumnFromMatrixDML() {
    System.out.println("MLContextTest - output DataFrame of vectors with ID column from matrix DML");

    String s = "M = matrix('1 2 3 4', rows=1, cols=4);";
    Script script = dml(s).out("M");
    Dataset<Row> df = ml.execute(script).getMatrix("M").toDFVectorWithIDColumn();
    List<Row> list = df.collectAsList();

    Row row = list.get(0);
    Assert.assertEquals(1.0, row.getDouble(0), 0.0);
    Assert.assertArrayEquals(new double[] { 1.0, 2.0, 3.0, 4.0 }, ((Vector) row.get(1)).toArray(), 0.0);
}
Example #30
Source File: TestPivotDeriver.java From envelope with Apache License 2.0
@Test
public void testMultipleFieldEntityKeyPivot() throws Exception {
    List<Row> sourceList = Lists.newArrayList(
        RowFactory.create("A", "AA", "AAA", "hello", "1"),
        RowFactory.create("A", "AA", "AAA", "world", "2"),
        RowFactory.create("B", "BB", "BBB", "hello", "3"),
        RowFactory.create("C", "CC", "CCC", "world", "4"));
    StructType schema = DataTypes.createStructType(Lists.newArrayList(
        DataTypes.createStructField("entity_id1", DataTypes.StringType, true),
        DataTypes.createStructField("entity_id2", DataTypes.StringType, true),
        DataTypes.createStructField("entity_id3", DataTypes.StringType, true),
        DataTypes.createStructField("key", DataTypes.StringType, true),
        DataTypes.createStructField("value", DataTypes.StringType, true)
    ));
    Dataset<Row> source = Contexts.getSparkSession().createDataFrame(sourceList, schema);
    Map<String, Dataset<Row>> dependencies = Maps.newHashMap();
    dependencies.put("source", source);

    Config config = ConfigFactory.empty()
        .withValue(PivotDeriver.STEP_NAME_CONFIG, ConfigValueFactory.fromAnyRef("source"))
        .withValue(PivotDeriver.ENTITY_KEY_FIELD_NAMES_CONFIG, ConfigValueFactory.fromAnyRef(
            Lists.newArrayList("entity_id1", "entity_id2", "entity_id3")))
        .withValue(PivotDeriver.PIVOT_KEY_FIELD_NAME_CONFIG, ConfigValueFactory.fromAnyRef("key"))
        .withValue(PivotDeriver.PIVOT_VALUE_FIELD_NAME_CONFIG, ConfigValueFactory.fromAnyRef("value"));

    PivotDeriver d = new PivotDeriver();
    assertNoValidationFailures(d, config);
    d.configure(config);

    List<Row> results = d.derive(dependencies).collectAsList();

    assertEquals(results.size(), 3);
    assertTrue(results.contains(RowFactory.create("A", "AA", "AAA", "1", "2")));
    assertTrue(results.contains(RowFactory.create("B", "BB", "BBB", "3", null)));
    assertTrue(results.contains(RowFactory.create("C", "CC", "CCC", null, "4")));
}