Java Code Examples for org.apache.spark.sql.Dataset#collect()
The following examples show how to use org.apache.spark.sql.Dataset#collect(). Each example is taken from an open-source project; the original source file and license are noted above the code.
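Before the project examples, here is a minimal, self-contained sketch of what Dataset#collect() does: it materializes the whole distributed Dataset as a local array on the driver, so it should only be called on results small enough to fit in driver memory. The class name, app name, and tiny range-based DataFrame below are illustrative placeholders, not taken from any of the projects that follow; from the Java side the return value is typed as Object and is typically cast to Row[], just as in the examples.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CollectSketch {
    public static void main(String[] args) {
        // Local session, for illustration only
        SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName("collect-sketch")
            .getOrCreate();

        // A tiny DataFrame; range(3) yields rows with a single "id" column: 0, 1, 2
        Dataset<Row> df = spark.range(3).toDF("id");

        // collect() brings every row back to the driver as a local array;
        // the cast is needed because the Java signature returns Object
        Row[] rows = (Row[]) df.collect();
        for (Row row : rows) {
            System.out.println(row.getLong(0));
        }

        spark.stop();
    }
}

If a List is more convenient on the Java side, collectAsList() returns a List<T> directly and avoids the cast, but the examples below all use collect().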
Example 1
Source File: DataSetApplication.java From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
            .appName("Java Spark SQL")
            .getOrCreate();
    Person person = new Person("spark", 10);
    Encoder<Person> encoder = Encoders.bean(Person.class);
    Dataset<Person> dataset = sparkSession.createDataset(Collections.singletonList(person), encoder);
    dataset.show();
    // Final output: {name:spark;age:10}

    /* Encoders for common types */
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> integerDataset = sparkSession.createDataset(Arrays.asList(1, 2), integerEncoder);
    Dataset<Integer> result = integerDataset.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) {
            return value + 1;
        }
    }, integerEncoder);
    result.collect();
    // Final output: [2, 3]

    /* A DataFrame can be converted to a Dataset by providing a class; the mapping is based on field names */
    String url = "/usr/local/text.json";
    Dataset<Person> personDataset = sparkSession.read().json(url).as(encoder);
    personDataset.show();
    // Final output: name:... age:...
}
Example 2
Source File: NFilePruningTest.java From kylin-on-parquet-v2 with Apache License 2.0
private long assertResultsAndScanFiles(String sql, long numScanFiles) throws Exception {
    Dataset<Row> dataset = queryCubeAndSkipCompute(getProject(), sql);
    dataset.collect();
    long actualNum = findFileSourceScanExec(dataset.queryExecution().sparkPlan())
            .metrics().get("numFiles").get().value();
    Assert.assertEquals(numScanFiles, actualNum);
    return actualNum;
}
Example 3
Source File: TestSparkSchema.java From iceberg with Apache License 2.0
@Test
public void testSparkReadSchemaIsHonored() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();

    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    tables.create(SCHEMA, spec, null, tableLocation);

    List<SimpleRecord> expectedRecords = Lists.newArrayList(
        new SimpleRecord(1, "a")
    );
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    originalDf.select("id", "data").write()
        .format("iceberg")
        .mode("append")
        .save(tableLocation);

    StructType sparkReadSchema = new StructType(
        new StructField[] {
            new StructField("id", DataTypes.IntegerType, true, Metadata.empty())
        }
    );

    Dataset<Row> resultDf = spark.read()
        .schema(sparkReadSchema)
        .format("iceberg")
        .load(tableLocation);

    Row[] results = (Row[]) resultDf.collect();
    Assert.assertEquals("Result size matches", 1, results.length);
    Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
    Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
Example 4
Source File: TestSparkSchema.java From iceberg with Apache License 2.0
@Test
public void testSparkReadSchemaCombinedWithProjection() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();

    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    tables.create(SCHEMA, spec, null, tableLocation);

    List<SimpleRecord> expectedRecords = Lists.newArrayList(
        new SimpleRecord(1, "a")
    );
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    originalDf.select("id", "data").write()
        .format("iceberg")
        .mode("append")
        .save(tableLocation);

    StructType sparkReadSchema = new StructType(
        new StructField[] {
            new StructField("id", DataTypes.IntegerType, true, Metadata.empty()),
            new StructField("data", DataTypes.StringType, true, Metadata.empty())
        }
    );

    Dataset<Row> resultDf = spark.read()
        .schema(sparkReadSchema)
        .format("iceberg")
        .load(tableLocation)
        .select("id");

    Row[] results = (Row[]) resultDf.collect();
    Assert.assertEquals("Result size matches", 1, results.length);
    Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
    Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
Example 5
Source File: JavaSparkSQLExample.java From SparkDemo with MIT License
private static void runDatasetCreationExample(SparkSession spark) {
    // $example on:create_ds$
    // Create an instance of a Bean class
    Person person = new Person();
    person.setName("Andy");
    person.setAge(32);

    // Encoders are created for Java beans
    Encoder<Person> personEncoder = Encoders.bean(Person.class);
    Dataset<Person> javaBeanDS = spark.createDataset(
        Collections.singletonList(person),
        personEncoder
    );
    javaBeanDS.show();
    // +---+----+
    // |age|name|
    // +---+----+
    // | 32|Andy|
    // +---+----+

    // Encoders for most common types are provided in class Encoders
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
    Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) throws Exception {
            return value + 1;
        }
    }, integerEncoder);
    transformedDS.collect(); // Returns [2, 3, 4]

    // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
    String path = Constant.LOCAL_FILE_PREX + "/data/resources/people.json";
    Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
    peopleDS.show();
    // +----+-------+
    // | age|   name|
    // +----+-------+
    // |null|Michael|
    // |  30|   Andy|
    // |  19| Justin|
    // +----+-------+
    // $example off:create_ds$
}
Example 6
Source File: JavaStandaloneIgniteRDDSelfTest.java From ignite with Apache License 2.0
/**
 * @throws Exception If failed.
 */
@Test
public void testQueryFieldsFromIgnite() throws Exception {
    JavaSparkContext sc = new JavaSparkContext("local[*]", "test");

    try {
        JavaIgniteContext<String, Entity> ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider());

        JavaIgniteRDD<String, Entity> cache = ic.fromCache(ENTITY_CACHE_NAME);

        cache.savePairs(sc.parallelize(F.range(0, 1001), 2).mapToPair(INT_TO_ENTITY_F));

        Dataset<Row> df = cache.sql("select id, name, salary from Entity where name = ? and salary = ?", "name50", 5000);

        df.printSchema();

        Row[] res = (Row[])df.collect();

        assertEquals("Invalid result length", 1, res.length);
        assertEquals("Invalid result", 50, res[0].get(0));
        assertEquals("Invalid result", "name50", res[0].get(1));
        assertEquals("Invalid result", 5000, res[0].get(2));

        Column exp = new Column("NAME").equalTo("name50").and(new Column("SALARY").equalTo(5000));

        Dataset<Row> df0 = cache.sql("select id, name, salary from Entity").where(exp);

        df.printSchema();

        Row[] res0 = (Row[])df0.collect();

        assertEquals("Invalid result length", 1, res0.length);
        assertEquals("Invalid result", 50, res0[0].get(0));
        assertEquals("Invalid result", "name50", res0[0].get(1));
        assertEquals("Invalid result", 5000, res0[0].get(2));

        assertEquals("Invalid count", 500, cache.sql("select id from Entity where id > 500").count());
    }
    finally {
        sc.stop();
    }
}
Example 7
Source File: JavaEmbeddedIgniteRDDSelfTest.java From ignite with Apache License 2.0
/**
 * @throws Exception If failed.
 */
@Test
public void testQueryFieldsFromIgnite() throws Exception {
    JavaSparkContext sc = createContext();

    JavaIgniteContext<String, Entity> ic = null;

    try {
        ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider(), false);

        JavaIgniteRDD<String, Entity> cache = ic.fromCache(PARTITIONED_CACHE_NAME);

        cache.savePairs(sc.parallelize(F.range(0, 1001), GRID_CNT).mapToPair(INT_TO_ENTITY_F), true, false);

        Dataset<Row> df = cache.sql("select id, name, salary from Entity where name = ? and salary = ?", "name50", 5000);

        df.printSchema();

        Row[] res = (Row[])df.collect();

        assertEquals("Invalid result length", 1, res.length);
        assertEquals("Invalid result", 50, res[0].get(0));
        assertEquals("Invalid result", "name50", res[0].get(1));
        assertEquals("Invalid result", 5000, res[0].get(2));

        Column exp = new Column("NAME").equalTo("name50").and(new Column("SALARY").equalTo(5000));

        Dataset<Row> df0 = cache.sql("select id, name, salary from Entity").where(exp);

        df.printSchema();

        Row[] res0 = (Row[])df0.collect();

        assertEquals("Invalid result length", 1, res0.length);
        assertEquals("Invalid result", 50, res0[0].get(0));
        assertEquals("Invalid result", "name50", res0[0].get(1));
        assertEquals("Invalid result", 5000, res0[0].get(2));

        assertEquals("Invalid count", 500, cache.sql("select id from Entity where id > 500").count());
    }
    finally {
        if (ic != null)
            ic.close(true);

        sc.stop();
    }
}
Example 8
Source File: DataFrameIT.java From spliceengine with GNU Affero General Public License v3.0
public static void testNth(String table, String type, Integer nthRow, Integer nthCol, ResultSet[] resultSets) throws SQLException {
    try {
        Connection conn = DriverManager.getConnection("jdbc:default:connection");
        PreparedStatement pstmt = conn.prepareStatement("select * from " + table.toUpperCase());
        ResultSet res = pstmt.executeQuery();

        // Convert result set to Dataframe
        Dataset<Row> resultSetDF = SparkUtils.resultSetToDF(res);

        // Retrieve nthRow of DataFrame
        org.apache.spark.sql.Row[] r = (Row[]) resultSetDF.collect();

        // Retrieve nthRow of ResultSet
        int i = 0;
        Boolean equalsTest = false;
        while (res.next() && i < nthRow) {
            i++;
        }
        //System.out.println("Type="+type+" nthrow="+nthRow+" nthcol="+nthCol+" rs="+res.getObject(nthCol)+" i="+i+" df="+r[i]);

        // if either null both have to be null
        if (res.getObject(nthCol) == null) {
            equalsTest = r[i].isNullAt(nthCol - 1);
        }
        else if (r[i].isNullAt(nthCol - 1)) {
            equalsTest = false;
        }
        else {
            // Test nth element of ResultSet
            switch (type.toLowerCase()) {
                case "string":
                    equalsTest = res.getString(nthCol).equals(r[i].getString(nthCol - 1));
                    break;
                case "integer":
                    equalsTest = res.getInt(nthCol) == r[i].getInt(nthCol - 1);
                    break;
                case "boolean":
                    equalsTest = res.getBoolean(nthCol) == r[i].getBoolean(nthCol - 1);
                    break;
                case "double":
                    equalsTest = res.getDouble(nthCol) == r[i].getDouble(nthCol - 1);
                    break;
                case "timestamp":
                    equalsTest = res.getTimestamp(nthCol).equals(r[i].getTimestamp(nthCol - 1));
                    break;
                default:
                    equalsTest = false;
                    break;
            }
        }

        // Construct Stored Procedure Result
        List<ExecRow> rows = Lists.newArrayList();
        ExecRow row = new ValueRow(1);
        row.setColumn(1, new SQLBoolean(equalsTest));
        rows.add(row);
        IteratorNoPutResultSet resultsToWrap = wrapResults((EmbedConnection) conn, rows, DATAFRAME_NTH_STORED_PROCEDURE_COLUMN_DECSRIPTOR);
        resultSets[0] = new EmbedResultSet40((EmbedConnection) conn, resultsToWrap, false, null, true);
        conn.close();
    } catch (Exception e) {
        throw new SQLException(Throwables.getRootCause(e));
    }
}