Java Code Examples for org.apache.spark.sql.Dataset#collect()

The following examples show how to use org.apache.spark.sql.Dataset#collect(). Each example is taken from an open-source project; the source file and license are noted above the code.
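Before the project examples, here is a minimal, self-contained sketch of what Dataset#collect() does: it is an action that executes the query plan and returns every row to the driver as a local array, while collectAsList() returns a java.util.List instead. The class name, master setting, and sample data below are illustrative assumptions and do not come from any of the projects listed here.

import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CollectSketch {
    public static void main(String[] args) {
        // Local session for illustration only.
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("collect-sketch")
                .getOrCreate();

        // A tiny DataFrame with a single "id" column holding 1, 2, 3.
        Dataset<Row> df = spark.range(1, 4).toDF("id");

        // collect() runs the plan and materializes all rows on the driver,
        // so it should only be used for results that fit in driver memory.
        // The cast mirrors the pattern used in the project examples below.
        Row[] rows = (Row[]) df.collect();
        System.out.println("collected " + rows.length + " rows, first id = " + rows[0].getLong(0));

        // collectAsList() is usually more convenient from Java.
        List<Row> rowList = df.collectAsList();
        rowList.forEach(r -> System.out.println(r.getLong(0)));

        spark.stop();
    }
}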
Example 1
Source File: DataSetApplication.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
            .appName("Java Spark SQL")
            .getOrCreate();
    Person person = new Person("spark",10);
    Encoder<Person> encoder = Encoders.bean(Person.class);
    Dataset<Person> dataset = sparkSession.createDataset(Collections.singletonList(person),encoder);
    dataset.show();
    // Final output: {name: spark, age: 10}


    /* Encoders for common types */
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> integerDataset = sparkSession.createDataset(Arrays.asList(1,2),integerEncoder);
    Dataset<Integer> result = integerDataset.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) {
            return value+1;
        }
    },integerEncoder);
    result.collect();
    // Final output: [2, 3]

    /* A DataFrame can be converted to a Dataset by providing a class; the mapping is done by field name */
    String url = "/usr/local/text.json";
    Dataset<Person> personDataset = sparkSession.read().json(url).as(encoder);
    personDataset.show();
    // Final output: name: ...  age: ...
}
 
Example 2
Source File: NFilePruningTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private long assertResultsAndScanFiles(String sql, long numScanFiles) throws Exception {
    Dataset<Row> dataset = queryCubeAndSkipCompute(getProject(), sql);
    dataset.collect();
    long actualNum = findFileSourceScanExec(dataset.queryExecution().sparkPlan()).metrics().get("numFiles").get().value();
    Assert.assertEquals(numScanFiles, actualNum);
    return actualNum;
}
 
Example 3
Source File: TestSparkSchema.java    From iceberg with Apache License 2.0
@Test
public void testSparkReadSchemaIsHonored() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  StructType sparkReadSchema =
      new StructType(
          new StructField[] {
              new StructField("id", DataTypes.IntegerType, true, Metadata.empty())
          }
      );

  Dataset<Row> resultDf = spark.read()
      .schema(sparkReadSchema)
      .format("iceberg")
      .load(tableLocation);

  Row[] results = (Row[]) resultDf.collect();

  Assert.assertEquals("Result size matches", 1, results.length);
  Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
  Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
 
Example 4
Source File: TestSparkSchema.java    From iceberg with Apache License 2.0
@Test
public void testSparkReadSchemaCombinedWithProjection() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  StructType sparkReadSchema =
      new StructType(
          new StructField[] {
              new StructField("id", DataTypes.IntegerType, true, Metadata.empty()),
              new StructField("data", DataTypes.StringType, true, Metadata.empty())
          }
      );

  Dataset<Row> resultDf = spark.read()
      .schema(sparkReadSchema)
      .format("iceberg")
      .load(tableLocation)
      .select("id");

  Row[] results = (Row[]) resultDf.collect();

  Assert.assertEquals("Result size matches", 1, results.length);
  Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
  Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
 
Example 5
Source File: JavaSparkSQLExample.java    From SparkDemo with MIT License
private static void runDatasetCreationExample(SparkSession spark) {
  // $example on:create_ds$
  // Create an instance of a Bean class
  Person person = new Person();
  person.setName("Andy");
  person.setAge(32);

  // Encoders are created for Java beans
  Encoder<Person> personEncoder = Encoders.bean(Person.class);
  Dataset<Person> javaBeanDS = spark.createDataset(
    Collections.singletonList(person),
    personEncoder
  );
  javaBeanDS.show();
  // +---+----+
  // |age|name|
  // +---+----+
  // | 32|Andy|
  // +---+----+

  // Encoders for most common types are provided in class Encoders
  Encoder<Integer> integerEncoder = Encoders.INT();
  Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
  Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
    @Override
    public Integer call(Integer value) throws Exception {
      return value + 1;
    }
  }, integerEncoder);
  transformedDS.collect(); // Returns [2, 3, 4]

  // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
  String path = Constant.LOCAL_FILE_PREX +"/data/resources/people.json";
  Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
  peopleDS.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:create_ds$
}
 
Example 6
Source File: JavaStandaloneIgniteRDDSelfTest.java    From ignite with Apache License 2.0
/**
 * @throws Exception If failed.
 */
@Test
public void testQueryFieldsFromIgnite() throws Exception {
    JavaSparkContext sc = new JavaSparkContext("local[*]", "test");

    try {
        JavaIgniteContext<String, Entity> ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider());

        JavaIgniteRDD<String, Entity> cache = ic.fromCache(ENTITY_CACHE_NAME);

        cache.savePairs(sc.parallelize(F.range(0, 1001), 2).mapToPair(INT_TO_ENTITY_F));

        Dataset<Row> df =
            cache.sql("select id, name, salary from Entity where name = ? and salary = ?", "name50", 5000);

        df.printSchema();

        Row[] res = (Row[])df.collect();

        assertEquals("Invalid result length", 1, res.length);
        assertEquals("Invalid result", 50, res[0].get(0));
        assertEquals("Invalid result", "name50", res[0].get(1));
        assertEquals("Invalid result", 5000, res[0].get(2));

        Column exp = new Column("NAME").equalTo("name50").and(new Column("SALARY").equalTo(5000));

        Dataset<Row> df0 = cache.sql("select id, name, salary from Entity").where(exp);

        df.printSchema();

        Row[] res0 = (Row[])df0.collect();

        assertEquals("Invalid result length", 1, res0.length);
        assertEquals("Invalid result", 50, res0[0].get(0));
        assertEquals("Invalid result", "name50", res0[0].get(1));
        assertEquals("Invalid result", 5000, res0[0].get(2));

        assertEquals("Invalid count", 500, cache.sql("select id from Entity where id > 500").count());
    }
    finally {
        sc.stop();
    }
}
 
Example 7
Source File: JavaEmbeddedIgniteRDDSelfTest.java    From ignite with Apache License 2.0
/**
 * @throws Exception If failed.
 */
@Test
public void testQueryFieldsFromIgnite() throws Exception {
    JavaSparkContext sc = createContext();

    JavaIgniteContext<String, Entity> ic = null;

    try {
        ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider(), false);

        JavaIgniteRDD<String, Entity> cache = ic.fromCache(PARTITIONED_CACHE_NAME);

        cache.savePairs(sc.parallelize(F.range(0, 1001), GRID_CNT).mapToPair(INT_TO_ENTITY_F), true, false);

        Dataset<Row> df =
            cache.sql("select id, name, salary from Entity where name = ? and salary = ?", "name50", 5000);

        df.printSchema();

        Row[] res = (Row[])df.collect();

        assertEquals("Invalid result length", 1, res.length);
        assertEquals("Invalid result", 50, res[0].get(0));
        assertEquals("Invalid result", "name50", res[0].get(1));
        assertEquals("Invalid result", 5000, res[0].get(2));

        Column exp = new Column("NAME").equalTo("name50").and(new Column("SALARY").equalTo(5000));

        Dataset<Row> df0 = cache.sql("select id, name, salary from Entity").where(exp);

        df.printSchema();

        Row[] res0 = (Row[])df0.collect();

        assertEquals("Invalid result length", 1, res0.length);
        assertEquals("Invalid result", 50, res0[0].get(0));
        assertEquals("Invalid result", "name50", res0[0].get(1));
        assertEquals("Invalid result", 5000, res0[0].get(2));

        assertEquals("Invalid count", 500, cache.sql("select id from Entity where id > 500").count());
    }
    finally {
        if (ic != null)
            ic.close(true);

        sc.stop();
    }
}
 
Example 8
Source File: JavaStandaloneIgniteRDDSelfTest.java    From ignite with Apache License 2.0
/**
 * @throws Exception If failed.
 */
@Test
public void testQueryFieldsFromIgnite() throws Exception {
    JavaSparkContext sc = new JavaSparkContext("local[*]", "test");

    try {
        JavaIgniteContext<String, Entity> ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider());

        JavaIgniteRDD<String, Entity> cache = ic.fromCache(ENTITY_CACHE_NAME);

        cache.savePairs(sc.parallelize(F.range(0, 1001), 2).mapToPair(INT_TO_ENTITY_F));

        Dataset<Row> df =
            cache.sql("select id, name, salary from Entity where name = ? and salary = ?", "name50", 5000);

        df.printSchema();

        Row[] res = (Row[])df.collect();

        assertEquals("Invalid result length", 1, res.length);
        assertEquals("Invalid result", 50, res[0].get(0));
        assertEquals("Invalid result", "name50", res[0].get(1));
        assertEquals("Invalid result", 5000, res[0].get(2));

        Column exp = new Column("NAME").equalTo("name50").and(new Column("SALARY").equalTo(5000));

        Dataset<Row> df0 = cache.sql("select id, name, salary from Entity").where(exp);

        df.printSchema();

        Row[] res0 = (Row[])df0.collect();

        assertEquals("Invalid result length", 1, res0.length);
        assertEquals("Invalid result", 50, res0[0].get(0));
        assertEquals("Invalid result", "name50", res0[0].get(1));
        assertEquals("Invalid result", 5000, res0[0].get(2));

        assertEquals("Invalid count", 500, cache.sql("select id from Entity where id > 500").count());
    }
    finally {
        sc.stop();
    }
}
 
Example 9
Source File: JavaEmbeddedIgniteRDDSelfTest.java    From ignite with Apache License 2.0
/**
 * @throws Exception If failed.
 */
@Test
public void testQueryFieldsFromIgnite() throws Exception {
    JavaSparkContext sc = createContext();

    JavaIgniteContext<String, Entity> ic = null;

    try {
        ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider(), false);

        JavaIgniteRDD<String, Entity> cache = ic.fromCache(PARTITIONED_CACHE_NAME);

        cache.savePairs(sc.parallelize(F.range(0, 1001), GRID_CNT).mapToPair(INT_TO_ENTITY_F), true, false);

        Dataset<Row> df =
            cache.sql("select id, name, salary from Entity where name = ? and salary = ?", "name50", 5000);

        df.printSchema();

        Row[] res = (Row[])df.collect();

        assertEquals("Invalid result length", 1, res.length);
        assertEquals("Invalid result", 50, res[0].get(0));
        assertEquals("Invalid result", "name50", res[0].get(1));
        assertEquals("Invalid result", 5000, res[0].get(2));

        Column exp = new Column("NAME").equalTo("name50").and(new Column("SALARY").equalTo(5000));

        Dataset<Row> df0 = cache.sql("select id, name, salary from Entity").where(exp);

        df.printSchema();

        Row[] res0 = (Row[])df0.collect();

        assertEquals("Invalid result length", 1, res0.length);
        assertEquals("Invalid result", 50, res0[0].get(0));
        assertEquals("Invalid result", "name50", res0[0].get(1));
        assertEquals("Invalid result", 5000, res0[0].get(2));

        assertEquals("Invalid count", 500, cache.sql("select id from Entity where id > 500").count());
    }
    finally {
        if (ic != null)
            ic.close(true);

        sc.stop();
    }
}
 
Example 10
Source File: DataFrameIT.java    From spliceengine with GNU Affero General Public License v3.0
public static void testNth(String table, String type, Integer nthRow, Integer nthCol, ResultSet[] resultSets) throws SQLException {

        try{
            Connection conn = DriverManager.getConnection("jdbc:default:connection");
            PreparedStatement pstmt = conn.prepareStatement("select * from " + table.toUpperCase());
            ResultSet res = pstmt.executeQuery();

            // Convert result set to Dataframe
            Dataset<Row> resultSetDF = SparkUtils.resultSetToDF(res);
            //Retrieve nthRow of DataFrame
            org.apache.spark.sql.Row[] r = (Row[])resultSetDF.collect();


            //Retrieve nthRow of ResultSet
            int i = 0;
            Boolean equalsTest = false;
            while(res.next() && i<nthRow){
                i++;
            }
            //System.out.println("Type="+type+"nthrow="+nthRow+" nthcol="+nthCol+" rs="+res.getObject(nthCol)+" i="+i+" df="+r[i]);

            // if either null both have to be null
            if (res.getObject(nthCol) == null) {
                equalsTest = r[i].isNullAt(nthCol-1);
            }
            else if(r[i].isNullAt(nthCol-1)) {
                equalsTest = false;
            }
            else {

                // Test nth element of ResultSet
                switch (type.toLowerCase()){
                    case "string":
                            equalsTest = res.getString(nthCol).equals(r[i].getString(nthCol-1));
                            break;
                    case "integer":
                        equalsTest = res.getInt(nthCol) == r[i].getInt(nthCol-1);
                        break;
                    case "boolean":
                        equalsTest = res.getBoolean(nthCol) == r[i].getBoolean(nthCol-1);
                        break;
                    case "double":
                        equalsTest = res.getDouble(nthCol) == (r[i].getDouble(nthCol-1));
                        break;
                    case "timestamp":
                        equalsTest = res.getTimestamp(nthCol).equals(r[i].getTimestamp(nthCol-1));
                        break;
                    default: equalsTest = false;
                        break;
                }
              }

            // Construct Stored Procedure Result
            List<ExecRow> rows = Lists.newArrayList();
            ExecRow row = new ValueRow(1);
            row.setColumn(1, new SQLBoolean(equalsTest));
            rows.add(row);
            IteratorNoPutResultSet resultsToWrap = wrapResults((EmbedConnection) conn, rows, DATAFRAME_NTH_STORED_PROCEDURE_COLUMN_DECSRIPTOR);
            resultSets[0] = new EmbedResultSet40((EmbedConnection)conn, resultsToWrap, false, null, true);
            conn.close();
        }
        catch (Exception e) {
            throw new SQLException(Throwables.getRootCause(e));
        }
    }