Java Code Examples for org.apache.spark.sql.Dataset#collect()

The following examples show how to use org.apache.spark.sql.Dataset#collect(). Each example is taken from an open-source project; the source file and license are noted above the code.
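Before the project examples, here is a minimal, self-contained sketch of what Dataset#collect() does: it is an action that executes the query plan and returns every row to the driver as a local array, while collectAsList() returns a java.util.List instead. The class name, master setting, and sample data below are illustrative assumptions and do not come from any of the projects listed here.

import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CollectSketch {
    public static void main(String[] args) {
        // Local session for illustration only.
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("collect-sketch")
                .getOrCreate();

        // A tiny DataFrame with a single "id" column holding 1, 2, 3.
        Dataset<Row> df = spark.range(1, 4).toDF("id");

        // collect() runs the plan and materializes all rows on the driver,
        // so it should only be used for results that fit in driver memory.
        // The cast mirrors the pattern used in the project examples below.
        Row[] rows = (Row[]) df.collect();
        System.out.println("collected " + rows.length + " rows, first id = " + rows[0].getLong(0));

        // collectAsList() is usually more convenient from Java.
        List<Row> rowList = df.collectAsList();
        rowList.forEach(r -> System.out.println(r.getLong(0)));

        spark.stop();
    }
}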
Example 1
Source File: DataSetApplication.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
            .appName("Java Spark SQL")
            .getOrCreate();
    Person person = new Person("spark",10);
    Encoder<Person> encoder = Encoders.bean(Person.class);
    Dataset<Person> dataset = sparkSession.createDataset(Collections.singletonList(person),encoder);
    dataset.show();
    // Final output: {name: spark, age: 10}


    /* Encoders for common types */
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> integerDataset = sparkSession.createDataset(Arrays.asList(1,2),integerEncoder);
    Dataset<Integer> result = integerDataset.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) {
            return value+1;
        }
    },integerEncoder);
    result.collect();
    // Final output: [2, 3]

    /* A DataFrame can be converted to a Dataset by providing a class; the mapping is done by field name */
    String url = "/usr/local/text.json";
    Dataset<Person> personDataset = sparkSession.read().json(url).as(encoder);
    personDataset.show();
    // Final output: name: ...  age: ...
}
 
Example 2
Source File: NFilePruningTest.java    From kylin-on-parquet-v2 with Apache License 2.0
private long assertResultsAndScanFiles(String sql, long numScanFiles) throws Exception {
    Dataset<Row> dataset = queryCubeAndSkipCompute(getProject(), sql);
    dataset.collect();
    long actualNum = findFileSourceScanExec(dataset.queryExecution().sparkPlan()).metrics().get("numFiles").get().value();
    Assert.assertEquals(numScanFiles, actualNum);
    return actualNum;
}
 
Example 3
Source File: TestSparkSchema.java    From iceberg with Apache License 2.0
@Test
public void testSparkReadSchemaIsHonored() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  StructType sparkReadSchema =
      new StructType(
          new StructField[] {
              new StructField("id", DataTypes.IntegerType, true, Metadata.empty())
          }
      );

  Dataset<Row> resultDf = spark.read()
      .schema(sparkReadSchema)
      .format("iceberg")
      .load(tableLocation);

  Row[] results = (Row[]) resultDf.collect();

  Assert.assertEquals("Result size matches", 1, results.length);
  Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
  Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
 
Example 4
Source File: TestSparkSchema.java    From iceberg with Apache License 2.0
@Test
public void testSparkReadSchemaCombinedWithProjection() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  StructType sparkReadSchema =
      new StructType(
          new StructField[] {
              new StructField("id", DataTypes.IntegerType, true, Metadata.empty()),
              new StructField("data", DataTypes.StringType, true, Metadata.empty())
          }
      );

  Dataset<Row> resultDf = spark.read()
      .schema(sparkReadSchema)
      .format("iceberg")
      .load(tableLocation)
      .select("id");

  Row[] results = (Row[]) resultDf.collect();

  Assert.assertEquals("Result size matches", 1, results.length);
  Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
  Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
 
Example 5
Source File: JavaSparkSQLExample.java    From SparkDemo with MIT License
private static void runDatasetCreationExample(SparkSession spark) {
  // $example on:create_ds$
  // Create an instance of a Bean class
  Person person = new Person();
  person.setName("Andy");
  person.setAge(32);

  // Encoders are created for Java beans
  Encoder<Person> personEncoder = Encoders.bean(Person.class);
  Dataset<Person> javaBeanDS = spark.createDataset(
    Collections.singletonList(person),
    personEncoder
  );
  javaBeanDS.show();
  // +---+----+
  // |age|name|
  // +---+----+
  // | 32|Andy|
  // +---+----+

  // Encoders for most common types are provided in class Encoders
  Encoder<Integer> integerEncoder = Encoders.INT();
  Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
  Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
    @Override
    public Integer call(Integer value) throws Exception {
      return value + 1;
    }
  }, integerEncoder);
  transformedDS.collect(); // Returns [2, 3, 4]

  // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
  String path = Constant.LOCAL_FILE_PREX +"/data/resources/people.json";
  Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
  peopleDS.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:create_ds$
}
 
Example 6
Source File: JavaStandaloneIgniteRDDSelfTest.java    From ignite with Apache License 2.0
/**
 * @throws Exception If failed.
 */
@Test
public void testQueryFieldsFromIgnite() throws Exception {
    JavaSparkContext sc = new JavaSparkContext("local[*]", "test");

    try {
        JavaIgniteContext<String, Entity> ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider());

        JavaIgniteRDD<String, Entity> cache = ic.fromCache(ENTITY_CACHE_NAME);

        cache.savePairs(sc.parallelize(F.range(0, 1001), 2).mapToPair(INT_TO_ENTITY_F));

        Dataset<Row> df =
            cache.sql("select id, name, salary from Entity where name = ? and salary = ?", "name50", 5000);

        df.printSchema();

        Row[] res = (Row[])df.collect();

        assertEquals("Invalid result length", 1, res.length);
        assertEquals("Invalid result", 50, res[0].get(0));
        assertEquals("Invalid result", "name50", res[0].get(1));
        assertEquals("Invalid result", 5000, res[0].get(2));

        Column exp = new Column("NAME").equalTo("name50").and(new Column("SALARY").equalTo(5000));

        Dataset<Row> df0 = cache.sql("select id, name, salary from Entity").where(exp);

        df.printSchema();

        Row[] res0 = (Row[])df0.collect();

        assertEquals("Invalid result length", 1, res0.length);
        assertEquals("Invalid result", 50, res0[0].get(0));
        assertEquals("Invalid result", "name50", res0[0].get(1));
        assertEquals("Invalid result", 5000, res0[0].get(2));

        assertEquals("Invalid count", 500, cache.sql("select id from Entity where id > 500").count());
    }
    finally {
        sc.stop();
    }
}
 
Example 7
Source File: JavaEmbeddedIgniteRDDSelfTest.java    From ignite with Apache License 2.0
/**
 * @throws Exception If failed.
 */
@Test
public void testQueryFieldsFromIgnite() throws Exception {
    JavaSparkContext sc = createContext();

    JavaIgniteContext<String, Entity> ic = null;

    try {
        ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider(), false);

        JavaIgniteRDD<String, Entity> cache = ic.fromCache(PARTITIONED_CACHE_NAME);

        cache.savePairs(sc.parallelize(F.range(0, 1001), GRID_CNT).mapToPair(INT_TO_ENTITY_F), true, false);

        Dataset<Row> df =
            cache.sql("select id, name, salary from Entity where name = ? and salary = ?", "name50", 5000);

        df.printSchema();

        Row[] res = (Row[])df.collect();

        assertEquals("Invalid result length", 1, res.length);
        assertEquals("Invalid result", 50, res[0].get(0));
        assertEquals("Invalid result", "name50", res[0].get(1));
        assertEquals("Invalid result", 5000, res[0].get(2));

        Column exp = new Column("NAME").equalTo("name50").and(new Column("SALARY").equalTo(5000));

        Dataset<Row> df0 = cache.sql("select id, name, salary from Entity").where(exp);

        df.printSchema();

        Row[] res0 = (Row[])df0.collect();

        assertEquals("Invalid result length", 1, res0.length);
        assertEquals("Invalid result", 50, res0[0].get(0));
        assertEquals("Invalid result", "name50", res0[0].get(1));
        assertEquals("Invalid result", 5000, res0[0].get(2));

        assertEquals("Invalid count", 500, cache.sql("select id from Entity where id > 500").count());
    }
    finally {
        if (ic != null)
            ic.close(true);

        sc.stop();
    }
}
 
Example 8
Source File: JavaStandaloneIgniteRDDSelfTest.java    From ignite with Apache License 2.0
/**
 * @throws Exception If failed.
 */
@Test
public void testQueryFieldsFromIgnite() throws Exception {
    JavaSparkContext sc = new JavaSparkContext("local[*]", "test");

    try {
        JavaIgniteContext<String, Entity> ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider());

        JavaIgniteRDD<String, Entity> cache = ic.fromCache(ENTITY_CACHE_NAME);

        cache.savePairs(sc.parallelize(F.range(0, 1001), 2).mapToPair(INT_TO_ENTITY_F));

        Dataset<Row> df =
            cache.sql("select id, name, salary from Entity where name = ? and salary = ?", "name50", 5000);

        df.printSchema();

        Row[] res = (Row[])df.collect();

        assertEquals("Invalid result length", 1, res.length);
        assertEquals("Invalid result", 50, res[0].get(0));
        assertEquals("Invalid result", "name50", res[0].get(1));
        assertEquals("Invalid result", 5000, res[0].get(2));

        Column exp = new Column("NAME").equalTo("name50").and(new Column("SALARY").equalTo(5000));

        Dataset<Row> df0 = cache.sql("select id, name, salary from Entity").where(exp);

        df.printSchema();

        Row[] res0 = (Row[])df0.collect();

        assertEquals("Invalid result length", 1, res0.length);
        assertEquals("Invalid result", 50, res0[0].get(0));
        assertEquals("Invalid result", "name50", res0[0].get(1));
        assertEquals("Invalid result", 5000, res0[0].get(2));

        assertEquals("Invalid count", 500, cache.sql("select id from Entity where id > 500").count());
    }
    finally {
        sc.stop();
    }
}
 
Example 9
Source File: JavaEmbeddedIgniteRDDSelfTest.java    From ignite with Apache License 2.0
/**
 * @throws Exception If failed.
 */
@Test
public void testQueryFieldsFromIgnite() throws Exception {
    JavaSparkContext sc = createContext();

    JavaIgniteContext<String, Entity> ic = null;

    try {
        ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider(), false);

        JavaIgniteRDD<String, Entity> cache = ic.fromCache(PARTITIONED_CACHE_NAME);

        cache.savePairs(sc.parallelize(F.range(0, 1001), GRID_CNT).mapToPair(INT_TO_ENTITY_F), true, false);

        Dataset<Row> df =
            cache.sql("select id, name, salary from Entity where name = ? and salary = ?", "name50", 5000);

        df.printSchema();

        Row[] res = (Row[])df.collect();

        assertEquals("Invalid result length", 1, res.length);
        assertEquals("Invalid result", 50, res[0].get(0));
        assertEquals("Invalid result", "name50", res[0].get(1));
        assertEquals("Invalid result", 5000, res[0].get(2));

        Column exp = new Column("NAME").equalTo("name50").and(new Column("SALARY").equalTo(5000));

        Dataset<Row> df0 = cache.sql("select id, name, salary from Entity").where(exp);

        df.printSchema();

        Row[] res0 = (Row[])df0.collect();

        assertEquals("Invalid result length", 1, res0.length);
        assertEquals("Invalid result", 50, res0[0].get(0));
        assertEquals("Invalid result", "name50", res0[0].get(1));
        assertEquals("Invalid result", 5000, res0[0].get(2));

        assertEquals("Invalid count", 500, cache.sql("select id from Entity where id > 500").count());
    }
    finally {
        if (ic != null)
            ic.close(true);

        sc.stop();
    }
}
 
Example 10
Source File: DataFrameIT.java    From spliceengine with GNU Affero General Public License v3.0
public static void testNth(String table, String type, Integer nthRow, Integer nthCol, ResultSet[] resultSets) throws SQLException {

        try{
            Connection conn = DriverManager.getConnection("jdbc:default:connection");
            PreparedStatement pstmt = conn.prepareStatement("select * from " + table.toUpperCase());
            ResultSet res = pstmt.executeQuery();

            // Convert result set to Dataframe
            Dataset<Row> resultSetDF = SparkUtils.resultSetToDF(res);
            //Retrieve nthRow of DataFrame
            org.apache.spark.sql.Row[] r = (Row[])resultSetDF.collect();


            //Retrieve nthRow of ResultSet
            int i = 0;
            Boolean equalsTest = false;
            while(res.next() && i<nthRow){
                i++;
            }
            //System.out.println("Type="+type+"nthrow="+nthRow+" nthcol="+nthCol+" rs="+res.getObject(nthCol)+" i="+i+" df="+r[i]);

            // if either null both have to be null
            if (res.getObject(nthCol) == null) {
                equalsTest = r[i].isNullAt(nthCol-1);
            }
            else if(r[i].isNullAt(nthCol-1)) {
                equalsTest = false;
            }
            else {

                // Test nth element of ResultSet
                switch (type.toLowerCase()){
                    case "string":
                            equalsTest = res.getString(nthCol).equals(r[i].getString(nthCol-1));
                            break;
                    case "integer":
                        equalsTest = res.getInt(nthCol) == r[i].getInt(nthCol-1);
                        break;
                    case "boolean":
                        equalsTest = res.getBoolean(nthCol) == r[i].getBoolean(nthCol-1);
                        break;
                    case "double":
                        equalsTest = res.getDouble(nthCol) == (r[i].getDouble(nthCol-1));
                        break;
                    case "timestamp":
                        equalsTest = res.getTimestamp(nthCol).equals(r[i].getTimestamp(nthCol-1));
                        break;
                    default: equalsTest = false;
                        break;
                }
              }

            // Construct Stored Procedure Result
            List<ExecRow> rows = Lists.newArrayList();
            ExecRow row = new ValueRow(1);
            row.setColumn(1, new SQLBoolean(equalsTest));
            rows.add(row);
            IteratorNoPutResultSet resultsToWrap = wrapResults((EmbedConnection) conn, rows, DATAFRAME_NTH_STORED_PROCEDURE_COLUMN_DECSRIPTOR);
            resultSets[0] = new EmbedResultSet40((EmbedConnection)conn, resultsToWrap, false, null, true);
            conn.close();
        }
        catch (Exception e) {
            throw new SQLException(Throwables.getRootCause(e));
        }
    }