Java Code Examples for org.apache.spark.sql.Dataset#collect()
The following examples show how to use org.apache.spark.sql.Dataset#collect(). Each example is taken from an open-source project; the original source file and license are noted above the code.
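Before the project examples, here is a minimal, self-contained sketch of what Dataset#collect() does: it materializes the whole distributed Dataset as a local array on the driver, so it should only be called on results small enough to fit in driver memory. The class name, app name, and tiny range-based DataFrame below are illustrative placeholders, not taken from any of the projects that follow; from the Java side the return value is typed as Object and is typically cast to Row[], just as in the examples.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CollectSketch {
    public static void main(String[] args) {
        // Local session, for illustration only
        SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName("collect-sketch")
            .getOrCreate();

        // A tiny DataFrame; range(3) yields rows with a single "id" column: 0, 1, 2
        Dataset<Row> df = spark.range(3).toDF("id");

        // collect() brings every row back to the driver as a local array;
        // the cast is needed because the Java signature returns Object
        Row[] rows = (Row[]) df.collect();
        for (Row row : rows) {
            System.out.println(row.getLong(0));
        }

        spark.stop();
    }
}

If a List is more convenient on the Java side, collectAsList() returns a List<T> directly and avoids the cast, but the examples below all use collect().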
Example 1
Source File: DataSetApplication.java From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
            .appName("Java Spark SQL")
            .getOrCreate();
    Person person = new Person("spark", 10);
    Encoder<Person> encoder = Encoders.bean(Person.class);
    Dataset<Person> dataset = sparkSession.createDataset(Collections.singletonList(person), encoder);
    dataset.show();
    // Final output: {name:spark;age:10}

    /* Encoders for common types */
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> integerDataset = sparkSession.createDataset(Arrays.asList(1, 2), integerEncoder);
    Dataset<Integer> result = integerDataset.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) {
            return value + 1;
        }
    }, integerEncoder);
    result.collect();
    // Final output: [2, 3]

    /* A DataFrame can be converted to a Dataset by providing a class; the mapping is based on field names */
    String url = "/usr/local/text.json";
    Dataset<Person> personDataset = sparkSession.read().json(url).as(encoder);
    personDataset.show();
    // Final output: name:... age:...
}
Example 2
Source File: NFilePruningTest.java From kylin-on-parquet-v2 with Apache License 2.0
private long assertResultsAndScanFiles(String sql, long numScanFiles) throws Exception {
    Dataset<Row> dataset = queryCubeAndSkipCompute(getProject(), sql);
    dataset.collect();
    long actualNum = findFileSourceScanExec(dataset.queryExecution().sparkPlan())
            .metrics().get("numFiles").get().value();
    Assert.assertEquals(numScanFiles, actualNum);
    return actualNum;
}
Example 3
Source File: TestSparkSchema.java From iceberg with Apache License 2.0
@Test
public void testSparkReadSchemaIsHonored() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();

    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    tables.create(SCHEMA, spec, null, tableLocation);

    List<SimpleRecord> expectedRecords = Lists.newArrayList(
        new SimpleRecord(1, "a")
    );
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    originalDf.select("id", "data").write()
        .format("iceberg")
        .mode("append")
        .save(tableLocation);

    StructType sparkReadSchema = new StructType(
        new StructField[] {
            new StructField("id", DataTypes.IntegerType, true, Metadata.empty())
        }
    );

    Dataset<Row> resultDf = spark.read()
        .schema(sparkReadSchema)
        .format("iceberg")
        .load(tableLocation);

    Row[] results = (Row[]) resultDf.collect();
    Assert.assertEquals("Result size matches", 1, results.length);
    Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
    Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
Example 4
Source File: TestSparkSchema.java From iceberg with Apache License 2.0
@Test
public void testSparkReadSchemaCombinedWithProjection() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();

    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    tables.create(SCHEMA, spec, null, tableLocation);

    List<SimpleRecord> expectedRecords = Lists.newArrayList(
        new SimpleRecord(1, "a")
    );
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    originalDf.select("id", "data").write()
        .format("iceberg")
        .mode("append")
        .save(tableLocation);

    StructType sparkReadSchema = new StructType(
        new StructField[] {
            new StructField("id", DataTypes.IntegerType, true, Metadata.empty()),
            new StructField("data", DataTypes.StringType, true, Metadata.empty())
        }
    );

    Dataset<Row> resultDf = spark.read()
        .schema(sparkReadSchema)
        .format("iceberg")
        .load(tableLocation)
        .select("id");

    Row[] results = (Row[]) resultDf.collect();
    Assert.assertEquals("Result size matches", 1, results.length);
    Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
    Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
Example 5
Source File: JavaSparkSQLExample.java From SparkDemo with MIT License
private static void runDatasetCreationExample(SparkSession spark) {
    // $example on:create_ds$
    // Create an instance of a Bean class
    Person person = new Person();
    person.setName("Andy");
    person.setAge(32);

    // Encoders are created for Java beans
    Encoder<Person> personEncoder = Encoders.bean(Person.class);
    Dataset<Person> javaBeanDS = spark.createDataset(
        Collections.singletonList(person),
        personEncoder
    );
    javaBeanDS.show();
    // +---+----+
    // |age|name|
    // +---+----+
    // | 32|Andy|
    // +---+----+

    // Encoders for most common types are provided in class Encoders
    Encoder<Integer> integerEncoder = Encoders.INT();
    Dataset<Integer> primitiveDS = spark.createDataset(Arrays.asList(1, 2, 3), integerEncoder);
    Dataset<Integer> transformedDS = primitiveDS.map(new MapFunction<Integer, Integer>() {
        @Override
        public Integer call(Integer value) throws Exception {
            return value + 1;
        }
    }, integerEncoder);
    transformedDS.collect(); // Returns [2, 3, 4]

    // DataFrames can be converted to a Dataset by providing a class. Mapping based on name
    String path = Constant.LOCAL_FILE_PREX + "/data/resources/people.json";
    Dataset<Person> peopleDS = spark.read().json(path).as(personEncoder);
    peopleDS.show();
    // +----+-------+
    // | age|   name|
    // +----+-------+
    // |null|Michael|
    // |  30|   Andy|
    // |  19| Justin|
    // +----+-------+
    // $example off:create_ds$
}
Example 6
Source File: JavaStandaloneIgniteRDDSelfTest.java From ignite with Apache License 2.0
/**
 * @throws Exception If failed.
 */
@Test
public void testQueryFieldsFromIgnite() throws Exception {
    JavaSparkContext sc = new JavaSparkContext("local[*]", "test");

    try {
        JavaIgniteContext<String, Entity> ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider());

        JavaIgniteRDD<String, Entity> cache = ic.fromCache(ENTITY_CACHE_NAME);

        cache.savePairs(sc.parallelize(F.range(0, 1001), 2).mapToPair(INT_TO_ENTITY_F));

        Dataset<Row> df = cache.sql("select id, name, salary from Entity where name = ? and salary = ?", "name50", 5000);

        df.printSchema();

        Row[] res = (Row[])df.collect();

        assertEquals("Invalid result length", 1, res.length);
        assertEquals("Invalid result", 50, res[0].get(0));
        assertEquals("Invalid result", "name50", res[0].get(1));
        assertEquals("Invalid result", 5000, res[0].get(2));

        Column exp = new Column("NAME").equalTo("name50").and(new Column("SALARY").equalTo(5000));

        Dataset<Row> df0 = cache.sql("select id, name, salary from Entity").where(exp);

        df.printSchema();

        Row[] res0 = (Row[])df0.collect();

        assertEquals("Invalid result length", 1, res0.length);
        assertEquals("Invalid result", 50, res0[0].get(0));
        assertEquals("Invalid result", "name50", res0[0].get(1));
        assertEquals("Invalid result", 5000, res0[0].get(2));

        assertEquals("Invalid count", 500, cache.sql("select id from Entity where id > 500").count());
    }
    finally {
        sc.stop();
    }
}
Example 7
Source File: JavaEmbeddedIgniteRDDSelfTest.java From ignite with Apache License 2.0
/**
 * @throws Exception If failed.
 */
@Test
public void testQueryFieldsFromIgnite() throws Exception {
    JavaSparkContext sc = createContext();

    JavaIgniteContext<String, Entity> ic = null;

    try {
        ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider(), false);

        JavaIgniteRDD<String, Entity> cache = ic.fromCache(PARTITIONED_CACHE_NAME);

        cache.savePairs(sc.parallelize(F.range(0, 1001), GRID_CNT).mapToPair(INT_TO_ENTITY_F), true, false);

        Dataset<Row> df = cache.sql("select id, name, salary from Entity where name = ? and salary = ?", "name50", 5000);

        df.printSchema();

        Row[] res = (Row[])df.collect();

        assertEquals("Invalid result length", 1, res.length);
        assertEquals("Invalid result", 50, res[0].get(0));
        assertEquals("Invalid result", "name50", res[0].get(1));
        assertEquals("Invalid result", 5000, res[0].get(2));

        Column exp = new Column("NAME").equalTo("name50").and(new Column("SALARY").equalTo(5000));

        Dataset<Row> df0 = cache.sql("select id, name, salary from Entity").where(exp);

        df.printSchema();

        Row[] res0 = (Row[])df0.collect();

        assertEquals("Invalid result length", 1, res0.length);
        assertEquals("Invalid result", 50, res0[0].get(0));
        assertEquals("Invalid result", "name50", res0[0].get(1));
        assertEquals("Invalid result", 5000, res0[0].get(2));

        assertEquals("Invalid count", 500, cache.sql("select id from Entity where id > 500").count());
    }
    finally {
        if (ic != null)
            ic.close(true);

        sc.stop();
    }
}
Example 8
Source File: DataFrameIT.java From spliceengine with GNU Affero General Public License v3.0
public static void testNth(String table, String type, Integer nthRow, Integer nthCol, ResultSet[] resultSets) throws SQLException {
    try {
        Connection conn = DriverManager.getConnection("jdbc:default:connection");
        PreparedStatement pstmt = conn.prepareStatement("select * from " + table.toUpperCase());
        ResultSet res = pstmt.executeQuery();

        // Convert result set to Dataframe
        Dataset<Row> resultSetDF = SparkUtils.resultSetToDF(res);

        // Retrieve nthRow of DataFrame
        org.apache.spark.sql.Row[] r = (Row[]) resultSetDF.collect();

        // Retrieve nthRow of ResultSet
        int i = 0;
        Boolean equalsTest = false;
        while (res.next() && i < nthRow) {
            i++;
        }
        //System.out.println("Type="+type+" nthrow="+nthRow+" nthcol="+nthCol+" rs="+res.getObject(nthCol)+" i="+i+" df="+r[i]);

        // if either null both have to be null
        if (res.getObject(nthCol) == null) {
            equalsTest = r[i].isNullAt(nthCol - 1);
        }
        else if (r[i].isNullAt(nthCol - 1)) {
            equalsTest = false;
        }
        else {
            // Test nth element of ResultSet
            switch (type.toLowerCase()) {
                case "string":
                    equalsTest = res.getString(nthCol).equals(r[i].getString(nthCol - 1));
                    break;
                case "integer":
                    equalsTest = res.getInt(nthCol) == r[i].getInt(nthCol - 1);
                    break;
                case "boolean":
                    equalsTest = res.getBoolean(nthCol) == r[i].getBoolean(nthCol - 1);
                    break;
                case "double":
                    equalsTest = res.getDouble(nthCol) == r[i].getDouble(nthCol - 1);
                    break;
                case "timestamp":
                    equalsTest = res.getTimestamp(nthCol).equals(r[i].getTimestamp(nthCol - 1));
                    break;
                default:
                    equalsTest = false;
                    break;
            }
        }

        // Construct Stored Procedure Result
        List<ExecRow> rows = Lists.newArrayList();
        ExecRow row = new ValueRow(1);
        row.setColumn(1, new SQLBoolean(equalsTest));
        rows.add(row);
        IteratorNoPutResultSet resultsToWrap = wrapResults((EmbedConnection) conn, rows, DATAFRAME_NTH_STORED_PROCEDURE_COLUMN_DECSRIPTOR);
        resultSets[0] = new EmbedResultSet40((EmbedConnection) conn, resultsToWrap, false, null, true);
        conn.close();
    } catch (Exception e) {
        throw new SQLException(Throwables.getRootCause(e));
    }
}