package dataset;

import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import scala.Tuple3;

import java.util.Arrays;
import java.util.List;

import static org.apache.spark.sql.functions.col;

//
// Create Spark Datasets from a list of integers and from a list of tuples.
// The inferred schemas don't have convenient column names ("value" for the
// integer column, "_1", "_2", ... for the tuple columns), but those generated
// names can still be used to query the data conveniently.
//
public class Basic {
    public static void main(String[] args) {
        SparkSession spark = SparkSession
            .builder()
            .appName("Dataset-Basic")
            .master("local[4]")
            .getOrCreate();

        List<Integer> data = Arrays.asList(10, 11, 12, 13, 14, 15);
        Dataset<Integer> ds = spark.createDataset(data, Encoders.INT());

        System.out.println("*** only one column, and it always has the same name");
        ds.printSchema();

        ds.show();
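
        // a small aside: the implicit column ("value", as printSchema() above
        // shows) can be used in untyped, column-based queries like any other
        // column -- the expression below is just an illustrative choice
        System.out.println("*** values incremented via the implicit column");
        ds.select(col("value").plus(1)).show();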

        System.out.println("*** values > 12");

        // the harder way to filter: a typed lambda (the cast disambiguates
        // between the FilterFunction and Scala Function1 overloads of filter)
        Dataset<Integer> ds2 =
            ds.filter((FilterFunction<Integer>) value -> value > 12);

        ds2.show();
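
        // the easier way to filter: a SQL-style predicate string; this sketch
        // assumes the implicit column name "value" shown in the schema above
        System.out.println("*** values > 12, the easier way");
        Dataset<Integer> ds3 = ds.filter("value > 12");

        ds3.show();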

        List<Tuple3<Integer, String, String>> tuples =
            Arrays.asList(
                new Tuple3<>(1, "one", "un"),
                new Tuple3<>(2, "two", "deux"),
                new Tuple3<>(3, "three", "trois"));

        Encoder<Tuple3<Integer, String, String>> encoder =
            Encoders.tuple(Encoders.INT(), Encoders.STRING(), Encoders.STRING());

        Dataset<Tuple3<Integer, String, String>> tupleDS =
            spark.createDataset(tuples, encoder);

        System.out.println("*** Tuple Dataset types");
        tupleDS.printSchema();

        // the tuple columns have unfriendly names, but you can use them to query
        System.out.println("*** filter by one column and fetch another");
        tupleDS.where(col("_1").gt(2)).select(col("_2"), col("_3")).show();
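
        // an extra sketch: toDF() can swap the generated tuple column names
        // for friendlier ones (the names below are arbitrary choices); note
        // that the result is an untyped Dataset<Row>, not a typed Dataset
        System.out.println("*** the same query with friendly column names");
        tupleDS.toDF("number", "english", "french")
            .where(col("number").gt(2))
            .select(col("english"), col("french"))
            .show();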

        spark.stop();
    }
}