package dataframe;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.Arrays;
import java.util.List;

import static org.apache.spark.sql.functions.col;

//
// Note that conceptually a DataFrame is a Dataset<Row>, but the Java API
// doesn't actually define a DataFrame type.
//
// Create a Spark Dataset<Row> from a list of Row instances and a schema
// constructed explicitly. Query it.
//
// This example is fundamental for Dataset<Row> because the schema is
// created explicitly instead of being inferred via an Encoder as in the
// Dataset examples.
//
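// For contrast, the Encoder-based approach used in the Dataset examples
// looks roughly like this (a sketch only, assuming a Customer bean class;
// not used in this file):
//
//     Dataset<Customer> ds =
//         spark.createDataset(customers, Encoders.bean(Customer.class));
//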
public class FromRowsAndSchema {
    public static void main(String[] args) {
        SparkSession spark = SparkSession
            .builder()
            .appName("DataFrame-FromRowsAndSchema")
            .master("local[4]")
            .getOrCreate();

        List<Row> customerRows = Arrays.asList(
            RowFactory.create(1, "Widget Co", 120000.00, 0.00, "AZ"),
            RowFactory.create(2, "Acme Widgets", 410500.00, 500.00, "CA"),
            RowFactory.create(3, "Widgetry", 410500.00, 200.00, "CA"),
            RowFactory.create(4, "Widgets R Us", 410500.00, 0.00, "CA"),
            RowFactory.create(5, "Ye Olde Widgete", 500.00, 0.00, "MA")
        );

        List<StructField> fields = Arrays.asList(
            DataTypes.createStructField("id", DataTypes.IntegerType, true),
            DataTypes.createStructField("name", DataTypes.StringType, true),
            DataTypes.createStructField("sales", DataTypes.DoubleType, true),
            DataTypes.createStructField("discount", DataTypes.DoubleType, true),
            DataTypes.createStructField("state", DataTypes.StringType, true)
        );
        StructType customerSchema = DataTypes.createStructType(fields);
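
        // For comparison, the same schema could also be built fluently on
        // StructType itself; an equivalent sketch (not used below, and
        // add() defaults the fields to nullable):
        //
        //     StructType alt = new StructType()
        //         .add("id", DataTypes.IntegerType)
        //         .add("name", DataTypes.StringType)
        //         .add("sales", DataTypes.DoubleType)
        //         .add("discount", DataTypes.DoubleType)
        //         .add("state", DataTypes.StringType);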

        Dataset<Row> customerDF =
            spark.createDataFrame(customerRows, customerSchema);

        System.out.println("*** the schema created");
        customerDF.printSchema();

        System.out.println("*** the data");
        customerDF.show();

        System.out.println("*** just the rows from CA");
        customerDF.filter(col("state").equalTo("CA")).show();
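
        // A further illustrative query (an addition to the original
        // example): project a derived column using Column expressions.
        System.out.println("*** names with sales net of discount");
        customerDF.select(
            col("name"),
            col("sales").minus(col("discount")).as("netSales")
        ).show();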

        spark.stop();
    }
}