Java Code Examples for org.apache.spark.sql.Dataset#createOrReplaceTempView()

The following examples show how to use org.apache.spark.sql.Dataset#createOrReplaceTempView(). The source project and license are noted above each example.
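In brief, createOrReplaceTempView(String viewName) registers a Dataset as a temporary view scoped to the current SparkSession, replacing any existing temporary view of the same name; the view can then be queried with spark.sql(...). Below is a minimal sketch of the pattern, with an illustrative file path and view name (imports from org.apache.spark.sql are omitted, as in the examples that follow):

SparkSession spark = SparkSession.builder().appName("temp-view-sketch").getOrCreate();

// Load any DataFrame; a JSON file is used here only for illustration.
Dataset<Row> people = spark.read().json("people.json");

// Register (or replace) the session-scoped temporary view "people".
people.createOrReplaceTempView("people");

// The view is now visible to SQL run on the same session.
spark.sql("SELECT name FROM people WHERE age > 21").show();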
Example 1
Source File: JavaShakespeare.java    From spark-bigquery-connector with Apache License 2.0
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
            .appName("spark-bigquery-demo")
            .getOrCreate();

    // Use the Cloud Storage bucket for temporary BigQuery export data used
    // by the connector. This assumes the Cloud Storage connector for
    // Hadoop is configured.
    String bucket = spark.sparkContext().hadoopConfiguration().get("fs.gs.system.bucket");
    spark.conf().set("temporaryGcsBucket", bucket);

    // Load data in from BigQuery.
    Dataset<Row> wordsDF = spark.read().format("bigquery")
            .option("table", "bigquery-public-data.samples.shakespeare").load().cache();
    wordsDF.show();
    wordsDF.printSchema();
    wordsDF.createOrReplaceTempView("words");

    // Perform word count.
    Dataset<Row> wordCountDF = spark.sql(
            "SELECT word, SUM(word_count) AS word_count FROM words GROUP BY word");

    // Saving the data to BigQuery
    wordCountDF.write().format("bigquery").option("table", "wordcount_dataset.wordcount_output")
            .save();
}
 
Example 2
Source File: JavaIgniteDataFrameExample.java    From ignite with Apache License 2.0
/** */
private static void nativeSparkSqlExample(SparkSession spark) {
    System.out.println("Querying using Spark SQL.");

    Dataset<Row> df = spark.read()
            .format(IgniteDataFrameSettings.FORMAT_IGNITE()) //Data source type.
            .option(IgniteDataFrameSettings.OPTION_TABLE(), "person") //Table to read.
            .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG) //Ignite config.
            .load();

    //Registering DataFrame as Spark view.
    df.createOrReplaceTempView("person");

    //Selecting data from Ignite through Spark SQL Engine.
    Dataset<Row> igniteDF = spark.sql("SELECT * FROM person WHERE id >= 2 AND name = 'Mary Major'");

    System.out.println("Result schema:");

    igniteDF.printSchema(); //Printing query schema to console.

    System.out.println("Result content:");

    igniteDF.show(); //Printing query results to console.
}
 
Example 3
Source File: TestSuite.java    From stocator with Apache License 2.0
public void test3(SparkSession spark, Dataset<Row> schemaFlights, String outCSV1) throws Exception {
  try {
    System.out.println("*********************************");
    System.out.println("T3: Run SQL query on the dataset");
    schemaFlights.createOrReplaceTempView("flights");
    Dataset<Row> results = spark.sql(
        "select year, month, count(*) as Count  from flights WHERE cancellationCode like 'B' GROUP BY year, month ORDER By year, month");
    System.out.println("*********************************");
    System.out.println("T3: Save query result as CSV object: ");
    System.out.println("T3 " + outCSV1);
    results.write().mode("overwrite").format("com.databricks.spark.csv").save(outCSV1);
    // read and compare
    System.out.println("*********************************");
    System.out.println("T3: Read query CSV object and compare");
    JavaRDD<String> csvReadBack = spark.read().textFile(outCSV1).javaRDD();
    countAndCompare(results, csvReadBack.count(), "T3");
  } catch (Exception e) {
    throw e;
  } finally {
    deleteData(outCSV1, spark.sparkContext().hadoopConfiguration(), dataCreate);
  }

}
 
Example 4
Source File: CustomDataFrame.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local")
            .appName("spark app")
            .getOrCreate();

    // Create a plain JavaRDD
    JavaRDD<String> javaRDD = sparkSession.sparkContext().textFile("URL", 1).toJavaRDD();
    // The schema, encoded as a string
    String schema = "name age";

    // Generate the schema from the schema string
    List<StructField> structFieldList = new ArrayList<>();
    for (String fieldName : schema.split(" ")) {
        StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        structFieldList.add(structField);
    }
    StructType structType = DataTypes.createStructType(structFieldList);

    JavaRDD<Row> rowJavaRDD = javaRDD.map(new Function<String, Row>() {
        @Override
        public Row call(String v1) {
            String[] attributes = v1.split(",");
            return RowFactory.create(attributes[0], attributes[1].trim());
        }
    });

    // Apply the schema to the RDD
    Dataset<Row> dataset = sparkSession.createDataFrame(rowJavaRDD, structType);

    // Create a temporary view
    dataset.createOrReplaceTempView("user");
    Dataset<Row> result = sparkSession.sql("select * from user");
    result.show();
}
 
Example 5
Source File: CustomReportService.java    From mmtf-spark with Apache License 2.0
/**
 * Concatenates the structureId and chainId fields into a single
 * structureChainId key if a chainId field is present.
 *
 * @param spark the Spark session used to run the SQL query
 * @param dataset the dataset to transform
 * @param columnNames the column names to retain in the result
 * @return the dataset with the concatenated key, or the original dataset if it has no chainId column
 */
private static Dataset<Row> concatIds(SparkSession spark, Dataset<Row> dataset, String[] columnNames) {
	if (Arrays.asList(dataset.columns()).contains("chainId")) {
		dataset.createOrReplaceTempView("table");

		String sql = "SELECT CONCAT(structureId,'.',chainId) as structureChainId," + "structureId,chainId,"
				+ columNamesString(columnNames) + " from table";
		dataset = spark.sql(sql);
	}
	return dataset;
}
 
Example 6
Source File: CustomReportDemo.java    From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException if custom report web service fails
 */
public static void main(String[] args) throws IOException {    
    long start = System.nanoTime();
    
    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(CustomReportDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
   
    // retrieve PDB annotation: Binding affinities (Ki, Kd), 
    // group name of the ligand (hetId), and the 
    // Enzyme Classification number (ecNo)
    Dataset<Row> ds = CustomReportService.getDataset("Ki","Kd","hetId","ecNo");
    
    // show the schema of this dataset
    ds.printSchema();
        
    // select structures that have either a Ki or a Kd value and
    // are protein-serine/threonine kinases (EC 2.7.11.*):
    
    // A. by using dataset operations
    ds = ds.filter("(Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'");
    ds.show(10);
     
    // B. by creating a temporary view and running SQL
    ds.createOrReplaceTempView("table");
    ds = ds.sparkSession().sql("SELECT * from table WHERE (Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'");
    ds.show(10);
    
    long end = System.nanoTime();
    
    System.out.println("Time:     " + (end-start)/1E9 + "sec.");
    
    sc.close();
}
 
Example 7
Source File: Loader.java    From AWS-MIMIC-IIItoOMOP with Apache License 2.0
public void loadSourceFiles(Mapping mapping)
{
    for(String file : mapping.getSourceFiles())
    {
        // Derive a view name from the file's directory path (or from its base name for a flat file).
        String table = file.contains("/") ? file.substring(0, file.lastIndexOf("/")) : file.substring(0, file.lastIndexOf("."));
        // Read the CSV object from the source bucket, treating empty values as nulls.
        Dataset<Row> frame = spark.read().format("com.databricks.spark.csv")
                .option("header", "true").option("inferSchema", "false")
                .option("nullValue", "").option("treatEmptyValuesAsNulls", "true")
                .load(configuration.getFileSystem() + "://" + configuration.getSourceBucket() + "/" + file);
        // Register the DataFrame as a temp view, normalizing "/" and "-" to "_" in the name.
        frame.createOrReplaceTempView(table.replace("/", "_").replace("-", "_"));
    }
}
 
Example 8
Source File: SparkSessionHeloWorld.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
	System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
	SparkSession sparkSession = SparkSession.builder()
			.master("local")
			.appName("CSV Read Example")
			.config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse")
			.getOrCreate();
	
	Dataset<Row> csv = sparkSession.read().format("com.databricks.spark.csv").option("header","true")
			.load("C:\\Users\\sgulati\\Documents\\my_docs\\book\\testdata\\emp.csv");
	
	csv.createOrReplaceTempView("test");
	Dataset<Row> sql = sparkSession.sql("select * from test");
	sql.collectAsList();
}
 
Example 9
Source File: SparkStreamingSqlAnalyse.java    From sylph with Apache License 2.0
/**
 * Pre-compile the SQL so that errors are caught up front instead of at runtime.
 */
private static void checkDStream(
        SparkSession spark,
        String sourceTableName,
        StructType sourceSchema,
        List<Consumer<SparkSession>> handlers
)
{
    RDD<Row> rdd = spark.sparkContext().<Row>emptyRDD(ClassTag$.MODULE$.<Row>apply(Row.class));
    Dataset<Row> df = spark.createDataFrame(rdd, sourceSchema);
    df.createOrReplaceTempView(sourceTableName);
    handlers.forEach(x -> x.accept(spark));
    spark.sql("drop view " + sourceTableName);
}
 
Example 10
Source File: JavaSQLDataSourceExample.java    From SparkDemo with MIT License
private static void runJsonDatasetExample(SparkSession spark) {
  // $example on:json_dataset$
  // A JSON dataset is pointed to by path.
  // The path can be either a single text file or a directory storing text files
  Dataset<Row> people = spark.read().json(Constant.LOCAL_FILE_PREX +"/data/resources/people.json");

  // The inferred schema can be visualized using the printSchema() method
  people.printSchema();
  // root
  //  |-- age: long (nullable = true)
  //  |-- name: string (nullable = true)

  // Creates a temporary view using the DataFrame
  people.createOrReplaceTempView("people");

  // SQL statements can be run by using the sql methods provided by spark
  Dataset<Row> namesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");
  namesDF.show();
  // +------+
  // |  name|
  // +------+
  // |Justin|
  // +------+

  // Alternatively, a DataFrame can be created for a JSON dataset represented by
  // an RDD[String] storing one JSON object per string.
  List<String> jsonData = Arrays.asList(
          "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
  JavaRDD<String> anotherPeopleRDD =
          new JavaSparkContext(spark.sparkContext()).parallelize(jsonData);
  Dataset<Row> anotherPeople = spark.read().json(anotherPeopleRDD);
  anotherPeople.show();
  // +---------------+----+
  // |        address|name|
  // +---------------+----+
  // |[Columbus,Ohio]| Yin|
  // +---------------+----+
  // $example off:json_dataset$
}
 
Example 11
Source File: JavaSQLDataSourceExample.java    From SparkDemo with MIT License
private static void runBasicParquetExample(SparkSession spark) {
  // $example on:basic_parquet_example$
  Dataset<Row> peopleDF = spark.read().json(Constant.LOCAL_FILE_PREX +"/data/resources/people.json");

  // DataFrames can be saved as Parquet files, maintaining the schema information
  peopleDF.write().parquet("people.parquet");

  // Read in the Parquet file created above.
  // Parquet files are self-describing so the schema is preserved
  // The result of loading a parquet file is also a DataFrame
  Dataset<Row> parquetFileDF = spark.read().parquet("people.parquet");

  // Parquet files can also be used to create a temporary view and then used in SQL statements
  parquetFileDF.createOrReplaceTempView("parquetFile");
  Dataset<Row> namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19");
  Dataset<String> namesDS = namesDF.map(new MapFunction<Row, String>() {
    public String call(Row row) {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +------------+
  // |       value|
  // +------------+
  // |Name: Justin|
  // +------------+
  // $example off:basic_parquet_example$
}
 
Example 12
Source File: ReadAndWriteTablesTest.java    From iceberg with Apache License 2.0
@Test
public void readFromPartitionedTableWithFilter() {
  table = tables.create(schema, pathToTable.toString());

  Dataset<Row> results = spark.read()
      .format("iceberg")
      .load(pathToTable.toString())
      .filter("data != \"b\"");

  results.createOrReplaceTempView("table");
  spark.sql("SELECT * FROM table").show();
}
 
Example 13
Source File: SparkDataSet.java    From spliceengine with GNU Affero General Public License v3.0
@Override @SuppressWarnings({ "unchecked", "rawtypes" })
public void pin(ExecRow template, long conglomId) throws StandardException {
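    // Build a DataFrame over the RDD of rows, register it as a temp view named after the
    // conglomerate id, and cache the view so subsequent reads are served from memory.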
    Dataset<Row> pinDF = SpliceSpark.getSession().createDataFrame(
            rdd.map(new LocatedRowToRowFunction()),
            template.schema());
    pinDF.createOrReplaceTempView("SPLICE_"+conglomId);
    SpliceSpark.getSession().catalog().cacheTable("SPLICE_"+conglomId);
}
 
Example 14
Source File: DataFrameLoader.java    From ExecDashboard with Apache License 2.0
public static void loadDataFrame(String collectionName, JavaSparkContext javaSparkContext) {
    if (StringUtils.isEmpty(collectionName) || (javaSparkContext == null)) { return; }

    Map<String, String> readOverrides = new HashMap<>();
    readOverrides.put("collection", collectionName);
    ReadConfig readConfig = ReadConfig.create(javaSparkContext).withOptions(readOverrides);
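    // Load the collection into a DataFrame and expose it as a temp view under the collection name.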
    Dataset<Row> frame = MongoSpark.load(javaSparkContext, readConfig).toDF();
    frame.createOrReplaceTempView(collectionName);
}
 
Example 15
Source File: AbstractJavaEsSparkSQLTest.java    From elasticsearch-hadoop with Apache License 2.0
@Test
public void testBasicRead() throws Exception {
    Dataset<Row> dataset = artistsAsDataset();
    assertTrue(dataset.count() > 300);
    dataset.createOrReplaceTempView("datfile");
    assertEquals(5, ((Object[]) dataset.take(5)).length);
    Dataset<Row> results = sqc
            .sql("SELECT name FROM datfile WHERE id >=1 AND id <=10");
    assertEquals(10, ((Object[]) results.take(10)).length);
}
 
Example 16
Source File: JavaSparkSQLExample.java    From SparkDemo with MIT License
private static void runProgrammaticSchemaExample(SparkSession spark) {
  // $example on:programmatic_schema$
  // Create an RDD
  JavaRDD<String> peopleRDD = spark.sparkContext()
    .textFile(Constant.LOCAL_FILE_PREX +"/data/resources/people.txt", 1)
    .toJavaRDD();

  // The schema is encoded in a string
  String schemaString = "name age";

  // Generate the schema based on the string of schema
  List<StructField> fields = new ArrayList<>();
  for (String fieldName : schemaString.split(" ")) {
    StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
    fields.add(field);
  }
  StructType schema = DataTypes.createStructType(fields);

  // Convert records of the RDD (people) to Rows
  JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
    @Override
    public Row call(String record) throws Exception {
      String[] attributes = record.split(",");
      return RowFactory.create(attributes[0], attributes[1].trim());
    }
  });

  // Apply the schema to the RDD
  Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

  // Creates a temporary view using the DataFrame
  peopleDataFrame.createOrReplaceTempView("people");

  // SQL can be run over a temporary view created using DataFrames
  Dataset<Row> results = spark.sql("SELECT name FROM people");

  // The results of SQL queries are DataFrames and support all the normal RDD operations
  // The columns of a row in the result can be accessed by field index or by field name
  Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +-------------+
  // |        value|
  // +-------------+
  // |Name: Michael|
  // |   Name: Andy|
  // | Name: Justin|
  // +-------------+
  // $example off:programmatic_schema$
}
 
Example 17
Source File: DatasetOperations.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws AnalysisException {
	//Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
	 System.setProperty("hadoop.home.dir", "E:\\hadoop");
	
	 //Build a Spark Session	
      SparkSession sparkSession = SparkSession
      .builder()
      .master("local")
	  .config("spark.sql.warehouse.dir","file:///E:/hadoop/warehouse")
      .appName("DatasetOperations")
      //.enableHiveSupport()
      .getOrCreate();
      Logger rootLogger = LogManager.getRootLogger();
	  rootLogger.setLevel(Level.WARN); 
	  //Create a RDD
	  JavaRDD<String> deptRDD = sparkSession.sparkContext()
			  .textFile("src/main/resources/dept.txt", 1)
			  .toJavaRDD();

	  //Convert the RDD to RDD<Rows>
	 JavaRDD<Row> deptRows = deptRDD.filter(str-> !str.contains("deptno")).map(new Function<String, Row>() {
		private static final long serialVersionUID = 1L;
		@Override
		public Row call(String rowString) throws Exception {
			String[] cols = rowString.split(",");
		    return RowFactory.create(cols[0].trim(), cols[1].trim(),cols[2].trim());
		}
	});
	  
	  //Create schema 		  
	  String[] schemaArr=deptRDD.first().split(",");
	  List<StructField> structFieldList = new ArrayList<>();
	  for (String fieldName : schemaArr) {
	    StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
	    structFieldList.add(structField);
	  }
	  StructType schema = DataTypes.createStructType(structFieldList);
	  
	  Dataset<Row> deptDf = sparkSession.createDataFrame(deptRows, schema);
	  deptDf.printSchema();
	  deptDf.show();
	  
	  deptDf.createOrReplaceTempView("dept");	
	  
	  Dataset<Row> result = sparkSession.sql("select loc,count(loc) from dept  where deptno > 10 group by loc" );
	  result.show();
	  
	  
	 // sparkSession.newSession().sql("SELECT * FROM dept").show();
	  
	  
        deptDf.createGlobalTempView("dept_global_view");
	  
	  sparkSession.newSession().sql("SELECT deptno,dname,loc, rank() OVER (PARTITION BY loc ORDER BY deptno ) FROM global_temp.dept_global_view").show();
	 
	//  sparkSession.newSession().sql("SELECT * FROM dept_global_view").show();
	  
	  deptDf.write().mode(SaveMode.Overwrite).json("src/main/resources/output/dept");
	  deptDf.write().mode(SaveMode.Overwrite).format("csv").save("src/main/resources/output/deptText");
	  deptDf.write().mode("overwrite").format("csv").save("src/main/resources/output/deptText");
	 
  
	  deptDf.write().mode(SaveMode.Overwrite).format("csv").saveAsTable("Department");
	  deptDf.write().mode(SaveMode.Overwrite).format("csv").option("path", "file:///E:/hadoop/bin").saveAsTable("Department");
	  
	// Read the CSV data
		 Dataset<Row> emp_ds = sparkSession.read()
				 .format("csv")
   		         .option("header", "true")
   		         .option("inferSchema", "true")
   		         .load("src/main/resources/employee.txt");    
		 
		 emp_ds.printSchema();
		 emp_ds.show();
		 
		emp_ds.select("empName" ,"empId").show();
		
		emp_ds.select(col("empName").name("Employee Name") ,col("empId").cast(DataTypes.IntegerType).name("Employee Id")).show();
		
		emp_ds.sort(col("empId").asc()).filter(col("salary").gt("2500"));
		
		emp_ds.select("job").groupBy(col("job")).count().show();
		
		//emp_ds.as("A").join(deptDf.as("B"),col("deptno"),"left").printSchema();

		emp_ds.as("A").join(deptDf.as("B"),emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"left").select("A.empId","A.empName","A.job","A.manager","A.hiredate","A.salary","A.comm","A.deptno","B.dname","B.loc").show();
		
		emp_ds.join(deptDf,emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"right").show();			
		emp_ds.join(deptDf,emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"right").logicalPlan();
		
		emp_ds.join(deptDf,emp_ds.col("deptno").equalTo(deptDf.col("deptno")),"right").explain();
		 
          sparkSession.sql("show functions").show(false);
          sparkSession.sql("DESCRIBE FUNCTION add_months").show(false);
          sparkSession.sql("DESCRIBE FUNCTION EXTENDED add_months").show(false);
          
         
}
 
Example 18
Source File: JavaSparkSQLExample.java    From SparkDemo with MIT License
private static void runBasicDataFrameExample(SparkSession spark) throws AnalysisException {
  // $example on:create_df$
  Dataset<Row> df = spark.read().json(Constant.LOCAL_FILE_PREX +"/data/resources/people.json");

  // Displays the content of the DataFrame to stdout
  df.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:create_df$

  // $example on:untyped_ops$
  // Print the schema in a tree format
  df.printSchema();
  // root
  // |-- age: long (nullable = true)
  // |-- name: string (nullable = true)

  // Select only the "name" column
  df.select("name").show();
  // +-------+
  // |   name|
  // +-------+
  // |Michael|
  // |   Andy|
  // | Justin|
  // +-------+

  // Select everybody, but increment the age by 1
  df.select(col("name"), col("age").plus(1)).show();
  // +-------+---------+
  // |   name|(age + 1)|
  // +-------+---------+
  // |Michael|     null|
  // |   Andy|       31|
  // | Justin|       20|
  // +-------+---------+

  // Select people older than 21
  df.filter(col("age").gt(21)).show();
  // +---+----+
  // |age|name|
  // +---+----+
  // | 30|Andy|
  // +---+----+

  // Count people by age
  df.groupBy("age").count().show();
  // +----+-----+
  // | age|count|
  // +----+-----+
  // |  19|    1|
  // |null|    1|
  // |  30|    1|
  // +----+-----+
  // $example off:untyped_ops$

  // $example on:run_sql$
  // Register the DataFrame as a SQL temporary view
  df.createOrReplaceTempView("people");

  Dataset<Row> sqlDF = spark.sql("SELECT * FROM people");
  sqlDF.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:run_sql$

  // $example on:global_temp_view$
  // Register the DataFrame as a global temporary view
  df.createGlobalTempView("people");

  // Global temporary view is tied to a system preserved database `global_temp`
  spark.sql("SELECT * FROM global_temp.people").show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+

  // Global temporary view is cross-session
  spark.newSession().sql("SELECT * FROM global_temp.people").show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:global_temp_view$
}
 
Example 19
Source File: JavaSparkHiveExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on:spark_hive$
  // warehouseLocation points to the default location for managed databases and tables
  String warehouseLocation = "spark-warehouse";
  SparkSession spark = SparkSession
    .builder()
    .appName("Java Spark Hive Example")
    .config("spark.sql.warehouse.dir", warehouseLocation)
    .enableHiveSupport()
    .getOrCreate();

  spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
  spark.sql("LOAD DATA LOCAL INPATH 'data/resources/kv1.txt' INTO TABLE src");

  // Queries are expressed in HiveQL
  spark.sql("SELECT * FROM src").show();
  // +---+-------+
  // |key|  value|
  // +---+-------+
  // |238|val_238|
  // | 86| val_86|
  // |311|val_311|
  // ...

  // Aggregation queries are also supported.
  spark.sql("SELECT COUNT(*) FROM src").show();
  // +--------+
  // |count(1)|
  // +--------+
  // |    500 |
  // +--------+

  // The results of SQL queries are themselves DataFrames and support all normal functions.
  Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");

  // The items in DataFrames are of type Row, which lets you access each column by ordinal.
  Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Key: " + row.get(0) + ", Value: " + row.get(1);
    }
  }, Encoders.STRING());
  stringsDS.show();
  // +--------------------+
  // |               value|
  // +--------------------+
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // ...

  // You can also use DataFrames to create temporary views within a SparkSession.
  List<Record> records = new ArrayList<>();
  for (int key = 1; key < 100; key++) {
    Record record = new Record();
    record.setKey(key);
    record.setValue("val_" + key);
    records.add(record);
  }
  Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
  recordsDF.createOrReplaceTempView("records");

  // Queries can then join DataFrames data with data stored in Hive.
  spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
  // +---+------+---+------+
  // |key| value|key| value|
  // +---+------+---+------+
  // |  2| val_2|  2| val_2|
  // |  2| val_2|  2| val_2|
  // |  4| val_4|  4| val_4|
  // ...
  // $example off:spark_hive$

  spark.stop();
}
 
Example 20
Source File: StructuredStreamingExample.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws StreamingQueryException {
	System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
	SparkSession sparkSession = SparkSession.builder().master("local[*]").appName("structured Streaming Example")
			.config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate();

	Dataset<Row> inStream = sparkSession.readStream().format("socket").option("host", "10.204.136.223")
			.option("port", 9999).load();

	Dataset<FlightDetails> dsFlightDetails = inStream.as(Encoders.STRING()).map(x -> {
		ObjectMapper mapper = new ObjectMapper();
		return mapper.readValue(x, FlightDetails.class);

	}, Encoders.bean(FlightDetails.class));
	
	
	dsFlightDetails.createOrReplaceTempView("flight_details");
	
	Dataset<Row> avgFlightDetails = sparkSession.sql("select flightId, avg(temperature) from flight_details group by flightId");

	StreamingQuery query = avgFlightDetails.writeStream()
			  .outputMode("complete")
			  .format("console")
			  .start();

			query.awaitTermination();
	

}