Java Code Examples for org.apache.spark.sql.Dataset#registerTempTable()

The following examples show how to use org.apache.spark.sql.Dataset#registerTempTable(). Each example is drawn from an open-source project; the source file, project, and license are listed above the code.
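Note that registerTempTable() has been deprecated since Spark 2.0 in favor of createOrReplaceTempView(), which behaves identically for the queries shown below. Before the project examples, here is a minimal, self-contained sketch of the pattern they all share; the input path and column names are hypothetical:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class RegisterTempTableSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("RegisterTempTableSketch")
        .master("local[*]")
        .getOrCreate();

    // Hypothetical input; any JSON file with "name" and "age" columns works.
    Dataset<Row> people = spark.read().json("/tmp/people.json");

    // Register the Dataset under a name that SQL queries can reference.
    people.registerTempTable("people");
    // On Spark 2.x and later, prefer: people.createOrReplaceTempView("people");

    Dataset<Row> adults = spark.sql("SELECT name, age FROM people WHERE age >= 18");
    adults.show();

    spark.stop();
  }
}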
Example 1
Source File: SqlQueryBasedTransformer.java    From hudi with Apache License 2.0
@Override
public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession, Dataset<Row> rowDataset,
    TypedProperties properties) {
  String transformerSQL = properties.getString(Config.TRANSFORMER_SQL);
  if (null == transformerSQL) {
    throw new IllegalArgumentException("Missing configuration : (" + Config.TRANSFORMER_SQL + ")");
  }

  // temp table names can't contain dashes, so replace them in the UUID suffix
  String tmpTable = TMP_TABLE.concat(UUID.randomUUID().toString().replace("-", "_"));
  LOG.info("Registering tmp table : " + tmpTable);
  rowDataset.registerTempTable(tmpTable);
  String sqlStr = transformerSQL.replaceAll(SRC_PATTERN, tmpTable);
  LOG.info("SQL Query for transformation : (" + sqlStr + ")");
  return sparkSession.sql(sqlStr);
}
 
Example 2
Source File: TestWebServiceGet.java    From quetzal with Eclipse Public License 2.0
public static void main( String[] args ) {
    // Alternative masters used during development:
    // SparkConf conf = new SparkConf().setAppName("App-mt").setMaster("local[2]");
    // SparkConf conf = new SparkConf().setAppName("App-mt").setMaster("spark://Kavithas-MBP.home:7077");
    SparkConf conf = new SparkConf().setAppName("App-mt").setMaster("spark://kavithas-mbp.watson.ibm.com:7077");

    JavaSparkContext sc = new JavaSparkContext(conf);

    HiveContext sqlContext = new HiveContext(sc.sc());
    Dataset<Row> urls = sqlContext.read().json("/tmp/urls.json");

    urls.registerTempTable("urls");
    Dataset<Row> temp = sqlContext.sql("select * from urls");
    temp.show();

    // Register the UDTF shipped in quetzal.jar and call it from SQL
    sqlContext.sql("add jar /tmp/quetzal.jar");
    sqlContext.sql("create temporary function webservice as 'com.ibm.research.rdf.store.utilities.WebServiceGetUDTF'");
    Dataset<Row> drugs = sqlContext.sql("select webservice(\"drug,id,action\", \"url\", \"\", \"GET\", \"xs=http://www.w3.org/2001/XMLSchema\", \"//row\",\"drug\",\"./drug\","
            + " \"<string>\", \"id\", \"./id\",\"<string>\", \"action\", \"./action\", \"<string>\", url) as (drug, drug_typ, id, id_typ, action, action_typ) from urls");
    drugs.show();
    System.out.println("Num rows:" + drugs.count());
}
 
Example 3
Source File: AbstractJavaEsSparkSQLTest.java    From elasticsearch-hadoop with Apache License 2.0
@Test
public void testEsDataset2Read() throws Exception {
    String target = resource("sparksql-test-scala-basic-write", "data", version);

    // Dataset<Row> dataset = JavaEsSparkSQL.esDF(sqc, target);
    Dataset<Row> dataset = sqc.read().format("es").load(target);
    assertTrue(dataset.count() > 300);
    String schema = dataset.schema().treeString();
    System.out.println(schema);
    assertTrue(schema.contains("id: long"));
    assertTrue(schema.contains("name: string"));
    assertTrue(schema.contains("pictures: string"));
    assertTrue(schema.contains("time: long"));
    assertTrue(schema.contains("url: string"));

    // Dataset.take(5).foreach(println)

    dataset.registerTempTable("basicRead");
    Dataset<Row> nameRDD = sqc
            .sql("SELECT name FROM basicRead WHERE id >= 1 AND id <= 10");
    assertEquals(10, nameRDD.count());
}
 
Example 4
Source File: CaseWhenTest.java    From BigDataPlatform with GNU General Public License v3.0
public static void main(String[] args) {
	SparkConf conf = new SparkConf()
			.setMaster("local") 
			.setAppName("CaseWhenTest");
	JavaSparkContext sc = new JavaSparkContext(conf);
	SQLContext sqlContext = new SQLContext(sc.sc());
	
	List<Integer> grades = Arrays.asList(85, 90, 60, 73);
	JavaRDD<Integer> gradesRDD = sc.parallelize(grades);
	JavaRDD<Row> gradeRowsRDD = gradesRDD.map(new Function<Integer, Row>() {

		private static final long serialVersionUID = 1L;

		@Override
		public Row call(Integer grade) throws Exception {
			return RowFactory.create(grade);
		}
		
	});
	
	StructType schema = DataTypes.createStructType(Arrays.asList(
			DataTypes.createStructField("grade", DataTypes.IntegerType, true)));
	Dataset<Row> gradesDF = sqlContext.createDataFrame(gradeRowsRDD, schema);
	gradesDF.registerTempTable("grades");

	Dataset<Row>  gradeLevelDF = sqlContext.sql(
			"SELECT CASE "
				+ "WHEN grade>=90 THEN 'A' "
				+ "WHEN grade>=80 THEN 'B' "
				+ "WHEN grade>=70 THEN 'C' "
				+ "WHEN grade>=60 THEN 'D' "
				+ "ELSE 'E' "
				+ "END gradeLevel "
			+ "FROM grades");
	
	gradeLevelDF.show();
	
	sc.close(); 
}
 
Example 5
Source File: IfTest.java    From BigDataPlatform with GNU General Public License v3.0
public static void main(String[] args) {
	SparkConf conf = new SparkConf()
			.setMaster("local") 
			.setAppName("IfTest");
	JavaSparkContext sc = new JavaSparkContext(conf);
	SQLContext sqlContext = new SQLContext(sc.sc());
	
	List<Integer> grades = Arrays.asList(85, 90, 60, 73);
	JavaRDD<Integer> gradesRDD = sc.parallelize(grades);
	JavaRDD<Row> gradeRowsRDD = gradesRDD.map(new Function<Integer, Row>() {

		private static final long serialVersionUID = 1L;

		@Override
		public Row call(Integer grade) throws Exception {
			return RowFactory.create(grade);
		}
		
	});
	
	StructType schema = DataTypes.createStructType(Arrays.asList(
			DataTypes.createStructField("grade", DataTypes.IntegerType, true)));
	Dataset<Row> gradesDF = sqlContext.createDataFrame(gradeRowsRDD, schema);
	gradesDF.registerTempTable("grades");

	Dataset<Row> gradeLevelDF = sqlContext.sql(
			"SELECT IF(grade>=80,'GOOD','BAD') gradeLevel "  
			+ "FROM grades");
	
	gradeLevelDF.show();
	
	sc.close(); 
}
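Examples 4 and 5 route the conditional logic through a temp table and SQL. The same bucketing can also be expressed directly with the Column API, without registering a table; a minimal sketch (not part of the BigDataPlatform project), using the same sample grades:

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.when;

public class CaseWhenColumnApiSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local")
        .appName("CaseWhenColumnApiSketch")
        .getOrCreate();

    // Same sample data as Examples 4 and 5, as a single "grade" column.
    Dataset<Row> grades = spark
        .createDataset(Arrays.asList(85, 90, 60, 73), Encoders.INT())
        .toDF("grade");

    // CASE WHEN ... expressed with when()/otherwise() instead of a temp table plus SQL.
    grades.select(
        when(col("grade").geq(90), "A")
            .when(col("grade").geq(80), "B")
            .when(col("grade").geq(70), "C")
            .when(col("grade").geq(60), "D")
            .otherwise("E")
            .as("gradeLevel"))
        .show();

    spark.stop();
  }
}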
 
Example 6
Source File: MetroAnalysisJob.java    From hui-bigdata-spark with Apache License 2.0
/**
     * Core data-processing logic.
     * @param sparkContext the shared JavaSparkContext
     * @param inPutPath    input JSON path
     * @param outPutPath   output path for the query result
     */
    private void deal(JavaSparkContext sparkContext, String inPutPath, String outPutPath) {
        SparkJobUtil.checkFileExists(inPutPath);

        SQLContext sqlContext = new SQLContext(sparkContext);
//        sqlContext.setConf("spark.sql.parquet.binaryAsString","true");

        // Register the input snapshot as a temporary table
        Dataset<Row> dataset = sqlContext.read().json(inPutPath);
        dataset.registerTempTable("hui_metro_testjson");
        dataset.show(10);

        Dataset<Row> resultFrame = sqlContext.sql(SQL);

        if (resultFrame.count() > 0) {
            resultFrame.repartition(3).write()
                    .mode(SaveMode.Append).json(outPutPath);
        }

        resultFrame.show(10);

        // Write the result to the database
        MySQLJdbcConfig jdbcConfig = new MySQLJdbcConfig();
        jdbcConfig.init();
        resultFrame.write().mode("append")
                .jdbc(jdbcConfig.getUrl(), "hui_metro_test", jdbcConfig.getConnectionProperties());
    }
 
Example 7
Source File: FlatteningTransformer.java    From hudi with Apache License 2.0
/**
 * Flattens nested columns of the incoming dataset by selecting over the flattened schema.
 */
@Override
public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession, Dataset<Row> rowDataset,
    TypedProperties properties) {

  // temp table names can't contain dashes, so replace them in the UUID suffix
  String tmpTable = TMP_TABLE.concat(UUID.randomUUID().toString().replace("-", "_"));
  LOG.info("Registering tmp table : " + tmpTable);
  rowDataset.registerTempTable(tmpTable);
  return sparkSession.sql("select " + flattenSchema(rowDataset.schema(), null) + " from " + tmpTable);
}
 
Example 8
Source File: AreaTop3ProductSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
 * Generate the temporary table of basic click-product information.
 * @param sqlContext
 * @param cityid2clickActionRDD click actions keyed by city id
 * @param cityid2cityInfoRDD    city info keyed by city id
 */
private static void generateTempClickProductBasicTable(
		SQLContext sqlContext,
		JavaPairRDD<Long, Row> cityid2clickActionRDD,
		JavaPairRDD<Long, Row> cityid2cityInfoRDD) {
    // Perform a join to associate the click-action data with the city data
	JavaPairRDD<Long, Tuple2<Row, Row>> joinedRDD =
			cityid2clickActionRDD.join(cityid2cityInfoRDD);
	
    // Convert the JavaPairRDD above into a JavaRDD<Row> (only then can the RDD become a Dataset<Row>)
	JavaRDD<Row> mappedRDD = joinedRDD.map(
			
			new Function<Tuple2<Long,Tuple2<Row,Row>>, Row>() {

				private static final long serialVersionUID = 1L;

				@Override
				public Row call(Tuple2<Long, Tuple2<Row, Row>> tuple)
						throws Exception {
					Long cityid = tuple._1;
					Row clickAction = tuple._2._1;
					Row cityInfo = tuple._2._2;
					
					Long productid = clickAction.getLong(1);
					String cityName = cityInfo.getString(1);
					String area = cityInfo.getString(2);
					
					return RowFactory.create(cityid, cityName, area, productid);  
				}
				
			});
	
    // Once in JavaRDD<Row> form, the data can be converted into a Dataset<Row>
	List<StructField> structFields = new ArrayList<StructField>();
	structFields.add(DataTypes.createStructField("city_id", DataTypes.LongType, true));
	structFields.add(DataTypes.createStructField("city_name", DataTypes.StringType, true));
	structFields.add(DataTypes.createStructField("area", DataTypes.StringType, true));
	structFields.add(DataTypes.createStructField("product_id", DataTypes.LongType, true));
	
    // e.g. city rows: 1 Beijing, 2 Shanghai, 1 Beijing
    // after "group by area,product_id" the city list becomes "1:Beijing,2:Shanghai"

    // Two custom functions are used downstream:
    // UDF:  concat2() - concatenates two fields with a given separator
    // UDAF: group_concat_distinct() - concatenates the values within a group with commas, removing duplicates
	
	StructType schema = DataTypes.createStructType(structFields);

	Dataset<Row> df = sqlContext.createDataFrame(mappedRDD, schema);
	System.out.println("tmp_click_product_basic: " + df.count());  
	
    // Register the data in the Dataset<Row> as a temporary table (tmp_click_product_basic)
	df.registerTempTable("tmp_click_product_basic");  
}
 
Example 9
Source File: AreaTop3ProductSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
     * Generate the temporary table of click counts per area and product.
     * @param sqlContext
     */
	private static void generateTempAreaPrdocutClickCountTable(
			SQLContext sqlContext) {
        // Group by the two fields area and product_id
        // to compute the click count of each product in each area,
        // and obtain the concatenated city-info string for each product_id within each area.
		String sql = 
				"SELECT "
					+ "area,"
					+ "product_id,"
					+ "count(*) click_count, "  
					+ "group_concat_distinct(concat_Long_string(city_id,city_name,':')) city_infos "
				+ "FROM tmp_click_product_basic "
				+ "GROUP BY area,product_id ";
		
        /**
         * Alternative: double group by using a random prefix (kept below as commented-out SQL).
         */
		
//		String _sql = 
//				"SELECT "
//					+ "product_id_area,"
//					+ "count(click_count) click_count,"
//					+ "group_concat_distinct(city_infos) city_infos "
//				+ "FROM ( "
//					+ "SELECT "
//						+ "remove_random_prefix(product_id_area) product_id_area,"
//						+ "click_count,"
//						+ "city_infos "
//					+ "FROM ( "
//						+ "SELECT "
//							+ "product_id_area,"
//							+ "count(*) click_count,"
//							+ "group_concat_distinct(concat_Long_string(city_id,city_name,':')) city_infos "
//						+ "FROM ( "
//							+ "SELECT "  
//								+ "random_prefix(concat_Long_string(product_id,area,':'), 10) product_id_area,"
//								+ "city_id,"
//								+ "city_name "
//							+ "FROM tmp_click_product_basic "
//						+ ") t1 "
//						+ "GROUP BY product_id_area "
//					+ ") t2 "  
//				+ ") t3 "
//				+ "GROUP BY product_id_area ";  
		
        // Execute this SQL statement with Spark SQL
		Dataset<Row> df = sqlContext.sql(sql);
		
		System.out.println("tmp_area_product_click_count: " + df.count());  
		
        // Register the query result as a temporary table again:
        // the click count of each product in each area (plus the concatenated city list)
		df.registerTempTable("tmp_area_product_click_count");    
	}
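The query in Example 9 (reading the table built in Example 8) depends on two user-defined functions that must be registered on the SQLContext beforehand: a plain UDF (concat_Long_string) and a UDAF (group_concat_distinct) defined elsewhere in that project and not shown here. A minimal sketch of what the registration might look like; the lambda body is an assumption inferred from how the query uses the function, not the project's actual implementation:

import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.api.java.UDF3;
import org.apache.spark.sql.types.DataTypes;

public class UdfRegistrationSketch {

  public static void registerFunctions(SQLContext sqlContext) {
    // concat_Long_string(city_id, city_name, ':') -> "1:Beijing"
    // (assumed behavior, inferred from the comments and the query above)
    sqlContext.udf().register("concat_Long_string",
        (UDF3<Long, String, String, String>) (id, name, split) -> id + split + name,
        DataTypes.StringType);

    // group_concat_distinct is a UDAF; registering one looks like the line below,
    // where GroupConcatDistinctUDAF extends UserDefinedAggregateFunction
    // (project-specific class, not shown here):
    // sqlContext.udf().register("group_concat_distinct", new GroupConcatDistinctUDAF());
  }
}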
 
Example 10
Source File: AreaTop3ProductSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
     * Generate the temporary table of per-area product click counts, enriched with full product information.
     * @param sqlContext
     */
	private static void generateTempAreaFullProductClickCountTable(SQLContext sqlContext) {
        // Join the per-area/per-product click-count table (on product_id)
        // with the product info table to bring in product_name and product_status.
        // product_status needs special handling: 0 and 1 (self-operated vs. third-party product)
        // are stored inside a JSON string.
        // get_json_object() extracts the value of a named field from a JSON string;
        // if() maps product_status 0 to a self-operated product and 1 to a third-party product.
        // Resulting columns: area, product_id, click_count, city_infos, product_name, product_status

        // Why bother computing the product's business type?
        // When you find a region's top-3 hot products, whether each one is self-operated
        // or third-party is genuinely important information.

        // Technique highlighted: use of the built-in if() function
		
		String sql = 
				"SELECT "
					+ "tapcc.area,"
					+ "tapcc.product_id,"
					+ "tapcc.click_count,"
					+ "tapcc.city_infos,"
					+ "pi.product_name,"
					+ "if(get_json_object(pi.extend_info,'product_status')='0','Self','Third Party') product_status "
				+ "FROM tmp_area_product_click_count tapcc "
				+ "JOIN product_info pi ON tapcc.product_id=pi.product_id ";
		
//		JavaRDD<Row> rdd = sqlContext.sql("select * from product_info").javaRDD();
//		JavaRDD<Row> flattedRDD = rdd.flatMap(new FlatMapFunction<Row, Row>() {
//
//			private static final long serialVersionUID = 1L;
//
//			@Override
//			public Iterable<Row> call(Row row) throws Exception {
//				List<Row> list = new ArrayList<Row>();
//				
//				for(int i = 0; i < 10; i ++) {
//					Long productid = row.getLong(0);
//					String _productid = i + "_" + productid;
//					
//					Row _row = RowFactory.create(_productid, row.get(1), row.get(2));
//					list.add(_row);
//				}
//				
//				return list;
//			}
//			
//		});
//		
//		StructType _schema = DataTypes.createStructType(Arrays.asList(
//				DataTypes.createStructField("product_id", DataTypes.StringType, true),
//				DataTypes.createStructField("product_name", DataTypes.StringType, true),
//				DataTypes.createStructField("product_status", DataTypes.StringType, true)));
//		
//		Dataset<Row> _df = sqlContext.createDataset<Row>(flattedRDD, _schema);
//		_df.registerTempTable("tmp_product_info");  
//		
//		String _sql = 
//				"SELECT "
//					+ "tapcc.area,"
//					+ "remove_random_prefix(tapcc.product_id) product_id," 
//					+ "tapcc.click_count,"
//					+ "tapcc.city_infos,"
//					+ "pi.product_name,"
//					+ "if(get_json_object(pi.extend_info,'product_status')=0,'自营商品','第三方商品') product_status "
//				+ "FROM ("
//					+ "SELECT "
//						+ "area,"
//						+ "random_prefix(product_id, 10) product_id,"
//						+ "click_count,"
//						+ "city_infos "
//					+ "FROM tmp_area_product_click_count "
//				+ ") tapcc "
//				+ "JOIN tmp_product_info pi ON tapcc.product_id=pi.product_id ";
		
		Dataset<Row> df = sqlContext.sql(sql);
		
		System.out.println("tmp_area_fullprod_click_count: " + df.count());  
		
		df.registerTempTable("tmp_area_fullprod_click_count");   
	}
 
Example 11
Source File: SQLQueryFastq.java    From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SQLQueryFastq");

  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlContext = new SQLContext(sc);

  Options options = new Options();

  Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
  Option queryOpt = new Option( "query", true, "SQL query string." );
  Option samOpt = new Option( "format", true, "parquet or fastq" );
  Option baminOpt = new Option( "in", true, "" );
  options.addOption( new Option( "tablename", true, "Default sql table name is 'records'"));

  options.addOption( opOpt );
  options.addOption( queryOpt );
  options.addOption( samOpt );
  options.addOption( baminOpt );
  CommandLineParser parser = new BasicParser();
  CommandLine cmd = null;
  try {
    // parse the command line arguments
    cmd = parser.parse( options, args );

  }
  catch( ParseException exp ) {
    // parsing failed; report the reason and exit before cmd is dereferenced below
    System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
    System.exit(1);
  }

  String outDir = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
  String query = cmd.hasOption("query") ? cmd.getOptionValue("query") : null;
  String format = cmd.hasOption("format") ? cmd.getOptionValue("format") : "fastq";
  String in = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;
  tablename = cmd.hasOption("tablename") ? cmd.getOptionValue("tablename") : "records";

  sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);

  JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(in, FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());

  JavaRDD<MyRead> rdd = fastqRDD.map(record -> {
    MyRead read = new MyRead();
    read.setKey(record._1.toString());
    read.setSequence(record._2.getSequence().toString());
    read.setRead(record._2.getRead());
    read.setQuality(record._2.getQuality().toString());

    read.setTile(record._2.getTile());
    read.setXpos(record._2.getXpos());
    read.setYpos(record._2.getYpos());
    read.setRunNumber(record._2.getRunNumber());
    read.setInstrument(record._2.getInstrument());
    read.setFlowcellId(record._2.getFlowcellId());
    read.setLane(record._2.getLane());
    read.setControlNumber(record._2.getControlNumber());
    read.setFilterPassed(record._2.getFilterPassed());

    return read;
  });

  Dataset<Row> df = sqlContext.createDataFrame(rdd, MyRead.class);
  df.registerTempTable(tablename);
  // e.g. count duplicates: "SELECT count(DISTINCT(sequence)) FROM records"
  // or: "SELECT key,LEN(sequence) as l FROM records where l<100;"
  if(query!=null) {

    //JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag(), bam));
    //Save as parquet file
    Dataset<Row> resultDF = sqlContext.sql(query);
    resultDF.show(100, false);

    if(outDir!=null){
      if(format.equals("fastq")){
        JavaPairRDD<Text, SequencedFragment> resultRDD = dfToFastqRDD(resultDF);
        resultRDD.saveAsNewAPIHadoopFile(outDir, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());
      }
      else
        resultDF.write().parquet(outDir);
    }
  }
  sc.stop();

}
 
Example 12
Source File: SQLQueryBAM.java    From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SQLQueryBAM");

  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlContext = new HiveContext(sc.sc());

  Options options = new Options();
  Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
  Option queryOpt = new Option( "query", true, "SQL query string." );
  Option baminOpt = new Option( "in", true, "" );

  options.addOption( opOpt );
  options.addOption( queryOpt );
  options.addOption( baminOpt );
  CommandLineParser parser = new BasicParser();
  CommandLine cmd = null;
  try {
    cmd = parser.parse( options, args );

  }
  catch( ParseException exp ) {
    // parsing failed; report the reason and exit before cmd is dereferenced below
    System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
    System.exit(1);
  }

  String bwaOutDir = cmd.hasOption("out") ? cmd.getOptionValue("out") : null;
  String query = cmd.hasOption("query") ? cmd.getOptionValue("query") : null;
  String bamin = cmd.hasOption("in") ? cmd.getOptionValue("in") : null;

  sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);

  //Read BAM/SAM from HDFS
  JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
  //Map to SAMRecord RDD
  JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
  JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag()));

  Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class);
  samDF.registerTempTable(tablename);
  if(query!=null) {

    //Save as parquet file
    Dataset<Row> df2 = sqlContext.sql(query);
    df2.show(100,false);

    if(bwaOutDir!=null)
      df2.write().parquet(bwaOutDir);

  }else{
    if(bwaOutDir!=null)
      samDF.write().parquet(bwaOutDir);
  }

  sc.stop();

}
 
Example 13
Source File: RDD2DataFrameReflection.java    From SparkDemo with MIT License
public static void main(String[] args) {
	JavaSparkContext sc = SparkUtils.getLocalSparkContext(RDD2DataFrameReflection.class);

	SQLContext sqlContext = new SQLContext(sc);

	JavaRDD<String> lineRDD = sc.textFile(Constant.LOCAL_FILE_PREX +"/data/resources/people.txt");

	JavaRDD<Row> rowsRDD = lineRDD.map(new Function<String, Row>() {

		@Override
		public Row call(String line) throws Exception {
			String[] lineSplited = line.split(",");

			return RowFactory.create(lineSplited[0], Integer.valueOf(lineSplited[1]));
		}
	});

    // Build the schema (metadata) dynamically at runtime.
    // Useful when the columns are not known in advance and have to be loaded
    // from a database or a configuration file.
	List<StructField> fields = new ArrayList<StructField>();
	fields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
	fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));

	StructType schema = DataTypes.createStructType(fields);

    // Create a DataFrame from the row data and the schema, then register it as a temp table.
    // Since Spark 2.0, DataFrame and Dataset are unified into a single Dataset API with two flavors:
    // 1. untyped: Dataset[Row] is a collection of generic Row objects, aliased as DataFrame;
    // 2. strongly-typed: Dataset[T] is a collection of concrete objects, such as classes defined in Scala or Java.
	Dataset<Row> dataset = sqlContext.createDataFrame(rowsRDD, schema);
	dataset.registerTempTable("person");

	Dataset<Row> personDataSet = sqlContext.sql("select * from person");

	List<Row> list = personDataSet.javaRDD().collect();

    // print each record (one Row per line)
	for (Row r : list) {
		System.out.println(r);
	}

	sc.close();
}
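The comments above mention the Spark 2.0 unification of DataFrame and Dataset. For contrast, a minimal sketch of the strongly-typed side of that API, using a hypothetical Person bean (not part of the SparkDemo project):

import java.io.Serializable;
import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class TypedDatasetSketch {

  // Hypothetical bean; a public no-arg constructor and getters/setters are required by Encoders.bean().
  public static class Person implements Serializable {
    private String name;
    private int age;
    public Person() {}
    public Person(String name, int age) { this.name = name; this.age = age; }
    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public int getAge() { return age; }
    public void setAge(int age) { this.age = age; }
  }

  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local").appName("TypedDatasetSketch").getOrCreate();

    // Strongly-typed Dataset<Person> built directly from Java objects.
    Dataset<Person> people = spark.createDataset(
        Arrays.asList(new Person("Michael", 29), new Person("Andy", 30)),
        Encoders.bean(Person.class));

    // The untyped view of the same data is just Dataset<Row>.
    Dataset<Row> df = people.toDF();
    df.createOrReplaceTempView("person");
    spark.sql("select * from person where age > 29").show();

    spark.stop();
  }
}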
 
Example 14
Source File: HoodieJavaStreamingApp.java    From hudi with Apache License 2.0
/**
 * Adding data to the streaming source and showing results over time.
 * 
 * @param spark
 * @param fs
 * @param inputDF1
 * @param inputDF2
 * @throws Exception
 */
public void show(SparkSession spark, FileSystem fs, Dataset<Row> inputDF1, Dataset<Row> inputDF2) throws Exception {
  inputDF1.write().mode(SaveMode.Append).json(streamingSourcePath);
  // wait for spark streaming to process one microbatch
  Thread.sleep(3000);
  String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
  LOG.info("First commit at instant time :" + commitInstantTime1);

  inputDF2.write().mode(SaveMode.Append).json(streamingSourcePath);
  // wait for spark streaming to process one microbatch
  Thread.sleep(3000);
  String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
  LOG.info("Second commit at instant time :" + commitInstantTime2);

  /**
   * Read & do some queries
   */
  Dataset<Row> hoodieROViewDF = spark.read().format("org.apache.hudi")
      // pass any path glob, can include hoodie & non-hoodie
      // datasets
      .load(tablePath + "/*/*/*/*");
  hoodieROViewDF.registerTempTable("hoodie_ro");
  spark.sql("describe hoodie_ro").show();
  // all trips whose fare amount was greater than 2.
  spark.sql("select fare.amount, begin_lon, begin_lat, timestamp from hoodie_ro where fare.amount > 2.0").show();

  if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
    /**
     * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
     */
    Dataset<Row> hoodieIncViewDF = spark.read().format("org.apache.hudi")
        .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
        // Only changes in write 2 above
        .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1)
        // For incremental view, pass in the root/base path of dataset
        .load(tablePath);

    LOG.info("You will only see records from : " + commitInstantTime2);
    hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
  }
}