Java Code Examples for org.apache.spark.sql.Dataset#javaRDD()

The following examples show how to use org.apache.spark.sql.Dataset#javaRDD(). They are drawn from open-source projects; the source file, project, and license are noted above each example.
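Before the project examples, here is a minimal, self-contained sketch of the call itself. The session setup and the toy Dataset are illustrative assumptions, not part of any project below:

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class JavaRddSketch {
	public static void main(String[] args) {
		// Hypothetical local session; adjust appName/master for your environment
		SparkSession spark = SparkSession.builder()
				.appName("javaRDD-sketch")
				.master("local[*]")
				.getOrCreate();

		// Any Dataset<Row> will do; here we derive one from a numeric range
		Dataset<Row> df = spark.range(10).toDF("id");

		// javaRDD() exposes the Dataset's underlying RDD as a Java-friendly JavaRDD
		JavaRDD<Row> rows = df.javaRDD();

		long evenIds = rows.filter(row -> row.getLong(0) % 2 == 0).count();
		System.out.println("even ids: " + evenIds);

		spark.stop();
	}
}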
Example 1
Source File: AreaTop3ProductSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
 * Queries click-action data within the given date range
 * @param sqlContext SQLContext
 * @param startDate start date
 * @param endDate end date
 * @return click-action data
 */
private static JavaPairRDD<Long, Row> getcityid2ClickActionRDDByDate(
		SQLContext sqlContext, String startDate, String endDate) {
	// Query user visit-action data from user_visit_action
	// First filter: click_product_id must be non-null, which marks the row as a click action
	// Second filter: the row must fall within the user-specified date range
	
	String sql = 
			"SELECT "
				+ "city_id,"
				+ "click_product_id product_id "
			+ "FROM user_visit_action "
			+ "WHERE click_product_id IS NOT NULL "			
			+ "AND day>='" + startDate + "' "
			+ "AND day<='" + endDate + "'";
	
	Dataset<Row> clickActionDF = sqlContext.sql(sql);

	JavaRDD<Row> clickActionRDD = clickActionDF.javaRDD();

	JavaPairRDD<Long, Row> cityid2clickActionRDD = clickActionRDD.mapToPair(
			
			new PairFunction<Row, Long, Row>() {

				private static final long serialVersionUID = 1L;

				@Override
				public Tuple2<Long, Row> call(Row row) throws Exception {
					Long cityid = row.getLong(0);
					return new Tuple2<Long, Row>(cityid, row);
				}
				
			});
	
	return cityid2clickActionRDD;
}
 
Example 2
Source File: UserVisitSessionAnalyzeSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
	 * Gets user visit-action data within the given date range
	 * @param sqlContext SQLContext
	 * @param taskParam task parameters
	 * @return action-data RDD
	 */
	private static JavaRDD<Row> getActionRDDByDateRange(
			SQLContext sqlContext, JSONObject taskParam) {
		String startDate = ParamUtils.getParam(taskParam, Constants.PARAM_START_DATE);
		String endDate = ParamUtils.getParam(taskParam, Constants.PARAM_END_DATE);

		String sql =
				"select * "
						+ "from user_visit_action "
						+ "where date>='" + startDate + "' "
						+ "and date<='" + endDate + "'";
//				+ "and session_id not in('','','')"

		Dataset<Row> actionDF = sqlContext.sql(sql);

		/**
		 * This is exactly where the problem described above can occur:
		 * Spark SQL might, say, give the first stage only 20 tasks by default,
		 * even though your data volume and the complexity of the algorithm
		 * actually call for 1000 tasks running in parallel.
		 *
		 * So this is the place to repartition the RDD that Spark SQL has just produced.
		 */

//		return actionDF.javaRDD().repartition(1000);

		return actionDF.javaRDD();
	}
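A note on the repartition hint in the comments: spark.sql.shuffle.partitions governs the parallelism of Spark SQL's shuffle stages, while the scan stage behind the query above gets its partition count from the input source itself, which is why the method reaches for an explicit repartition on the resulting RDD. A sketch of both knobs, reusing the names from the method above (the value 1000 simply mirrors the figure in the comment, not a recommendation):

		// Raise shuffle-stage parallelism for subsequent Spark SQL queries
		sqlContext.setConf("spark.sql.shuffle.partitions", "1000");

		// Or repartition the scanned data explicitly, as the commented-out line does
		JavaRDD<Row> widened = actionDF.javaRDD().repartition(1000);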
 
Example 3
Source File: SparkUtils.java    From BigDataPlatform with GNU General Public License v3.0
/**
	 * Gets the user action-data RDD within the given date range
	 * @param sqlContext SQLContext
	 * @param taskParam task parameters
	 * @return action-data RDD
	 */
	public static JavaRDD<Row> getActionRDDByDateRange(
			SQLContext sqlContext, JSONObject taskParam) {
		String startDate = ParamUtils.getParam(taskParam, Constants.PARAM_START_DATE);
		String endDate = ParamUtils.getParam(taskParam, Constants.PARAM_END_DATE);
		
		String sql = 
				"select * "
				+ "from user_visit_action "
				+ "where day >='" + startDate + "' "
				+ "and day <='" + endDate + "'";
//				+ "and session_id not in('','','')"
		
		Dataset<Row> actionDF = sqlContext.sql(sql);
		
		/**
		 * This is exactly where the problem described above can occur:
		 * Spark SQL might, say, give the first stage only 20 tasks by default,
		 * even though your data volume and the complexity of the algorithm
		 * actually call for 1000 tasks running in parallel.
		 *
		 * So this is the place to repartition the RDD that Spark SQL has just produced.
		 */
		
//		return actionDF.javaRDD().repartition(1000);
		
		return actionDF.javaRDD();
	}
 
Example 4
Source File: ValueSets.java    From bunsen with Apache License 2.0
/**
 * Returns a new ValueSets instance that includes the given value sets.
 *
 * @param valueSets the value sets to add to the returned collection.
 * @return a new ValueSets instance with the added value sets.
 */
@Override
public ValueSets withValueSets(Dataset<Row> valueSets) {

  Dataset<UrlAndVersion> newMembers = getUrlAndVersions(valueSets);

  // Ensure that there are no duplicates among the value sets
  if (hasDuplicateUrlAndVersions(newMembers) || valueSets.count() != newMembers.count()) {

    throw new IllegalArgumentException(
        "Cannot add value sets having duplicate valueSetUri and valueSetVersion");
  }

  JavaRDD<Row> valueSetsRdd = valueSets.javaRDD();

  // The value set concepts will be stored in the values table for persistence, so we remove
  // them from the individual value sets. This can be done most easily by setting concepts to an
  // empty list.
  JavaRDD<Row> withoutConceptsRdd = valueSetsRdd.map(new RemoveConcepts(fhirVersion));

  Dataset<Row> withoutConcepts = spark.createDataFrame(withoutConceptsRdd,
      valueSetRowConverter.getSchema());

  JavaRDD<Value> newValuesRdd = valueSetsRdd.flatMap(new ExtractValues(fhirVersion));

  Dataset<Value> newValues = spark.createDataset(newValuesRdd.rdd(), getValueEncoder());

  return withValueSets(withoutConcepts, newValues);
}
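The shape of this method is a generic round trip: Dataset down to JavaRDD, a row-level transformation, then back to a Dataset. A stripped-down sketch of the same pattern, using an identity map as a stand-in for project-specific logic like RemoveConcepts, and reusing the input's own schema instead of Bunsen's valueSetRowConverter ("spark" is a SparkSession and "input" any Dataset<Row>; both are assumed here):

	JavaRDD<Row> inputRdd = input.javaRDD();

	// Stand-in for a real row transformation such as RemoveConcepts above
	JavaRDD<Row> transformedRdd = inputRdd.map(row -> row);

	// Rebuild a Dataset from the transformed rows, reusing the original schema
	Dataset<Row> rebuilt = spark.createDataFrame(transformedRdd, input.schema());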
 
Example 5
Source File: AreaTop3ProductSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
 * Uses Spark SQL to query city information from MySQL
 * @param sqlContext SQLContext
 * @return city-info pair RDD keyed by city id
 */
private static JavaPairRDD<Long, Row> getcityid2CityInfoRDD(SQLContext sqlContext) {
	// Build the MySQL connection settings (read directly from the configuration file)
	String url = null;
	String user = null;
	String password = null;
	boolean local = ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
	
	if(local) {
		url = ConfigurationManager.getProperty(Constants.JDBC_URL);
		user = ConfigurationManager.getProperty(Constants.JDBC_USER);
		password = ConfigurationManager.getProperty(Constants.JDBC_PASSWORD);
	} else {
		url = ConfigurationManager.getProperty(Constants.JDBC_URL_PROD);
		user = ConfigurationManager.getProperty(Constants.JDBC_USER_PROD);
		password = ConfigurationManager.getProperty(Constants.JDBC_PASSWORD_PROD);
	}
	
	Map<String, String> options = new HashMap<String, String>();
	options.put("url", url);
	options.put("dbtable", "city_info");  
	options.put("user", user);  
	options.put("password", password);  
	
	// Query the data from MySQL through the SQLContext
	Dataset<Row> cityInfoDF = sqlContext.read().format("jdbc")
			.options(options).load();
	
	// Convert the DataFrame to an RDD
	JavaRDD<Row> cityInfoRDD = cityInfoDF.javaRDD();

	JavaPairRDD<Long, Row> cityid2cityInfoRDD = cityInfoRDD.mapToPair(
		
			new PairFunction<Row, Long, Row>() {

				private static final long serialVersionUID = 1L;

				@Override
				public Tuple2<Long, Row> call(Row row) throws Exception {
					Long cityid = Long.valueOf(String.valueOf(row.get(0)));
					return new Tuple2<Long, Row>(cityid, row);
				}
				
			});
	
	return cityid2cityInfoRDD;
}
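Both this method and Example 1's getcityid2ClickActionRDDByDate key their rows by city id, which sets up a join. A sketch of how the two pair RDDs would presumably be combined in the original project (variable names as in the examples above):

	// Join click actions with city info on city_id;
	// each value becomes a (clickActionRow, cityInfoRow) tuple
	JavaPairRDD<Long, Tuple2<Row, Row>> clickActionJoinedRDD =
			cityid2clickActionRDD.join(cityid2cityInfoRDD);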
 
Example 6
Source File: AreaTop3ProductSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
 * Gets the top-3 hot products for each area
 * @param sqlContext SQLContext
 * @return top-3 products per area, one Row each
 */
private static JavaRDD<Row> getAreaTop3ProductRDD(SQLContext sqlContext) {
	// Technique: window functions
	
	// First use a window function in a subquery:
	// partition the rows by area, sort each partition by click count in descending
	// order, and assign each row a number within its partition.
	// Then, in the outer query, keep only the rows numbered 1 through 3 in each
	// partition; those are the top-3 hot products for each area.
	
	// Regions: North China, East China, South China, Central China, Northwest, Southwest, Northeast
	// A level: North China, East China
	// B level: South China, Central China
	// C level: Northwest, Southwest
	// D level: Northeast
	
	// case when
	// maps different conditions to different values:
	// case when ... then ... when ... then ... else ... end
	
	String sql = 
			"SELECT "
				+ "area,"
				+ "CASE "
					+ "WHEN area='China North' OR area='China East' THEN 'A Level' "
					+ "WHEN area='China South' OR area='China Middle' THEN 'B Level' "
					+ "WHEN area='West North' OR area='West South' THEN 'C Level' "
					+ "ELSE 'D Level' "
				+ "END area_level,"
				+ "product_id,"
				+ "click_count,"
				+ "city_infos,"
				+ "product_name,"
				+ "product_status "
			+ "FROM ("
				+ "SELECT "
					+ "area,"
					+ "product_id,"
					+ "click_count,"
					+ "city_infos,"
					+ "product_name,"
					+ "product_status,"
					+ "row_number() OVER (PARTITION BY area ORDER BY click_count DESC) rank "
				+ "FROM tmp_area_fullprod_click_count "
			+ ") t "
			+ "WHERE rank<=3";
	
	Dataset<Row> df = sqlContext.sql(sql);
	
	return df.javaRDD();
}
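For comparison, a hedged sketch of the row_number ranking at the core of the query above, written against the DataFrame API rather than SQL. It assumes the same registered tmp_area_fullprod_click_count table and the static imports shown:

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.row_number;

import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;

// Rank products within each area by click count, then keep the top 3 per area
WindowSpec byArea = Window.partitionBy("area").orderBy(col("click_count").desc());

Dataset<Row> top3 = sqlContext.table("tmp_area_fullprod_click_count")
		.withColumn("rank", row_number().over(byArea))
		.filter(col("rank").leq(3));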