Java Code Examples for org.apache.spark.sql.SQLContext#sql()

The following examples show how to use org.apache.spark.sql.SQLContext#sql(). Each example comes from an open-source project; the source file, project, and license are noted above the code.
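All of the examples share the same basic pattern: create an SQLContext, register a DataFrame as a temporary table, and query it with sql(). The minimal sketch below (not taken from any of the projects listed; the class name and data are illustrative) shows that pattern end to end:

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class SqlContextSqlSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("SqlContextSqlSketch");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        // Build a tiny DataFrame of (name, age) rows with an explicit schema.
        StructType schema = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("name", DataTypes.StringType, true),
                DataTypes.createStructField("age", DataTypes.IntegerType, true)));
        Dataset<Row> people = sqlContext.createDataFrame(
                sc.parallelize(Arrays.asList(
                        RowFactory.create("Alice", 30),
                        RowFactory.create("Bob", 17))),
                schema);

        // Register the DataFrame as a temporary table and query it with SQLContext#sql().
        people.registerTempTable("people");
        Dataset<Row> adults = sqlContext.sql("SELECT name, age FROM people WHERE age >= 18");
        adults.show();

        sc.close();
    }
}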
Example 1
Source File: MultiExpressionScript.java    From HiveQLUnit with Apache License 2.0
/**
 * Splits the bundled HQL script into multiple expressions using the ScriptSplitter utility class.
 * Each expression is run on the provided SQLContext.
 *
 * @param sqlContext an SQLContext, as provided by Spark through the TestHiveServer TestRule, used to run HQL expressions
 */
@Override
public void runScript(SQLContext sqlContext) {
    String[] expressions = ScriptSplitter.splitScriptIntoExpressions(script);
    for (String expression : expressions) {
        sqlContext.sql(expression);
    }
}
 
Example 2
Source File: TestSparkTableUtil.java    From iceberg with Apache License 2.0
@Before
public void before() {

  // Create a hive table.
  SQLContext sc = new SQLContext(TestSparkTableUtil.spark);

  sc.sql(String.format(
                  "CREATE TABLE %s (\n" +
                  "    id int COMMENT 'unique id'\n" +
                  ")\n" +
                  " PARTITIONED BY (data string)\n" +
                  " LOCATION '%s'", qualifiedTableName, tableLocationStr)
  );

  List<SimpleRecord> expected = Lists.newArrayList(
          new SimpleRecord(1, "a"),
          new SimpleRecord(2, "b"),
          new SimpleRecord(3, "c")
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id", "data").orderBy("data").write()
          .mode("append")
          .insertInto(qualifiedTableName);
}
 
Example 3
Source File: AreaTop3ProductSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
 * Query click action data within the specified date range.
 * @param sqlContext SQLContext
 * @param startDate start date
 * @param endDate end date
 * @return click action data
 */
private static JavaPairRDD<Long, Row> getcityid2ClickActionRDDByDate(
		SQLContext sqlContext, String startDate, String endDate) {
	// Query user visit action data from user_visit_action
	// First filter: click_product_id must be non-null, which identifies click actions
	// Second filter: only data within the user-specified date range
	
	String sql = 
			"SELECT "
				+ "city_id,"
				+ "click_product_id product_id "
			+ "FROM user_visit_action "
			+ "WHERE click_product_id IS NOT NULL "			
			+ "AND day>='" + startDate + "' "
			+ "AND day<='" + endDate + "'";
	
	Dataset<Row> clickActionDF = sqlContext.sql(sql);

	JavaRDD<Row> clickActionRDD = clickActionDF.javaRDD();

	JavaPairRDD<Long, Row> cityid2clickActionRDD = clickActionRDD.mapToPair(
			
			new PairFunction<Row, Long, Row>() {

				private static final long serialVersionUID = 1L;

				@Override
				public Tuple2<Long, Row> call(Row row) throws Exception {
					Long cityid = row.getLong(0);
					return new Tuple2<Long, Row>(cityid, row);
				}
				
			});
	
	return cityid2clickActionRDD;
}
 
Example 4
Source File: MultiExpressionScript.java    From HiveQLUnit with Apache License 2.0
/**
 * Splits the bundled HQL script into multiple expressions using the ScriptSplitter utility class.
 * Each expression is run on the provided SQLContext.
 *
 * @param sqlContext an SQLContext, as provided by Spark through the TestHiveServer TestRule, used to run HQL expressions
 * @return the row results acquired from the last executed expression
 */
@Override
public List<Row> runScriptReturnResults(SQLContext sqlContext) {
    String[] expressions = ScriptSplitter.splitScriptIntoExpressions(script);
    for (int i = 0; i < expressions.length - 1; i++) {
        String expression = expressions[i];
        sqlContext.sql(expression);
    }

    List<Row> rows = sqlContext.sql(expressions[expressions.length - 1]).collectAsList();
    return rows;
}
 
Example 5
Source File: UserVisitSessionAnalyzeSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
	 * Get user visit action data within the specified date range
	 * @param sqlContext SQLContext
	 * @param taskParam task parameters
	 * @return action data RDD
	 */
	private static JavaRDD<Row> getActionRDDByDateRange(
			SQLContext sqlContext, JSONObject taskParam) {
		String startDate = ParamUtils.getParam(taskParam, Constants.PARAM_START_DATE);
		String endDate = ParamUtils.getParam(taskParam, Constants.PARAM_END_DATE);

		String sql =
				"select * "
						+ "from user_visit_action "
						+ "where date>='" + startDate + "' "
						+ "and date<='" + endDate + "'";
//				+ "and session_id not in('','','')"

		Dataset<Row> actionDF = sqlContext.sql(sql);

		/**
		 * This is where the problem mentioned above can easily occur:
		 * Spark SQL by default sets only about 20 tasks for the first stage, but depending on your
		 * data volume and the complexity of your algorithm you may actually need 1000 tasks running in parallel.
		 *
		 * So at this point you can repartition the RDD that the Spark SQL query just produced.
		 */

//		return actionDF.javaRDD().repartition(1000);

		return actionDF.javaRDD();
	}
 
Example 6
Source File: SparkUtils.java    From BigDataPlatform with GNU General Public License v3.0
/**
	 * Get the user action data RDD within the specified date range
	 * @param sqlContext SQLContext
	 * @param taskParam task parameters
	 * @return action data RDD
	 */
	public static JavaRDD<Row> getActionRDDByDateRange(
			SQLContext sqlContext, JSONObject taskParam) {
		String startDate = ParamUtils.getParam(taskParam, Constants.PARAM_START_DATE);
		String endDate = ParamUtils.getParam(taskParam, Constants.PARAM_END_DATE);
		
		String sql = 
				"select * "
				+ "from user_visit_action "
				+ "where day >='" + startDate + "' "
				+ "and day <='" + endDate + "'";
//				+ "and session_id not in('','','')"
		
		Dataset<Row> actionDF = sqlContext.sql(sql);
		
		/**
		 * This is where the problem mentioned above can easily occur:
		 * Spark SQL by default sets only about 20 tasks for the first stage, but depending on your
		 * data volume and the complexity of your algorithm you may actually need 1000 tasks running in parallel.
		 *
		 * So at this point you can repartition the RDD that the Spark SQL query just produced.
		 */
		
//		return actionDF.javaRDD().repartition(1000);
		
		return actionDF.javaRDD();
	}
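The commented-out repartition(1000) above hard-codes the partition count. A small variation (a sketch only, not part of the original SparkUtils class; it assumes the same imports as the example above) takes the target parallelism from the caller and only repartitions when it is actually needed:

public static JavaRDD<Row> getActionRDDByDateRange(
		SQLContext sqlContext, String startDate, String endDate, int numPartitions) {
	Dataset<Row> actionDF = sqlContext.sql(
			"select * "
			+ "from user_visit_action "
			+ "where day >='" + startDate + "' "
			+ "and day <='" + endDate + "'");

	JavaRDD<Row> actionRDD = actionDF.javaRDD();
	// Only repartition when more parallelism is requested than Spark SQL produced.
	return numPartitions > actionRDD.getNumPartitions()
			? actionRDD.repartition(numPartitions)
			: actionRDD;
}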
 
Example 7
Source File: CaseWhenTest.java    From BigDataPlatform with GNU General Public License v3.0
public static void main(String[] args) {
	SparkConf conf = new SparkConf()
			.setMaster("local") 
			.setAppName("CaseWhenTest");
	JavaSparkContext sc = new JavaSparkContext(conf);
	SQLContext sqlContext = new SQLContext(sc.sc());
	
	List<Integer> grades = Arrays.asList(85, 90, 60, 73);
	JavaRDD<Integer> gradesRDD = sc.parallelize(grades);
	JavaRDD<Row> gradeRowsRDD = gradesRDD.map(new Function<Integer, Row>() {

		private static final long serialVersionUID = 1L;

		@Override
		public Row call(Integer grade) throws Exception {
			return RowFactory.create(grade);
		}
		
	});
	
	StructType schema = DataTypes.createStructType(Arrays.asList(
			DataTypes.createStructField("grade", DataTypes.IntegerType, true)));
	Dataset<Row> gradesDF = sqlContext.createDataFrame(gradeRowsRDD, schema);
	gradesDF.registerTempTable("grades");

	Dataset<Row>  gradeLevelDF = sqlContext.sql(
			"SELECT CASE "
				+ "WHEN grade>=90 THEN 'A' "
				+ "WHEN grade>=80 THEN 'B' "
				+ "WHEN grade>=70 THEN 'C' "
				+ "WHEN grade>=60 THEN 'D' "
				+ "ELSE 'E' "
				+ "END gradeLevel "
			+ "FROM grades");
	
	gradeLevelDF.show();
	
	sc.close(); 
}
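The same CASE WHEN logic can also be expressed with the DataFrame column API instead of SQL. A short sketch (assuming static imports of org.apache.spark.sql.functions.col and org.apache.spark.sql.functions.when; the method is illustrative, not part of the original class):

public static Dataset<Row> gradeLevels(Dataset<Row> gradesDF) {
	// Equivalent of the CASE WHEN ... END expression in the query above.
	return gradesDF.select(
			when(col("grade").geq(90), "A")
			.when(col("grade").geq(80), "B")
			.when(col("grade").geq(70), "C")
			.when(col("grade").geq(60), "D")
			.otherwise("E")
			.as("gradeLevel"));
}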
 
Example 8
Source File: IfTest.java    From BigDataPlatform with GNU General Public License v3.0
public static void main(String[] args) {
	SparkConf conf = new SparkConf()
			.setMaster("local") 
			.setAppName("IfTest");
	JavaSparkContext sc = new JavaSparkContext(conf);
	SQLContext sqlContext = new SQLContext(sc.sc());
	
	List<Integer> grades = Arrays.asList(85, 90, 60, 73);
	JavaRDD<Integer> gradesRDD = sc.parallelize(grades);
	JavaRDD<Row> gradeRowsRDD = gradesRDD.map(new Function<Integer, Row>() {

		private static final long serialVersionUID = 1L;

		@Override
		public Row call(Integer grade) throws Exception {
			return RowFactory.create(grade);
		}
		
	});
	
	StructType schema = DataTypes.createStructType(Arrays.asList(
			DataTypes.createStructField("grade", DataTypes.IntegerType, true)));
	Dataset<Row> gradesDF = sqlContext.createDataFrame(gradeRowsRDD, schema);
	gradesDF.registerTempTable("grades");

	Dataset<Row> gradeLevelDF = sqlContext.sql(
			"SELECT IF(grade>=80,'GOOD','BAD') gradeLevel "  
			+ "FROM grades");
	
	gradeLevelDF.show();
	
	sc.close(); 
}
 
Example 9
Source File: TestSparkTableUtil.java    From iceberg with Apache License 2.0
@After
public void after() throws IOException {
  // Drop the hive table.
  SQLContext sc = new SQLContext(TestSparkTableUtil.spark);
  sc.sql(String.format("DROP TABLE IF EXISTS %s", qualifiedTableName));

  // Delete the data corresponding to the table.
  tableLocationPath.getFileSystem(CONF).delete(tableLocationPath, true);
}
 
Example 10
Source File: MetroAnalysisJob.java    From hui-bigdata-spark with Apache License 2.0
/**
     * Data processing logic
     * @param sparkContext Spark context
     * @param inPutPath input path
     * @param outPutPath output path
     */
    private void deal(JavaSparkContext sparkContext, String inPutPath, String outPutPath) {
        SparkJobUtil.checkFileExists(inPutPath);

        SQLContext sqlContext = new SQLContext(sparkContext);
//        sqlContext.setConf("spark.sql.parquet.binaryAsString","true");

        // Create a temporary snapshot table
        Dataset<Row> dataset = sqlContext.read().json(inPutPath);
        dataset.registerTempTable("hui_metro_testjson");
        dataset.show(10);

        Dataset<Row> resultFrame = sqlContext.sql(SQL);

        if (resultFrame.count() > 0) {
            resultFrame.repartition(3).write()
                    .mode(SaveMode.Append).json(outPutPath);
        }

        resultFrame.show(10);

        // Write the results to the database
        MySQLJdbcConfig jdbcConfig = new MySQLJdbcConfig();
        jdbcConfig.init();
        resultFrame.write().mode("append")
                .jdbc(jdbcConfig.getUrl(), "hui_metro_test", jdbcConfig.getConnectionProperties());
    }
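MySQLJdbcConfig is a helper class specific to this project. The same JDBC write can be expressed with plain java.util.Properties; a minimal sketch (URL, credentials, and driver class are placeholders, not values from the original project):

Properties connectionProperties = new Properties();
connectionProperties.put("user", "spark");                  // placeholder
connectionProperties.put("password", "secret");             // placeholder
connectionProperties.put("driver", "com.mysql.jdbc.Driver");

resultFrame.write()
        .mode(SaveMode.Append)
        .jdbc("jdbc:mysql://localhost:3306/hui_metro", "hui_metro_test", connectionProperties);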
 
Example 11
Source File: UserVisitAnalyze.java    From UserActionAnalyzePlatform with Apache License 2.0
/**
 * Get data within the specified date range
 * @param sc SQLContext
 * @param taskParam task parameters
 * @return action data RDD
 */
private static JavaRDD<Row> getActionRDD(SQLContext sc, JSONObject taskParam)
{
    String startTime=ParamUtils.getParam(taskParam,Constants.PARAM_STARTTIME);
    String endTime=ParamUtils.getParam(taskParam,Constants.PARAM_ENDTIME);
    String sql="select * from user_visit_action where date>='"+startTime+"' and date<='"+endTime+"'";
    DataFrame df=sc.sql(sql);
    return df.javaRDD();
}
 
Example 12
Source File: SparkSqlInterpreter.java    From Explorer with Apache License 2.0
@Override
public InterpreterResult interpret(String st) {

    SQLContext sqlc = getSparkInterpreter().getSQLContext();
    SparkContext sc = sqlc.sparkContext();
    sc.setJobGroup(jobGroup, "Notebook", false);
    DataFrame dataFrame;
    Row[] rows = null;
    try {
        dataFrame = sqlc.sql(st);
        rows = dataFrame.take(maxResult + 1);
    } catch (Exception e) {
        logger.error("Error", e);
        sc.clearJobGroup();
        return new InterpreterResult(Code.ERROR, e.getMessage());
    }

    String msg = null;
    // get field names
    List<Attribute> columns = scala.collection.JavaConverters.asJavaListConverter(
            dataFrame.queryExecution().analyzed().output()).asJava();
    for (Attribute col : columns) {
        if (msg == null) {
            msg = col.name();
        } else {
            msg += "\t" + col.name();
        }
    }
    msg += "\n";

    // ArrayType, BinaryType, BooleanType, ByteType, DecimalType, DoubleType, DynamicType, FloatType, FractionalType, IntegerType, IntegralType, LongType, MapType, NativeType, NullType, NumericType, ShortType, StringType, StructType

    for (int r = 0; r < maxResult && r < rows.length; r++) {
        Row row = rows[r];

        for (int i = 0; i < columns.size(); i++) {
            if (!row.isNullAt(i)) {
                msg += row.apply(i).toString();
            } else {
                msg += "null";
            }
            if (i != columns.size() - 1) {
                msg += "\t";
            }
        }
        msg += "\n";
    }

    if (rows.length > maxResult) {
        msg += "\n<font color=red>Results are limited by " + maxResult + ".</font>";
    }
    InterpreterResult rett = new InterpreterResult(Code.SUCCESS, "%table " + msg);
    sc.clearJobGroup();
    return rett;
}
 
Example 13
Source File: RDD2DataFrameReflection.java    From SparkDemo with MIT License
public static void main(String[] args) {
	JavaSparkContext sc = SparkUtils.getLocalSparkContext(RDD2DataFrameReflection.class);

	SQLContext sqlContext = new SQLContext(sc);

	JavaRDD<String> lineRDD = sc.textFile(Constant.LOCAL_FILE_PREX +"/data/resources/people.txt");

	JavaRDD<Row> rowsRDD = lineRDD.map(new Function<String, Row>() {

		@Override
		public Row call(String line) throws Exception {
			String[] lineSplited = line.split(",");

			return RowFactory.create(lineSplited[0], Integer.valueOf(lineSplited[1]));
		}
	});

	// Dynamically construct the metadata (schema)
	// If the set of columns is not known in advance, it needs to be loaded from a database or a configuration file
	List<StructField> fields = new ArrayList<StructField>();
	fields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
	fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));

	StructType schema = DataTypes.createStructType(fields);

	// Create a temporary table from the row data and the schema
	// Since Spark 2.0, DataFrame and Dataset have been merged into a single, higher-level Dataset with two API flavours:
	// 1. Untyped: Dataset[Row] is a collection of generic Row objects, and DataFrame is its alias;
	// 2. Strongly typed: Dataset[T] is a collection of concrete objects, such as classes defined in Scala or Java
	Dataset<Row> dataset = sqlContext.createDataFrame(rowsRDD, schema);
	dataset.registerTempTable("person");

	Dataset<Row> personDataSet = sqlContext.sql("select * from person");

	List<Row> list = personDataSet.javaRDD().collect();

	// Print each row record
	for (Row r : list) {
		System.out.println(r);
	}

	sc.close();
}
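The comments above mention the strongly typed flavour of the Spark 2.x Dataset API. A short sketch of the same pipeline with a typed Dataset (Person is a hypothetical JavaBean with name and age properties; the snippet also assumes imports of MapFunction, Encoder, and Encoders, and is not part of the original example):

Encoder<Person> personEncoder = Encoders.bean(Person.class);

// Read the same comma-separated file and map each line to a Person bean (hypothetical class).
Dataset<Person> people = sqlContext.read()
		.textFile(Constant.LOCAL_FILE_PREX + "/data/resources/people.txt")
		.map((MapFunction<String, Person>) line -> {
			String[] parts = line.split(",");
			Person p = new Person();
			p.setName(parts[0]);
			p.setAge(Integer.valueOf(parts[1].trim()));
			return p;
		}, personEncoder);

// The typed Dataset can still be queried with sql() after registering a view.
people.createOrReplaceTempView("person_typed");
sqlContext.sql("select * from person_typed").show();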
 
Example 14
Source File: SQLQueryBAM.java    From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SQLQueryBAM");

  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlContext = new HiveContext(sc.sc());

  Options options = new Options();
  Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
  Option queryOpt = new Option( "query", true, "SQL query string." );
  Option baminOpt = new Option( "in", true, "" );

  options.addOption( opOpt );
  options.addOption( queryOpt );
  options.addOption( baminOpt );
  CommandLineParser parser = new BasicParser();
  CommandLine cmd = null;
  try {
    cmd = parser.parse( options, args );

  }
  catch( ParseException exp ) {
    System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
  }

  String bwaOutDir = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
  String query = (cmd.hasOption("query")==true)? cmd.getOptionValue("query"):null;
  String bamin = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;

  sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);

  //Read BAM/SAM from HDFS
  JavaPairRDD<LongWritable, SAMRecordWritable> bamPairRDD = sc.newAPIHadoopFile(bamin, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, sc.hadoopConfiguration());
  //Map to SAMRecord RDD
  JavaRDD<SAMRecord> samRDD = bamPairRDD.map(v1 -> v1._2().get());
  JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag()));

  Dataset<Row> samDF = sqlContext.createDataFrame(rdd, MyAlignment.class);
  samDF.registerTempTable(tablename);
  if(query!=null) {

    //Save as parquet file
    Dataset df2 = sqlContext.sql(query);
    df2.show(100,false);

    if(bwaOutDir!=null)
      df2.write().parquet(bwaOutDir);

  }else{
    if(bwaOutDir!=null)
      samDF.write().parquet(bwaOutDir);
  }

  sc.stop();

}
 
Example 15
Source File: SQLQueryFastq.java    From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
  SparkConf conf = new SparkConf().setAppName("SQLQueryFastq");

  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlContext = new SQLContext(sc);

  Options options = new Options();

  Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
  Option queryOpt = new Option( "query", true, "SQL query string." );
  Option samOpt = new Option( "format", true, "parquet or fastq" );
  Option baminOpt = new Option( "in", true, "" );
  options.addOption( new Option( "tablename", true, "Default sql table name is 'records'"));

  options.addOption( opOpt );
  options.addOption( queryOpt );
  options.addOption( samOpt );
  options.addOption( baminOpt );
  CommandLineParser parser = new BasicParser();
  CommandLine cmd = null;
  try {
    // parse the command line arguments
    cmd = parser.parse( options, args );

  }
  catch( ParseException exp ) {
    // oops, something went wrong
    System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
  }

  String outDir = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
  String query = (cmd.hasOption("query")==true)? cmd.getOptionValue("query"):null;
  String format = (cmd.hasOption("format")==true)? cmd.getOptionValue("format"):"fastq";
  String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;
  tablename = (cmd.hasOption("tablename")==true)? cmd.getOptionValue("tablename"):"records";

  sc.hadoopConfiguration().setBoolean(BAMInputFormat.KEEP_PAIRED_READS_TOGETHER_PROPERTY, true);

  JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(in, FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());

  JavaRDD<MyRead> rdd = fastqRDD.map(record -> {
    MyRead read = new MyRead();
    read.setKey(record._1.toString());
    read.setSequence(record._2.getSequence().toString());
    read.setRead(record._2.getRead());
    read.setQuality(record._2.getQuality().toString());

    read.setTile(record._2.getTile());
    read.setXpos(record._2.getXpos());
    read.setYpos(record._2.getYpos());
    read.setRunNumber(record._2.getRunNumber());
    read.setInstrument(record._2.getInstrument());
    read.setFlowcellId(record._2.getFlowcellId());
    read.setLane(record._2.getLane());
    read.setControlNumber(record._2.getControlNumber());
    read.setFilterPassed(record._2.getFilterPassed());

    return read;
  });

  Dataset df = sqlContext.createDataFrame(rdd, MyRead.class);
  df.registerTempTable(tablename);
  //e.g. count duplicates: "SELECT count(DISTINCT(sequence)) FROM records"
  //"SELECT key,LEN(sequence) as l FROM records where l<100;"
  if(query!=null) {

    //JavaRDD<MyAlignment> rdd = samRDD.map(bam -> new MyAlignment(bam.getReadName(), bam.getStart(), bam.getReferenceName(), bam.getReadLength(), new String(bam.getReadBases(), StandardCharsets.UTF_8), bam.getCigarString(), bam.getReadUnmappedFlag(), bam.getDuplicateReadFlag(), bam));
    //Save as parquet file
    Dataset<Row> resultDF = sqlContext.sql(query);
    resultDF.show(100, false);

    if(outDir!=null){
      if(format.equals("fastq")){
        JavaPairRDD<Text, SequencedFragment> resultRDD = dfToFastqRDD(resultDF);
        resultRDD.saveAsNewAPIHadoopFile(outDir, Text.class, SequencedFragment.class, FastqOutputFormat.class, sc.hadoopConfiguration());
      }
      else
        resultDF.write().parquet(outDir);
    }
  }
  sc.stop();

}
 
Example 16
Source File: AreaTop3ProductSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
 * Get the top 3 popular products for each area
 * @param sqlContext SQLContext
 * @return top 3 products per area as a row RDD
 */
private static JavaRDD<Row> getAreaTop3ProductRDD(SQLContext sqlContext) {
	// Technique: window functions
	
	// First run a subquery that uses a window function:
	// group by area, sort the rows within each group by click count in descending order, and assign a row number within the group.
	// Then, in the outer query, keep only the rows ranked in the top 3 within each group.
	// Those are the top 3 popular products for each area.
	
	// North China, East China, South China, Central China, Northwest, Southwest, Northeast
	// A level: North China, East China
	// B level: South China, Central China
	// C level: Northwest, Southwest
	// D level: Northeast
	
	// CASE WHEN
	// maps different conditions to different values
	// case when ... then ... when ... then ... else ... end
	
	String sql = 
			"SELECT "
				+ "area,"
				+ "CASE "
					+ "WHEN area='China North' OR area='China East' THEN 'A Level' "
					+ "WHEN area='China South' OR area='China Middle' THEN 'B Level' "
					+ "WHEN area='West North' OR area='West South' THEN 'C Level' "
					+ "ELSE 'D Level' "
				+ "END area_level,"
				+ "product_id,"
				+ "click_count,"
				+ "city_infos,"
				+ "product_name,"
				+ "product_status "
			+ "FROM ("
				+ "SELECT "
					+ "area,"
					+ "product_id,"
					+ "click_count,"
					+ "city_infos,"
					+ "product_name,"
					+ "product_status,"
					+ "row_number() OVER (PARTITION BY area ORDER BY click_count DESC) rank "
				+ "FROM tmp_area_fullprod_click_count "
			+ ") t "
			+ "WHERE rank<=3";
	
	Dataset<Row> df = sqlContext.sql(sql);
	
	return df.javaRDD();
}
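The same top-3-per-area ranking can be written with the DataFrame window-function API instead of SQL. A sketch without the area_level CASE mapping (it assumes imports of org.apache.spark.sql.expressions.Window and WindowSpec plus static imports of org.apache.spark.sql.functions.col and row_number; the method is illustrative, not part of the original class):

private static Dataset<Row> getAreaTop3WithWindowApi(SQLContext sqlContext) {
	Dataset<Row> clicks = sqlContext.table("tmp_area_fullprod_click_count");
	// Rank products within each area by click count, descending, then keep the top 3.
	WindowSpec byAreaClickDesc = Window.partitionBy("area").orderBy(col("click_count").desc());
	return clicks
			.withColumn("rank", row_number().over(byAreaClickDesc))
			.filter(col("rank").leq(3));
}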
 
Example 17
Source File: AreaTop3ProductSpark.java    From BigDataPlatform with GNU General Public License v3.0
/**
	 * Generate the temporary table of per-area product click counts (including full product information)
	 * @param sqlContext SQLContext
	 */
	private static void generateTempAreaFullProductClickCountTable(SQLContext sqlContext) {
		// Join the per-area, per-product click count table obtained earlier (keyed by product_id)
		// with the product info table to pull in product_name and product_status.
		// product_status needs special handling: 0 and 1 denote self-operated and third-party products, and the value is stored inside a JSON string.
		// The get_json_object() function extracts the value of a given field from a JSON string.
		// The if() function branches: if product_status is 0 the product is self-operated; if it is 1, it is third-party.
		// area, product_id, click_count, city_infos, product_name, product_status
		
		// Why go to the trouble of computing the product business type?
		// Once you have the top 3 popular products for an area, whether each product is self-operated or third-party
		// is actually an important piece of information.
		
		// Technique: use of the built-in if() function
		
		String sql = 
				"SELECT "
					+ "tapcc.area,"
					+ "tapcc.product_id,"
					+ "tapcc.click_count,"
					+ "tapcc.city_infos,"
					+ "pi.product_name,"
					+ "if(get_json_object(pi.extend_info,'product_status')='0','Self','Third Party') product_status "
				+ "FROM tmp_area_product_click_count tapcc "
				+ "JOIN product_info pi ON tapcc.product_id=pi.product_id ";
		
//		JavaRDD<Row> rdd = sqlContext.sql("select * from product_info").javaRDD();
//		JavaRDD<Row> flattedRDD = rdd.flatMap(new FlatMapFunction<Row, Row>() {
//
//			private static final long serialVersionUID = 1L;
//
//			@Override
//			public Iterable<Row> call(Row row) throws Exception {
//				List<Row> list = new ArrayList<Row>();
//				
//				for(int i = 0; i < 10; i ++) {
//					Long productid = row.getLong(0);
//					String _productid = i + "_" + productid;
//					
//					Row _row = RowFactory.create(_productid, row.get(1), row.get(2));
//					list.add(_row);
//				}
//				
//				return list;
//			}
//			
//		});
//		
//		StructType _schema = DataTypes.createStructType(Arrays.asList(
//				DataTypes.createStructField("product_id", DataTypes.StringType, true),
//				DataTypes.createStructField("product_name", DataTypes.StringType, true),
//				DataTypes.createStructField("product_status", DataTypes.StringType, true)));
//		
//		Dataset<Row> _df = sqlContext.createDataset<Row>(flattedRDD, _schema);
//		_df.registerTempTable("tmp_product_info");  
//		
//		String _sql = 
//				"SELECT "
//					+ "tapcc.area,"
//					+ "remove_random_prefix(tapcc.product_id) product_id," 
//					+ "tapcc.click_count,"
//					+ "tapcc.city_infos,"
//					+ "pi.product_name,"
//					+ "if(get_json_object(pi.extend_info,'product_status')=0,'自营商品','第三方商品') product_status "
//				+ "FROM ("
//					+ "SELECT "
//						+ "area,"
//						+ "random_prefix(product_id, 10) product_id,"
//						+ "click_count,"
//						+ "city_infos "
//					+ "FROM tmp_area_product_click_count "
//				+ ") tapcc "
//				+ "JOIN tmp_product_info pi ON tapcc.product_id=pi.product_id ";
		
		Dataset<Row> df = sqlContext.sql(sql);
		
		System.out.println("tmp_area_fullprod_click_count: " + df.count());  
		
		df.registerTempTable("tmp_area_fullprod_click_count");   
	}
 
Example 18
Source File: AreaTop3ProductSpark.java    From BigDataPlatform with GNU General Public License v3.0 4 votes vote down vote up
/**
	 * Generate the temporary table of per-area, per-product click counts
	 * @param sqlContext SQLContext
	 */
	private static void generateTempAreaPrdocutClickCountTable(
			SQLContext sqlContext) {
		// Group by the area and product_id fields
		// to compute the click count for each product in each area,
		// and also obtain, for each area/product_id, a string concatenating the city information.
		String sql = 
				"SELECT "
					+ "area,"
					+ "product_id,"
					+ "count(*) click_count, "  
					+ "group_concat_distinct(concat_Long_string(city_id,city_name,':')) city_infos "
				+ "FROM tmp_click_product_basic "
				+ "GROUP BY area,product_id ";
		
		/**
		 * Double group by
		 */
		
//		String _sql = 
//				"SELECT "
//					+ "product_id_area,"
//					+ "count(click_count) click_count,"
//					+ "group_concat_distinct(city_infos) city_infos "
//				+ "FROM ( "
//					+ "SELECT "
//						+ "remove_random_prefix(product_id_area) product_id_area,"
//						+ "click_count,"
//						+ "city_infos "
//					+ "FROM ( "
//						+ "SELECT "
//							+ "product_id_area,"
//							+ "count(*) click_count,"
//							+ "group_concat_distinct(concat_Long_string(city_id,city_name,':')) city_infos "
//						+ "FROM ( "
//							+ "SELECT "  
//								+ "random_prefix(concat_Long_string(product_id,area,':'), 10) product_id_area,"
//								+ "city_id,"
//								+ "city_name "
//							+ "FROM tmp_click_product_basic "
//						+ ") t1 "
//						+ "GROUP BY product_id_area "
//					+ ") t2 "  
//				+ ") t3 "
//				+ "GROUP BY product_id_area ";  
		
		// Execute this SQL statement with Spark SQL
		Dataset<Row> df = sqlContext.sql(sql);
		
		System.out.println("tmp_area_product_click_count: " + df.count());  
		
		// Register the query result as a temporary table again
		// (click counts per area and product, plus the extra city list)
		df.registerTempTable("tmp_area_product_click_count");    
	}
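The queries in this example and the previous one call custom functions such as concat_Long_string() and group_concat_distinct(), which are registered on the SQLContext elsewhere in the project. A hedged sketch of how such functions are typically registered (it assumes an import of org.apache.spark.sql.api.java.UDF3; the lambda body and the GroupConcatDistinctUDAF class name are illustrative, not the project's actual code):

private static void registerCustomFunctions(SQLContext sqlContext) {
	// concat_Long_string(city_id, city_name, ':') -> "cityId:cityName"
	sqlContext.udf().register("concat_Long_string",
			(UDF3<Long, String, String, String>) (id, name, split) -> id + split + name,
			DataTypes.StringType);

	// group_concat_distinct aggregates a column, so it would be implemented as a
	// UserDefinedAggregateFunction and registered the same way:
	// sqlContext.udf().register("group_concat_distinct", new GroupConcatDistinctUDAF());  // hypothetical class
}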
 
Example 19
Source File: SingleExpressionScript.java    From HiveQLUnit with Apache License 2.0
/**
 * Runs the HQL contained in the TextResource given to the constructor, treating it as a single
 * expression with no comments.
 *
 * @param sqlContext an SQLContext, as provided by Spark through the TestHiveServer TestRule, used to run HQL expressions
 */
@Override
public void runScript(SQLContext sqlContext) {
    sqlContext.sql(expression);
}