import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.function.Function; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import java.util.ArrayList; import java.util.List; /** * Created by 張燿峰 * 代码显示构造schema * * @author 孤 * @date 2019/3/29 * @Varsion 1.0 */ public class CustomDataFrame { public static void main(String[] args) { SparkSession sparkSession = SparkSession.builder() .master("local") .appName("spark app") .getOrCreate(); //创建普通的JavaRDD JavaRDD<String> javaRDD = sparkSession.sparkContext().textFile("URL", 1).toJavaRDD(); //字符串编码的模式 String schema = "name age"; //根据模式的字符串生成模式 List<StructField> structFieldList = new ArrayList<>(); for (String fieldName : schema.split(" ")) { StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true); structFieldList.add(structField); } StructType structType = DataTypes.createStructType(structFieldList); JavaRDD<Row> rowJavaRDD = javaRDD.map(new Function<String, Row>() { @Override public Row call(String v1) { String[] attirbutes = v1.split(","); return RowFactory.create(attirbutes[0], attirbutes[1].trim()); } }); //将模式应用于RDD Dataset<Row> dataset = sparkSession.createDataFrame(rowJavaRDD, structType); //创建临时视图 dataset.createOrReplaceTempView("user"); Dataset<Row> result = sparkSession.sql("select * from user"); result.show(); } }