Java Code Examples for org.apache.spark.sql.SparkSession#sql()

The following examples show how to use org.apache.spark.sql.SparkSession#sql(). Each example notes the project and source file it was taken from.
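All of the examples share the same basic pattern: obtain a SparkSession, expose data as a temporary view (or reference a table or file directly), and pass a SQL string to sql(), which returns the result as a new Dataset<Row>. A minimal sketch of that pattern (the input file and view name here are illustrative, not taken from any of the projects below):

SparkSession spark = SparkSession.builder()
        .master("local")
        .appName("SparkSession sql example")
        .getOrCreate();

Dataset<Row> people = spark.read().json("people.json");   // illustrative input file
people.createOrReplaceTempView("people");

// sql() parses the query, resolves it against registered views and tables,
// and returns the result as a new Dataset<Row>.
Dataset<Row> adults = spark.sql("SELECT name, age FROM people WHERE age >= 18");
adults.show();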
Example 1
Source File: SparkSqlApplication.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
            .appName("Java Spark SQL")
            .getOrCreate();

    Dataset<Row> dataset = sparkSession.read().json("D:\\a.txt");
    // Select only the "name" column
    dataset.select("name").show();
    // Select two columns, incrementing every age value by 1
    dataset.select(col("name"),col("age").plus(1)).show();
    // Filter for people older than 21
    dataset.filter(col("age").gt(21)).show();
    // Group by age and count
    dataset.groupBy("age").count().show();
    // Show the full dataset
    dataset.show();


    /* Run SQL queries programmatically */
    // Register the Dataset as a temporary view
    dataset.createOrReplaceTempView("user");
    Dataset<Row> users = sparkSession.sql("SELECT * FROM user");

    JavaRDD<Object> toText = users.toJavaRDD().map((Function<Row, Object>) v1 -> v1.getString(0));
    users.show();
}
 
Example 2
Source File: SqlQueryBasedTransformer.java    From hudi with Apache License 2.0
@Override
public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession, Dataset<Row> rowDataset,
    TypedProperties properties) {
  String transformerSQL = properties.getString(Config.TRANSFORMER_SQL);
  if (null == transformerSQL) {
    throw new IllegalArgumentException("Missing configuration : (" + Config.TRANSFORMER_SQL + ")");
  }

  // temp table names cannot contain dashes
  String tmpTable = TMP_TABLE.concat(UUID.randomUUID().toString().replace("-", "_"));
  LOG.info("Registering tmp table : " + tmpTable);
  rowDataset.registerTempTable(tmpTable);
  String sqlStr = transformerSQL.replaceAll(SRC_PATTERN, tmpTable);
  LOG.info("SQL Query for transformation : (" + sqlStr + ")");
  return sparkSession.sql(sqlStr);
}
 
Example 3
Source File: TestSuite.java    From stocator with Apache License 2.0
public void test3(SparkSession spark, Dataset<Row> schemaFlights, String outCSV1) throws Exception {
  try {
    System.out.println("*********************************");
    System.out.println("T3: Run SQL query on the dataset");
    schemaFlights.createOrReplaceTempView("flights");
    Dataset<Row> results = spark.sql(
        "select year, month, count(*) as Count  from flights WHERE cancellationCode like 'B' GROUP BY year, month ORDER By year, month");
    System.out.println("*********************************");
    System.out.println("T3: Save query result as CSV object: ");
    System.out.println("T3 " + outCSV1);
    results.write().mode("overwrite").format("com.databricks.spark.csv").save(outCSV1);
    // read and compare
    System.out.println("*********************************");
    System.out.println("T3: Read query CSV object and compare");
    JavaRDD<String> csvReadBack = spark.read().textFile(outCSV1).javaRDD();
    countAndCompare(results, csvReadBack.count(), "T3");
  } catch (Exception e) {
    throw e;
  } finally {
    deleteData(outCSV1, spark.sparkContext().hadoopConfiguration(), dataCreate);
  }

}
 
Example 4
Source File: JavaSQLDataSourceExample.java    From SparkDemo with MIT License
private static void runBasicDataSourceExample(SparkSession spark) {
  // $example on:generic_load_save_functions$
  Dataset<Row> usersDF = spark.read().load(Constant.LOCAL_FILE_PREX +"/data/resources/users.parquet");
  usersDF.select("name", "favorite_color").write().save("namesAndFavColors.parquet");
  // $example off:generic_load_save_functions$
  // $example on:manual_load_options$
  Dataset<Row> peopleDF =
    spark.read().format("json").load(Constant.LOCAL_FILE_PREX +"/data/resources/people.json");
  peopleDF.select("name", "age").write().format("parquet").save("namesAndAges.parquet");
  // $example off:manual_load_options$
  // $example on:direct_sql$
  Dataset<Row> sqlDF =
    spark.sql("SELECT * FROM parquet.`data/resources/users.parquet`");
  // $example off:direct_sql$
}
 
Example 5
Source File: CustomReportService.java    From mmtf-spark with Apache License 2.0
/**
 * Concatenates the structureId and chainId fields into a single
 * structureChainId key if a chainId column is present.
 * 
 * @param spark the Spark session
 * @param dataset the input dataset
 * @param columnNames names of additional columns to retain
 * @return dataset with the concatenated key prepended, or the original dataset if no chainId column exists
 */
private static Dataset<Row> concatIds(SparkSession spark, Dataset<Row> dataset, String[] columnNames) {
	if (Arrays.asList(dataset.columns()).contains("chainId")) {
		dataset.createOrReplaceTempView("table");

		String sql = "SELECT CONCAT(structureId,'.',chainId) as structureChainId," + "structureId,chainId,"
				+ columNamesString(columnNames) + " from table";
		dataset = spark.sql(sql);
	}
	return dataset;
}
 
Example 6
Source File: SequenceNgrammer.java    From mmtf-spark with Apache License 2.0
/**
 * Splits a one-letter sequence column (e.g., a protein sequence)
 * into an array of non-overlapping n-grams. To generate all possible n-grams,
 * this method needs to be called n times with shift parameters {0, ..., n-1}.
 * 
 * <p> Example 3-gram(shift=0) : IDCGHTVEDQR ... => [IDC, GHT, VED, ...]
 * <p> Example 3-gram(shift=1) : IDCGHTVEDQR ... => [DCG, HTV, EDQ, ...]
 * <p> Example 3-gram(shift=2) : IDCGHTVEDQR ... => [CGH, TVE, DQR, ...]
 * 
 * <p>For an application of shifted n-grams see:
 * E Asgari, MRK Mofrad, PLoS One. 2015; 10(11): e0141287, doi: 
 * <a href="https://dx.doi.org/10.1371/journal.pone.0141287">10.1371/journal.pone.0141287</a>
 *
 * @param data input dataset with column "sequence"
 * @param n size of the n-gram
 * @param shift start index for the n-gram
 * @param outputCol name of the output column
 * @return output dataset with appended ngram column
 */
public static Dataset<Row> shiftedNgram(Dataset<Row> data, int n, int shift, String outputCol) {
	SparkSession session = data.sparkSession();

	session.udf().register("encoder", new UDF1<String, String[]>() {
		private static final long serialVersionUID = 4844644794982507954L;

		@Override
		public String[] call(String s) throws Exception {
			if (shift > s.length()) {
				return new String[0];
			}
			s = s.substring(shift);
			int t = s.length() / n;
			
			String[] ngram = new String[t];

			for (int i = 0, j = 0; j < t; i += n) {
				ngram[j++] = s.substring(i, i + n);
			}
			return ngram;
		}
	}, DataTypes.createArrayType(DataTypes.StringType));

	data.createOrReplaceTempView("table");
	
	// append shifted ngram column
	return session.sql("SELECT *, encoder(sequence) AS " + outputCol + " from table");
}
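As the Javadoc above notes, covering every position of the sequence requires one call per shift. A usage sketch under that assumption (the output column names "ngram0".."ngram2" are illustrative):

// Generate all 3-grams by combining the non-overlapping runs from each shift.
int n = 3;
Dataset<Row> withNgrams = data;
for (int shift = 0; shift < n; shift++) {
    withNgrams = SequenceNgrammer.shiftedNgram(withNgrams, n, shift, "ngram" + shift);
}
withNgrams.select("sequence", "ngram0", "ngram1", "ngram2").show();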
 
Example 7
Source File: ProteinSequenceEncoder.java    From mmtf-spark with Apache License 2.0
private static Dataset<Row> averageFeatureVectors(Dataset<Row> data, String outputCol) {
	SparkSession session = data.sparkSession();

	session.udf().register("averager", new UDF3<Vector, Vector, Vector, Vector>() {
		private static final long serialVersionUID = -8190379199020903671L;

		@Override
		public Vector call(Vector v1, Vector v2, Vector v3) throws Exception {
			double[] f1 = v1.toArray();
			double[] f2 = v2.toArray();
			double[] f3 = v3.toArray();
			
			// arrays may be of different length
			int len = Math.min(Math.min(f1.length, f2.length), f3.length);
			double[] average = new double[len];

			for (int i = 0; i < len; i++) {
				average[i] = (f1[i] + f2[i] + f3[i]) / 3.0;
			}
			return Vectors.dense(average);
		}
	}, new VectorUDT());

	data.createOrReplaceTempView("table");
	// append new feature column with average values
	return session.sql("SELECT *, averager(features0,features1,features2) AS " + outputCol + " from table");
}
 
Example 8
Source File: ProteinSequenceEncoder.java    From mmtf-spark with Apache License 2.0
/**
 * Encodes a protein sequence by 7 physicochemical
 * properties.
 * 
 * <p> See: Meiler, J., Müller, M., Zeidler, A. et al. J Mol Model (2001) 7: 360. doi:
 * <a href="https://link.springer.com/article/10.1007/s008940100038">10.1007/s008940100038</a>
 *
 * @return dataset with feature vector appended
 */
public Dataset<Row> propertyEncode() {
	SparkSession session = data.sparkSession();
    int maxLength = getMaxSequenceLength(data);

	session.udf().register("encoder", new UDF1<String, Vector>(){
		private static final long serialVersionUID = 1L;

		@Override
		public Vector call(String s) throws Exception {
               double[] values = new double[7*maxLength];
			for (int i = 0, k = 0; i < s.length(); i++) {
				double[] property = properties.get(s.charAt(i));
				if (property != null) {
					for (double p: property) {
						values[k++] = p;
					}
				}	
			}
			return Vectors.dense(values);
		}
	}, new VectorUDT());

	// append feature column
	data.createOrReplaceTempView("table");
	data = session.sql("SELECT *, encoder(" + inputCol + ") AS " + outputCol + " from table");

	return data;
}
 
Example 9
Source File: ProteinSequenceEncoder.java    From mmtf-spark with Apache License 2.0
/**
 * One-hot encodes a protein sequence. The one-hot encoding
 * encodes the 20 natural amino acids, plus X for any other
 * residue, for a total of 21 elements per residue.
 * 
 * @return dataset with feature vector appended
 */
public Dataset<Row> oneHotEncode() {
	SparkSession session = data.sparkSession();
	int maxLength = getMaxSequenceLength(data);

	session.udf().register("encoder", new UDF1<String, Vector>() {
		private static final long serialVersionUID = -6095318836772114908L;

		@Override
		public Vector call(String s) throws Exception {
			int len = AMINO_ACIDS21.size();
               double[] values = new double[len * maxLength];
			char[] seq = s.toCharArray();
			for (int i = 0; i < seq.length; i++) {
				int index = AMINO_ACIDS21.indexOf(seq[i]);
				// replace any non-matching code, e.g., U, with X
				if (index == -1) {
					index = AMINO_ACIDS21.indexOf('X');
				}
				values[i * len + index] = 1;
			}

			return Vectors.dense(values);
		}
	}, new VectorUDT());

	// append feature column
	data.createOrReplaceTempView("table");
	data = session.sql("SELECT *, encoder(" + inputCol + ") AS " + outputCol + " from table");

	return data;
}
 
Example 10
Source File: SparkSessionHeloWorld.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
	System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
	SparkSession sparkSession = SparkSession.builder()
			.master("local")
			.appName("CSV Read Example")
			.config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse")
			.getOrCreate();
	
	Dataset<Row> csv = sparkSession.read().format("com.databricks.spark.csv").option("header","true")
			.load("C:\\Users\\sgulati\\Documents\\my_docs\\book\\testdata\\emp.csv");
	
	csv.createOrReplaceTempView("test");
	Dataset<Row> sql = sparkSession.sql("select * from test");
	sql.collectAsList();
}
 
Example 11
Source File: FlatteningTransformer.java    From hudi with Apache License 2.0
/**
 * Configs supported.
 */
@Override
public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession, Dataset<Row> rowDataset,
    TypedProperties properties) {

  // temp table names cannot contain dashes
  String tmpTable = TMP_TABLE.concat(UUID.randomUUID().toString().replace("-", "_"));
  LOG.info("Registering tmp table : " + tmpTable);
  rowDataset.registerTempTable(tmpTable);
  return sparkSession.sql("select " + flattenSchema(rowDataset.schema(), null) + " from " + tmpTable);
}
 
Example 12
Source File: SparkStreamingSqlAnalyse.java    From sylph with Apache License 2.0
/**
 * Precompile the SQL so that errors are caught up front rather than at runtime.
 */
private static void checkDStream(
        SparkSession spark,
        String sourceTableName,
        StructType sourceSchema,
        List<Consumer<SparkSession>> handlers
)
{
    RDD<Row> rdd = spark.sparkContext().<Row>emptyRDD(ClassTag$.MODULE$.<Row>apply(Row.class));
    Dataset<Row> df = spark.createDataFrame(rdd, sourceSchema);
    df.createOrReplaceTempView(sourceTableName);
    handlers.forEach(x -> x.accept(spark));
    spark.sql("drop view " + sourceTableName);
}
 
Example 13
Source File: PortfolioCollector.java    From ExecDashboard with Apache License 2.0
public void collectCMDB(SparkSession sparkSession, JavaSparkContext javaSparkContext) {
    LOGGER.info("##### Begin: collectCMDB #####");
    if ((sparkSession == null) || (javaSparkContext == null)) {
        return;
    }

    DataFrameLoader.loadDataFrame("cmdb", javaSparkContext);
    DataFrameLoader.loadDataFrame("dashboards", javaSparkContext);

    // Unique list of products per businessOwner
    Dataset<Row> productRows = sparkSession.sql(HygieiaSparkQuery.CMDB_PRODUCT_QUERY);
    productRows.groupBy("businessOwner", "productName");

    Dataset<Row> lobRows = sparkSession.sql(HygieiaSparkQuery.CMDB_PRODUCT_QUERY);
    lobRows.groupBy( "ownerDept");

    // Unique list of Environments
    Dataset<Row> environmentRows = sparkSession.sql(HygieiaSparkQuery.CMDB_ENVIRONMENT_QUERY);
    environmentRows.groupBy("configurationItem");

    // Unique list of components
    Dataset<Row> componentRows = sparkSession.sql(HygieiaSparkQuery.CMDB_COMPONENT_QUERY);
    componentRows.groupBy("configurationItem");

    // Unique list of dashboards; no groupBy needed here
    Dataset<Row> dashboardRows = sparkSession.sql(HygieiaSparkQuery.DASHBOARD_QUERY_EXPLODE);

    productRowsList = productRows.collectAsList();
    lobRowsList = lobRows.collectAsList();
    environmentRowsList = environmentRows.collectAsList();
    componentRowsList = componentRows.collectAsList();
    dashboardRowsList = dashboardRows.collectAsList();
    List<Portfolio> portfolioList = createPortfolios();
    portfolioRepositoryThumbnail.deleteAll();
    getPortfolioWithThumbnails(portfolioList);
    portfolioRepository.deleteAll();
    portfolioRepository.save(portfolioList);
    LOGGER.info("##### End: collectCMDB #####");
}
 
Example 14
Source File: HiveDataSource.java    From SparkDemo with MIT License
public static void main(String[] args) {
	
	/*
	 * 0. Copy Hive's hive-site.xml into the spark/conf directory
	 * 1. Start MySQL
	 * 2. Start HDFS
	 * 3. Start Hive: ./hive
	 * 4. Initialize the HiveContext / SparkSession
	 * 5. Package and submit the job:
	 * 
	 * ./bin/spark-submit --master yarn-cluster --class com.huangyueran.spark.sql.HiveDataSource /root/spark_hive_datasource.jar
	 * ./bin/spark-submit --master yarn-client --class com.huangyueran.spark.sql.HiveDataSource /root/spark_hive_datasource.jar
	 */
	
	JavaSparkContext sc = SparkUtils.getRemoteSparkContext(HiveDataSource.class);
	// Create the SparkSession. Note that it takes a SparkContext as its argument, not a JavaSparkContext;
	// a JavaSparkContext would also work, since internally it just unwraps to sc.sc().
	// HiveContext hiveContext = new HiveContext(sc.sc()); // deprecated; SparkSession is recommended instead
	SparkSession sparkSession = new SparkSession(sc.sc());
	Dataset<Row> person = sparkSession.sql("show databases");
	person.show();

	List<Row> list = person.javaRDD().collect();
	System.out.println("=============================================================");
	for (Row r : list) {
		System.out.println(r);
	}
	System.out.println("=============================================================");

	sc.close();
}
 
Example 15
Source File: CustomDataFrame.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local")
            .appName("spark app")
            .getOrCreate();

    // Create a plain JavaRDD
    JavaRDD<String> javaRDD = sparkSession.sparkContext().textFile("URL", 1).toJavaRDD();
    // The schema encoded as a string
    String schema = "name age";

    // Generate the schema from the schema string
    List<StructField> structFieldList = new ArrayList<>();
    for (String fieldName : schema.split(" ")) {
        StructField structField = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
        structFieldList.add(structField);
    }
    StructType structType = DataTypes.createStructType(structFieldList);

    JavaRDD<Row> rowJavaRDD = javaRDD.map(new Function<String, Row>() {
        @Override
        public Row call(String v1) {
            String[] attributes = v1.split(",");
            return RowFactory.create(attributes[0], attributes[1].trim());
        }
    });

    // Apply the schema to the RDD
    Dataset<Row> dataset = sparkSession.createDataFrame(rowJavaRDD, structType);

    // Create a temporary view
    dataset.createOrReplaceTempView("user");
    Dataset<Row> result = sparkSession.sql("select * from user");
    result.show();
}
 
Example 16
Source File: AbstractConceptMaps.java    From bunsen with Apache License 2.0
/**
 * Creates a table of mapping records partitioned by conceptmapuri and
 * conceptmapversion.
 *
 * @param spark the spark session
 * @param tableName the name of the mapping table
 * @param location the location to store the table, or null to create a Hive-managed table.
 * @throws IllegalArgumentException if the table name or location are malformed.
 */
private static void createMappingTable(SparkSession spark,
    String tableName,
    String location) {

  if (!TABLE_NAME_PATTERN.matcher(tableName).matches()) {
    throw new IllegalArgumentException("Invalid table name: " + tableName);
  }

  // Hive will check for well-formed paths, so we just ensure
  // a user isn't attempting to inject additional SQL into the statement.
  if (location != null && location.contains(";")) {
    throw new IllegalArgumentException("Invalid path for mapping table: "
        + location);
  }

  StringBuilder builder = new StringBuilder();

  if (location != null) {

    builder.append("CREATE EXTERNAL TABLE IF NOT EXISTS ");

  } else {

    builder.append("CREATE TABLE IF NOT EXISTS ");
  }

  builder.append(tableName);

  // Note the partitioned by columns are deliberately lower case here,
  // since Spark does not appear to match columns to
  // Hive partitions if they are not.
  builder.append("(sourceValueSet STRING, "
      + "targetValueSet STRING, "
      + "sourceSystem STRING, "
      + "sourceValue STRING, "
      + "targetSystem STRING, "
      + "targetValue STRING, "
      + "equivalence STRING)\n"
      + "PARTITIONED BY (conceptmapuri STRING, conceptmapversion STRING)\n");

  builder.append("STORED AS PARQUET\n");

  if (location != null) {
    builder.append("LOCATION '")
        .append(location)
        .append("'");
  }

  spark.sql(builder.toString());
}
 
Example 17
Source File: JavaSparkHiveExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on:spark_hive$
  // warehouseLocation points to the default location for managed databases and tables
  String warehouseLocation = "spark-warehouse";
  SparkSession spark = SparkSession
    .builder()
    .appName("Java Spark Hive Example")
    .config("spark.sql.warehouse.dir", warehouseLocation)
    .enableHiveSupport()
    .getOrCreate();

  spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)");
  spark.sql("LOAD DATA LOCAL INPATH 'data/resources/kv1.txt' INTO TABLE src");

  // Queries are expressed in HiveQL
  spark.sql("SELECT * FROM src").show();
  // +---+-------+
  // |key|  value|
  // +---+-------+
  // |238|val_238|
  // | 86| val_86|
  // |311|val_311|
  // ...

  // Aggregation queries are also supported.
  spark.sql("SELECT COUNT(*) FROM src").show();
  // +--------+
  // |count(1)|
  // +--------+
  // |    500 |
  // +--------+

  // The results of SQL queries are themselves DataFrames and support all normal functions.
  Dataset<Row> sqlDF = spark.sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key");

  // The items in DataFrames are of type Row, which lets you access each column by ordinal.
  Dataset<String> stringsDS = sqlDF.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Key: " + row.get(0) + ", Value: " + row.get(1);
    }
  }, Encoders.STRING());
  stringsDS.show();
  // +--------------------+
  // |               value|
  // +--------------------+
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // |Key: 0, Value: val_0|
  // ...

  // You can also use DataFrames to create temporary views within a SparkSession.
  List<Record> records = new ArrayList<>();
  for (int key = 1; key < 100; key++) {
    Record record = new Record();
    record.setKey(key);
    record.setValue("val_" + key);
    records.add(record);
  }
  Dataset<Row> recordsDF = spark.createDataFrame(records, Record.class);
  recordsDF.createOrReplaceTempView("records");

  // Queries can then join DataFrames data with data stored in Hive.
  spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show();
  // +---+------+---+------+
  // |key| value|key| value|
  // +---+------+---+------+
  // |  2| val_2|  2| val_2|
  // |  2| val_2|  2| val_2|
  // |  4| val_4|  4| val_4|
  // ...
  // $example off:spark_hive$

  spark.stop();
}
 
Example 18
Source File: JavaSparkSQLExample.java    From SparkDemo with MIT License
private static void runBasicDataFrameExample(SparkSession spark) throws AnalysisException {
  // $example on:create_df$
  Dataset<Row> df = spark.read().json(Constant.LOCAL_FILE_PREX +"/data/resources/people.json");

  // Displays the content of the DataFrame to stdout
  df.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:create_df$

  // $example on:untyped_ops$
  // Print the schema in a tree format
  df.printSchema();
  // root
  // |-- age: long (nullable = true)
  // |-- name: string (nullable = true)

  // Select only the "name" column
  df.select("name").show();
  // +-------+
  // |   name|
  // +-------+
  // |Michael|
  // |   Andy|
  // | Justin|
  // +-------+

  // Select everybody, but increment the age by 1
  df.select(col("name"), col("age").plus(1)).show();
  // +-------+---------+
  // |   name|(age + 1)|
  // +-------+---------+
  // |Michael|     null|
  // |   Andy|       31|
  // | Justin|       20|
  // +-------+---------+

  // Select people older than 21
  df.filter(col("age").gt(21)).show();
  // +---+----+
  // |age|name|
  // +---+----+
  // | 30|Andy|
  // +---+----+

  // Count people by age
  df.groupBy("age").count().show();
  // +----+-----+
  // | age|count|
  // +----+-----+
  // |  19|    1|
  // |null|    1|
  // |  30|    1|
  // +----+-----+
  // $example off:untyped_ops$

  // $example on:run_sql$
  // Register the DataFrame as a SQL temporary view
  df.createOrReplaceTempView("people");

  Dataset<Row> sqlDF = spark.sql("SELECT * FROM people");
  sqlDF.show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:run_sql$

  // $example on:global_temp_view$
  // Register the DataFrame as a global temporary view
  df.createGlobalTempView("people");

  // Global temporary view is tied to a system preserved database `global_temp`
  spark.sql("SELECT * FROM global_temp.people").show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+

  // Global temporary view is cross-session
  spark.newSession().sql("SELECT * FROM global_temp.people").show();
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+
  // $example off:global_temp_view$
}
 
Example 19
Source File: JavaSparkSQLExample.java    From SparkDemo with MIT License
private static void runProgrammaticSchemaExample(SparkSession spark) {
  // $example on:programmatic_schema$
  // Create an RDD
  JavaRDD<String> peopleRDD = spark.sparkContext()
    .textFile(Constant.LOCAL_FILE_PREX +"/data/resources/people.txt", 1)
    .toJavaRDD();

  // The schema is encoded in a string
  String schemaString = "name age";

  // Generate the schema based on the string of schema
  List<StructField> fields = new ArrayList<>();
  for (String fieldName : schemaString.split(" ")) {
    StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
    fields.add(field);
  }
  StructType schema = DataTypes.createStructType(fields);

  // Convert records of the RDD (people) to Rows
  JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
    @Override
    public Row call(String record) throws Exception {
      String[] attributes = record.split(",");
      return RowFactory.create(attributes[0], attributes[1].trim());
    }
  });

  // Apply the schema to the RDD
  Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);

  // Creates a temporary view using the DataFrame
  peopleDataFrame.createOrReplaceTempView("people");

  // SQL can be run over a temporary view created using DataFrames
  Dataset<Row> results = spark.sql("SELECT name FROM people");

  // The results of SQL queries are DataFrames and support all the normal RDD operations
  // The columns of a row in the result can be accessed by field index or by field name
  Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
    @Override
    public String call(Row row) throws Exception {
      return "Name: " + row.getString(0);
    }
  }, Encoders.STRING());
  namesDS.show();
  // +-------------+
  // |        value|
  // +-------------+
  // |Name: Michael|
  // |   Name: Andy|
  // | Name: Justin|
  // +-------------+
  // $example off:programmatic_schema$
}
 
Example 20
Source File: Bundles.java    From bunsen with Apache License 2.0
/**
 * Saves an RDD of bundles as a database, where each table
 * has the resource name. This offers a simple way to load and query
 * bundles in a system, although users with more sophisticated ETL
 * operations may want to explicitly write different entities.
 *
 * <p>
 * Note this will access the given RDD of bundles once per resource name,
 * so consumers with enough memory should consider calling
 * {@link JavaRDD#cache()} so that the RDD is not recomputed for each resource.
 * </p>
 *
 * @param spark the spark session
 * @param bundles an RDD of FHIR Bundles
 * @param database the name of the database to write to
 * @param resourceNames names of resources to be extracted from the bundle and written
 */
public void saveAsDatabase(SparkSession spark,
    JavaRDD<BundleContainer> bundles,
    String database,
    String... resourceNames) {

  spark.sql("create database if not exists " + database);

  for (String resourceName : resourceNames) {

    Dataset ds = extractEntry(spark, bundles, resourceName);

    ds.write().saveAsTable(database + "." + resourceName.toLowerCase());
  }
}
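A minimal usage sketch of the method above (the database and resource names are illustrative; bundleRdd stands in for a JavaRDD<BundleContainer> obtained from the Bunsen bundle loader, and bundles is the Bundles instance the method belongs to):

// Cache the bundle RDD, as the Javadoc recommends, since saveAsDatabase
// traverses it once per resource name.
JavaRDD<BundleContainer> cached = bundleRdd.cache();

// Writes one table per resource, e.g., fhir_db.patient and fhir_db.observation.
bundles.saveAsDatabase(spark, cached, "fhir_db", "Patient", "Observation", "Condition");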