org.apache.spark.sql.SparkSession Java Examples

The following examples show how to use org.apache.spark.sql.SparkSession. Each example comes from an open source project; the source file and project are noted above the example.
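For reference, here is a minimal, self-contained sketch of the usual entry point (assuming a local run; the application name is arbitrary):

import org.apache.spark.sql.SparkSession;

public class SparkSessionSketch {
  public static void main(String[] args) {
    // Build (or reuse) the single SparkSession for this JVM.
    SparkSession spark = SparkSession.builder()
        .appName("SparkSessionSketch")
        .master("local[*]") // local mode; drop this when submitting to a cluster
        .getOrCreate();

    System.out.println("Spark version: " + spark.version());

    spark.stop();
  }
}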
Example #1
Source File: MyVariantDataset.java    From mmtf-spark with Apache License 2.0
/**
 * Returns a dataset of missense variations for a list of Uniprot Ids and a MyVariant.info query.
 * See <a href="http://myvariant.info/docs/">query syntax</a>.
 * <p> Example:
 * <pre>
 * String query = "clinvar.rcv.clinical_significance:pathogenic " 
 *                + "OR clinvar.rcv.clinical_significance:likely pathogenic";
 * </pre>
 * 
 * @param uniprotIds list of Uniprot Ids
 * @param query MyVariant.info query string
 * @return dataset with variation Ids and Uniprot Ids or null if no data are found
 * @throws IOException
 */
public static Dataset<Row> getVariations(List<String> uniprotIds, String query) throws IOException {
    // get a spark context
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") // sc will be closed elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // download data in parallel
    JavaRDD<String> data = sc.parallelize(uniprotIds).flatMap(m -> getData(m, query));

    // convert from JavaRDD to Dataset
    Dataset<String> jsonData = spark.createDataset(JavaRDD.toRDD(data), Encoders.STRING());

    // parse json strings and return as a dataset
    Dataset<Row> dataset = spark.read().json(jsonData);

    // return null if dataset contains no results
    if (!Arrays.asList(dataset.columns()).contains("hits")) {
        System.out.println("MyVariantDataset: no matches found");
        return null;
    }

    return flattenDataset(dataset);
}
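
A hypothetical call site for the method above (the Uniprot Ids and query string are made-up inputs, and the snippet assumes the same imports as the example):

List<String> uniprotIds = Arrays.asList("P00533", "Q9H3D4");
String query = "clinvar.rcv.clinical_significance:pathogenic";
Dataset<Row> variations = MyVariantDataset.getVariations(uniprotIds, query);
if (variations != null) {
    variations.show();
}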
 
Example #2
Source File: GraphLoader.java    From tutorials with MIT License
public GraphFrame getGraphFrameUserRelationship() throws IOException {
    Path temp = Files.createTempDirectory("sparkGraphFrames");
    SparkSession session = SparkSession.builder()
        .appName("SparkGraphFrameSample")
        .config("spark.sql.warehouse.dir", temp.toString())
        .sparkContext(getSparkContext().sc())
        .master("local[*]")
        .getOrCreate();
    List<User> users = loadUsers();

    Dataset<Row> userDataset = session.createDataFrame(users, User.class);

    List<Relationship> relationshipsList = getRelations();
    Dataset<Row> relationshipDataset = session.createDataFrame(relationshipsList, Relationship.class);

    GraphFrame graphFrame = new GraphFrame(userDataset, relationshipDataset);

    return graphFrame;
}
 
Example #3
Source File: TestSuite.java    From stocator with Apache License 2.0
public void test16(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type)
    throws Exception {
  System.out.println("*********************************");
  System.out.println("T16: Non overwrite mode " + containerOut);
  String o1 = containerOut + "myData/123";
  StructType schema = DataTypes
      .createStructType(new StructField[] { DataTypes.createStructField("NAME", DataTypes.StringType, false),
          DataTypes.createStructField("STRING_VALUE", DataTypes.StringType, false),
          DataTypes.createStructField("NUM_VALUE", DataTypes.IntegerType, false), });
  Row r1 = RowFactory.create("name1", "value1", 1);
  Row r2 = RowFactory.create("name2", "value2", 2);
  List<Row> rowList = ImmutableList.of(r1, r2);
  Dataset<Row> rows = spark.createDataFrame(rowList, schema);
  try {
    if (type.equals(Constants.PARQUET_TYPE)) {
      rows.write().mode(SaveMode.Overwrite).parquet(o1);
    } else if (type.equals(Constants.JSON_TYPE)) {
      rows.write().mode(SaveMode.Overwrite).json(o1);
    }
  } catch (Exception e) {
    deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
    throw e;
  } finally {
    deleteData(o1, spark.sparkContext().hadoopConfiguration(), dataCreate);
  }
}
 
Example #4
Source File: global.java    From sparkResearch with Apache License 2.0
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder().master("local")
            .appName("Java Spark SQL")
            .getOrCreate();

    Dataset<Row> dataset = sparkSession.read().json("URL");
    try {
        // Create a global temporary view
        dataset.createGlobalTempView("user");
        // Global temporary views are tied to the system-preserved database "global_temp"
        Dataset<Row> globalUser = sparkSession.sql("SELECT * FROM global_temp.user");
        sparkSession.newSession().sql("SELECT * FROM global_temp.user");
    } catch (AnalysisException e) {
        e.printStackTrace();
    }
}
 
Example #5
Source File: TestSuite.java    From stocator with Apache License 2.0
public void test14(SparkSession spark, Dataset<Row> schemaFlights, String containerOut, String type)
    throws Exception {
  System.out.println("*********************************");
  System.out.println("T14: Append mode " + containerOut);
  String o1 = containerOut + "myData";
  try {
    createAppendObject("T14 - first append", schemaFlights, o1, type);
    long baseCount = schemaFlights.count();
    System.out
        .println("***T14-1 : Reading " + o1 + " from " + containerOut + ", base unit " + baseCount + " type " + type);
    readAndTest("T14-1-" + type, type, o1, spark, baseCount, 1);
    createAppendObject("T14 - second append", schemaFlights, o1, type);
    baseCount = schemaFlights.count();
    System.out
        .println("***T14-2 : Reading " + o1 + " from " + containerOut + ", base unit " + baseCount + " type " + type);
    readAndTest("T14-2-" + type, type, o1, spark, baseCount, 2);
  } catch (Exception e) {
    throw e;
  } finally {
    deleteData(o1, spark.sparkContext().hadoopConfiguration(), true);
  }
}
 
Example #6
Source File: AutomatedTestBase.java    From systemds with Apache License 2.0
/**
 * Create a SystemDS-preferred Spark Session.
 *
 * @param appName the application name
 * @param master the master value (e.g., "local")
 * @return Spark Session
 */
public static SparkSession createSystemDSSparkSession(String appName, String master) {
	Builder builder = SparkSession.builder();
	if (appName != null) {
		builder.appName(appName);
	}
	if (master != null) {
		builder.master(master);
	}
	builder.config("spark.driver.maxResultSize", "0");
	if (SparkExecutionContext.FAIR_SCHEDULER_MODE) {
		builder.config("spark.scheduler.mode", "FAIR");
	}
	builder.config("spark.locality.wait", "5s");
	SparkSession spark = builder.getOrCreate();
	return spark;
}
 
Example #7
Source File: BooksCsvToDataset.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName("Book CSV to Dataset")
      .master("local").getOrCreate();

  String filename = "data/books.csv";
  // @formatter:off
Dataset<Row> df = spark
		.read()
		.format("csv")
		.option("inferSchema", "false") // We are not inferring the schema for now
		.option("header", "true")
		.load(filename);
// @formatter:on
  df.show();

  // In this case everything is a string
  df.printSchema();
}
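
As a point of comparison, a sketch of the same read with schema inference turned on (assuming the same data/books.csv file), so numeric columns come back typed instead of as strings:

Dataset<Row> typedDf = spark
    .read()
    .format("csv")
    .option("inferSchema", "true") // sample the file to guess column types
    .option("header", "true")
    .load("data/books.csv");
typedDf.printSchema(); // numeric columns are no longer plain strings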
 
Example #8
Source File: JavaSparkSQLExample.java    From SparkDemo with MIT License
public static void main(String[] args) throws AnalysisException {
  // $example on:init_session$
  SparkSession spark = SparkSession
    .builder()
    .appName("Java Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate();
  // $example off:init_session$

  runBasicDataFrameExample(spark);
  runDatasetCreationExample(spark);
  runInferSchemaExample(spark);
  runProgrammaticSchemaExample(spark);

  spark.stop();
}
 
Example #9
Source File: RDDConverterUtilsExtTest.java    From systemds with Apache License 2.0
@Test
public void testStringDataFrameToVectorDataFrameNull() {
	List<String> list = new ArrayList<>();
	list.add("[1.2, 3.4]");
	list.add(null);
	JavaRDD<String> javaRddString = sc.parallelize(list);
	JavaRDD<Row> javaRddRow = javaRddString.map(new StringToRow());
	SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
	List<StructField> fields = new ArrayList<>();
	fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
	StructType schema = DataTypes.createStructType(fields);
	Dataset<Row> inDF = sparkSession.createDataFrame(javaRddRow, schema);
	Dataset<Row> outDF = RDDConverterUtilsExt.stringDataFrameToVectorDataFrame(sparkSession, inDF);

	List<String> expectedResults = new ArrayList<>();
	expectedResults.add("[[1.2,3.4]]");
	expectedResults.add("[null]");

	List<Row> outputList = outDF.collectAsList();
	for (Row row : outputList) {
		assertTrue("Expected results don't contain: " + row, expectedResults.contains(row.toString()));
	}
}
 
Example #10
Source File: JavaIgniteDataFrameExample.java    From ignite with Apache License 2.0
/** */
private static void nativeSparkSqlExample(SparkSession spark) {
    System.out.println("Querying using Spark SQL.");

    Dataset<Row> df = spark.read()
            .format(IgniteDataFrameSettings.FORMAT_IGNITE()) //Data source type.
            .option(IgniteDataFrameSettings.OPTION_TABLE(), "person") //Table to read.
            .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG) //Ignite config.
            .load();

    //Registering DataFrame as Spark view.
    df.createOrReplaceTempView("person");

    //Selecting data from Ignite through Spark SQL Engine.
    Dataset<Row> igniteDF = spark.sql("SELECT * FROM person WHERE id >= 2 AND name = 'Mary Major'");

    System.out.println("Result schema:");

    igniteDF.printSchema(); //Printing query schema to console.

    System.out.println("Result content:");

    igniteDF.show(); //Printing query results to console.
}
 
Example #11
Source File: BasicExternalUdfFromTextFile.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName("CSV to Dataset")
      .master("local").getOrCreate();

  spark.udf().register("x2Multiplier", new Multiplier2(),
      DataTypes.IntegerType);

  String filename = "data/tuple-data-file.csv";
  Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true")
      .option("header", "false").load(filename);
  df = df.withColumn("label", df.col("_c0")).drop("_c0");
  df = df.withColumn("value", df.col("_c1")).drop("_c1");
  df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast(
      DataTypes.IntegerType)));
  df.show();
}
 
Example #12
Source File: DefaultMetricCollector.java    From ExecDashboard with Apache License 2.0
private List<String> getCollectorItemListForLobs(List<Lob> lobList, SparkSession sparkSession, JavaSparkContext javaSparkContext) {
    dashboardCollectorItemsMap
            = DashBoardCollectorItemMapBuilder.getDashboardNameCollectorItemsMapById(getCollectorType(), sparkSession, javaSparkContext);

    List<String> collectorItemList = new ArrayList<>();
    Optional.ofNullable(lobList).orElseGet(Collections::emptyList).stream()
            .map(Lob::getProducts)
            .forEach(products -> products.stream()
                    .map(Product::getProductComponentList)
                    .forEach(productComponents -> productComponents
                            .stream()
                            .map(ProductComponent::getProductComponentDashboardId)
                            .filter(Objects::nonNull)
                            .<List<String>>map(dashboardId -> dashboardCollectorItemsMap.get(dashboardId.toString()) != null ? dashboardCollectorItemsMap.get(dashboardId.toString()) : new ArrayList<>())
                            .forEach(collectorItemList::addAll)));
    return collectorItemList;
}
 
Example #13
Source File: JavaShakespeare.java    From spark-bigquery-connector with Apache License 2.0
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
            .appName("spark-bigquery-demo")
            .getOrCreate();

    // Use the Cloud Storage bucket for temporary BigQuery export data used
    // by the connector. This assumes the Cloud Storage connector for
    // Hadoop is configured.
    String bucket = spark.sparkContext().hadoopConfiguration().get("fs.gs.system.bucket");
    spark.conf().set("temporaryGcsBucket", bucket);

    // Load data in from BigQuery.
    Dataset<Row> wordsDF = spark.read().format("bigquery")
            .option("table", "bigquery-public-data.samples.shakespeare").load().cache();
    wordsDF.show();
    wordsDF.printSchema();
    wordsDF.createOrReplaceTempView("words");

    // Perform word count.
    Dataset<Row> wordCountDF = spark.sql(
            "SELECT word, SUM(word_count) AS word_count FROM words GROUP BY word");

    // Saving the data to BigQuery
    wordCountDF.write().format("bigquery").option("table", "wordcount_dataset.wordcount_output")
            .save();
}
 
Example #14
Source File: CustomReportService.java    From mmtf-spark with Apache License 2.0
/**
 * Returns a dataset with the specified columns for all current PDB entries.
 * See the <a href="https://www.rcsb.org/pdb/results/reportField.do">list of
 * supported field names</a>.
 * 
 * @param columnNames
 *            names of the columns for the dataset
 * @return dataset with the specified columns
 * @throws IOException
 *             when temporary csv file cannot be created
 */
public static Dataset<Row> getDataset(String... columnNames) throws IOException {
	// form query URL
	String query = CURRENT_URL + columNamesString(columnNames);

	// run tabular report query
	InputStream input = postQuery(query);

	// save as a temporary CSV file
	Path tempFile = saveTempFile(input);

	SparkSession spark = SparkSession.builder().getOrCreate();

	// load temporary CSV file into Spark dataset
	Dataset<Row> dataset = readCsv(spark, tempFile.toString());

	return concatIds(spark, dataset, columnNames);
}
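
A hypothetical call site for getDataset (the column names below are illustrative only; consult the report-field page linked above for the names actually supported):

Dataset<Row> report = CustomReportService.getDataset("pmc", "pubmedId", "depositionDate");
report.printSchema();
report.show(5);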
 
Example #15
Source File: ExternalTableIT.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testParquetColumnName() throws Exception {
    String tablePath = getExternalResourceDirectory()+"parquet_colname";
    methodWatcher.execute(String.format("create external table t_parquet (col1 int, col2 varchar(5))" +
            " STORED AS PARQUET LOCATION '%s'", tablePath));
    methodWatcher.execute("insert into t_parquet values (1, 'A')");
    SparkSession spark = SparkSession.builder()
            .master("local")
            .appName("ExternaltableIT")
            .getOrCreate();

    Dataset dataset = spark
            .read()
            .parquet(tablePath);
    String actual = dataset.schema().toString();
    String expected = "StructType(StructField(COL1,IntegerType,true), StructField(COL2,StringType,true))";
    Assert.assertEquals(actual, expected, actual);
}
 
Example #16
Source File: JavaWord2VecExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaWord2VecExample")
    .getOrCreate();

  // $example on$
  // Input data: Each row is a bag of words from a sentence or document.
  List<Row> data = Arrays.asList(
    RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
    RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
    RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
  });
  Dataset<Row> documentDF = spark.createDataFrame(data, schema);

  // Learn a mapping from words to Vectors.
  Word2Vec word2Vec = new Word2Vec()
    .setInputCol("text")
    .setOutputCol("result")
    .setVectorSize(3)
    .setMinCount(0);

  Word2VecModel model = word2Vec.fit(documentDF);
  Dataset<Row> result = model.transform(documentDF);

  for (Row row : result.collectAsList()) {
    List<String> text = row.getList(0);
    Vector vector = (Vector) row.get(1);
    System.out.println("Text: " + text + " => \nVector: " + vector + "\n");
  }
  // $example off$

  spark.stop();
}
 
Example #17
Source File: SparkFactoryImpl.java    From beakerx with Apache License 2.0
private Optional<SparkSessionBuilder> getBuilderFromUser(Object result) {
  if (result instanceof SparkConf) {
    return of(sparkSessionBuilderFactory.newInstance((SparkConf) result));
  } else if (result instanceof SparkSession.Builder) {
    return of(sparkSessionBuilderFactory.newInstance((SparkSession.Builder) result));
  } else {
    return Optional.empty();
  }
}
 
Example #18
Source File: MLContextUtil.java    From systemds with Apache License 2.0
/**
 * Check that the Spark version is supported. If it isn't supported, throw
 * an MLContextException.
 *
 * @param spark
 *            SparkSession
 * @throws MLContextException
 *             thrown if Spark version isn't supported
 */
public static void verifySparkVersionSupported(SparkSession spark) {
	String minimumRecommendedSparkVersion = null;
	try {
		// If this is being called using the SystemDS jar file,
		// ProjectInfo should be available.
		ProjectInfo projectInfo = ProjectInfo.getProjectInfo();
		minimumRecommendedSparkVersion = projectInfo.minimumRecommendedSparkVersion();
	} catch (MLContextException e) {
		try {
			// During development (such as in an IDE), there is no jar file
			// typically
			// built, so attempt to obtain the minimum recommended Spark
			// version from
			// the pom.xml file
			minimumRecommendedSparkVersion = getMinimumRecommendedSparkVersionFromPom();
		} catch (MLContextException e1) {
			throw new MLContextException(
					"Minimum recommended Spark version could not be determined from SystemDS jar file manifest or pom.xml");
		}
	}
	String sparkVersion = spark.version();
	if (!MLContextUtil.isSparkVersionSupported(sparkVersion, minimumRecommendedSparkVersion)) {
		throw new MLContextException(
				"Spark " + sparkVersion + " or greater is recommended for this version of SystemDS.");
	}
}
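
A minimal usage sketch (assumption: the check is run early in an MLContext-based program to fail fast on an unsupported Spark build):

SparkSession spark = SparkSession.builder()
    .appName("VersionCheck")
    .master("local")
    .getOrCreate();
try {
    MLContextUtil.verifySparkVersionSupported(spark);
} catch (MLContextException e) {
    // Spark is older than the minimum recommended version for SystemDS.
    System.err.println(e.getMessage());
}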
 
Example #19
Source File: DataPreview.java    From StockPrediction with MIT License
public static void main (String[] args) throws IOException {
    SparkSession spark = SparkSession.builder().master("local").appName("DataProcess").getOrCreate();
    String filename = "prices-split-adjusted.csv";
    String symbol = "GOOG";
    // load data from csv file
    Dataset<Row> data = spark.read().format("csv").option("header", true)
            .load(new ClassPathResource(filename).getFile().getAbsolutePath())
            //.filter(functions.col("symbol").equalTo(symbol))
            //.drop("date").drop("symbol")
            .withColumn("openPrice", functions.col("open").cast("double")).drop("open")
            .withColumn("closePrice", functions.col("close").cast("double")).drop("close")
            .withColumn("lowPrice", functions.col("low").cast("double")).drop("low")
            .withColumn("highPrice", functions.col("high").cast("double")).drop("high")
            .withColumn("volumeTmp", functions.col("volume").cast("double")).drop("volume")
            .toDF("date", "symbol", "open", "close", "low", "high", "volume");

    data.show();

    Dataset<Row> symbols = data.select("date", "symbol").groupBy("symbol").agg(functions.count("date").as("count"));
    System.out.println("Number of Symbols: " + symbols.count());
    symbols.show();

    VectorAssembler assembler = new VectorAssembler()
            .setInputCols(new String[] {"open", "low", "high", "volume", "close"})
            .setOutputCol("features");

    data = assembler.transform(data).drop("open", "low", "high", "volume", "close");

    // rescale features to [0, 1]; the remaining columns are date, symbol, normalizedFeatures,
    // so the rename must supply all three names
    data = new MinMaxScaler().setMin(0).setMax(1)
            .setInputCol("features").setOutputCol("normalizedFeatures")
            .fit(data).transform(data)
            .drop("features").toDF("date", "symbol", "features");
}
 
Example #20
Source File: MLContextMultipleScriptsTest.java    From systemds with Apache License 2.0
private static void runMLContextTestMultipleScript(ExecMode platform, boolean wRead) 
{
	ExecMode oldplatform = DMLScript.getGlobalExecMode();
	DMLScript.setGlobalExecMode(platform);
	
	//create mlcontext
	SparkSession spark = createSystemDSSparkSession("MLContextMultipleScriptsTest", "local");
	MLContext ml = new MLContext(spark);
	ml.setExplain(true);

	String dml1 = baseDirectory + File.separator + "MultiScript1.dml";
	String dml2 = baseDirectory + File.separator + (wRead?"MultiScript2b.dml":"MultiScript2.dml");
	String dml3 = baseDirectory + File.separator + (wRead?"MultiScript3b.dml":"MultiScript3.dml");
	
	try
	{
		//run script 1
		Script script1 = dmlFromFile(dml1).in("$rows", rows).in("$cols", cols).out("X");
		Matrix X = ml.execute(script1).getMatrix("X");
		
		Script script2 = dmlFromFile(dml2).in("X", X).out("Y");
		Matrix Y = ml.execute(script2).getMatrix("Y");
		
		Script script3 = dmlFromFile(dml3).in("X", X).in("Y",Y).out("z");
		String z = ml.execute(script3).getString("z");
		
		System.out.println(z);
	}
	finally {
		DMLScript.setGlobalExecMode(oldplatform);
		
		// stop underlying spark context to allow single jvm tests (otherwise the
		// next test that tries to create a SparkContext would fail)
		spark.stop();
		// clear status mlcontext and spark exec context
		ml.close();
	}
}
 
Example #21
Source File: JavaIgniteDataFrameWriteExample.java    From ignite with Apache License 2.0
/** */
private static void editDataAndSaveToNewTable(Ignite ignite, SparkSession spark) {
    //Load content of Ignite table to data frame.
    Dataset<Row> personDataFrame = spark.read()
            .format(IgniteDataFrameSettings.FORMAT_IGNITE())
            .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG)
            .option(IgniteDataFrameSettings.OPTION_TABLE(), "person")
            .load();

    System.out.println("Data frame content:");

    //Printing content of data frame to console.
    personDataFrame.show();

    System.out.println("Modifying Data Frame and write it to Ignite:");

    personDataFrame
            .withColumn("id", col("id").plus(42)) //Edit id column
            .withColumn("name", reverse(col("name"))) //Edit name column
            .write().format(IgniteDataFrameSettings.FORMAT_IGNITE())
            .option(IgniteDataFrameSettings.OPTION_CONFIG_FILE(), CONFIG)
            .option(IgniteDataFrameSettings.OPTION_TABLE(), "new_persons")
            .option(IgniteDataFrameSettings.OPTION_CREATE_TABLE_PRIMARY_KEY_FIELDS(), "id, city_id")
            .option(IgniteDataFrameSettings.OPTION_CREATE_TABLE_PARAMETERS(), "backups=1")
            .mode(SaveMode.Overwrite) //Overwriting entire table.
            .save();

    System.out.println("Done!");

    System.out.println("Reading data from Ignite table:");

    CacheConfiguration<?, ?> ccfg = new CacheConfiguration<>(CACHE_NAME);

    IgniteCache<?, ?> cache = ignite.getOrCreateCache(ccfg);

    //Reading saved data from Ignite.
    List<List<?>> data = cache.query(new SqlFieldsQuery("SELECT id, name, city_id FROM new_persons")).getAll();

    System.out.println(data);
}
 
Example #22
Source File: JavaTC.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaTC")
    .getOrCreate();

  JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

  Integer slices = (args.length > 0) ? Integer.parseInt(args[0]): 2;
  JavaPairRDD<Integer, Integer> tc = jsc.parallelizePairs(generateGraph(), slices).cache();

  // Linear transitive closure: each round grows paths by one edge,
  // by joining the graph's edges with the already-discovered paths.
  // e.g. join the path (y, z) from the TC with the edge (x, y) from
  // the graph to obtain the path (x, z).

  // Because join() joins on keys, the edges are stored in reversed order.
  JavaPairRDD<Integer, Integer> edges = tc.mapToPair(
    new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
      @Override
      public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) {
        return new Tuple2<>(e._2(), e._1());
      }
  });

  long oldCount;
  long nextCount = tc.count();
  do {
    oldCount = nextCount;
    // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
    // then project the result to obtain the new (x, z) paths.
    tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache();
    nextCount = tc.count();
  } while (nextCount != oldCount);

  System.out.println("TC has " + tc.count() + " edges.");
  spark.stop();
}
 
Example #23
Source File: DrugBankDataset.java    From mmtf-spark with Apache License 2.0
/**
 * Reads CSV file into a Spark dataset
 * 
 * @param inputFileName path of the CSV file to read
 * @return dataset with the file contents
 * @throws IOException
 */
private static Dataset<Row> readCsv(String inputFileName) throws IOException {
    SparkSession spark = SparkSession.builder().getOrCreate();

    Dataset<Row> dataset = spark.read().format("csv").option("header", "true").option("inferSchema", "true")
            .load(inputFileName);

    return dataset;
}
 
Example #24
Source File: DefaultMetricCollector.java    From ExecDashboard with Apache License 2.0
public void collect(SparkSession sparkSession, JavaSparkContext javaSparkContext, List<?> objectList) {
    if ((sparkSession == null) || (javaSparkContext == null) || CollectionUtils.isEmpty(objectList)) { return; }

    if (objectList.get(0) instanceof Portfolio){
        collectPortFolioMetrics(sparkSession, javaSparkContext, (List<Portfolio>) objectList);
        return;
    }
    if (objectList.get(0) instanceof Lob){
        collectLobMetrics(sparkSession, javaSparkContext, (List<Lob>) objectList);
        return;
    }
}
 
Example #25
Source File: TextFileToDataset2.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Dataset from Text File")
      .master("local[*]")
      .getOrCreate();

  String filename = "data/simple-data-file.txt";
  Dataset<Row> df = spark.read().text(filename);
  df.show();
}
 
Example #26
Source File: DefaultDataCollector.java    From ExecDashboard with Apache License 2.0
DefaultDataCollector(String collectionName, String query, List<String> collectorItemIds, SparkSession sparkSession, JavaSparkContext javaSparkContext, PortfolioCollectorSetting portfolioCollectorSetting) {
    this.collectionName = collectionName;
    this.query = query;
    this.collectorItemIds = collectorItemIds;
    this.sparkSession = sparkSession;
    this.javaSparkContext = javaSparkContext;
    this.portfolioCollectorSetting = portfolioCollectorSetting;
}
 
Example #27
Source File: JavaLDAExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  // Creates a SparkSession
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaLDAExample")
    .getOrCreate();

  // $example on$
  // Loads data.
  Dataset<Row> dataset = spark.read().format("libsvm")
    .load("data/mllib/sample_lda_libsvm_data.txt");

  // Trains a LDA model.
  LDA lda = new LDA().setK(10).setMaxIter(10);
  LDAModel model = lda.fit(dataset);

  double ll = model.logLikelihood(dataset);
  double lp = model.logPerplexity(dataset);
  System.out.println("The lower bound on the log likelihood of the entire corpus: " + ll);
  System.out.println("The upper bound bound on perplexity: " + lp);

  // Describe topics.
  Dataset<Row> topics = model.describeTopics(3);
  System.out.println("The topics described by their top-weighted terms:");
  topics.show(false);

  // Shows the result.
  Dataset<Row> transformed = model.transform(dataset);
  transformed.show(false);
  // $example off$

  spark.stop();
}
 
Example #28
Source File: PdbMetadataDemo.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {
 SparkSession spark = SparkSession.builder().master("local[*]").appName(PdbMetadataDemo.class.getSimpleName())
            .getOrCreate();

 // query the following fields from the _citation category using PDBj's Mine2 web service:
 // journal_abbrev, pdbx_database_id_PubMed, year.   
 // Note, mixed case column names must be quoted and escaped with \".
 String sqlQuery = "SELECT pdbid, journal_abbrev, \"pdbx_database_id_PubMed\", year from citation WHERE id = 'primary'";
 Dataset<Row>ds = PdbjMineDataset.getDataset(sqlQuery);
 
 System.out.println("First 10 results from query: " + sqlQuery);
 ds.show(10, false);
  
 // filter out unpublished entries (they contain the word "published" in various upper/lower case combinations)
 ds = ds.filter("UPPER(journal_abbrev) NOT LIKE '%PUBLISHED%'");
 
 // print the top 10 journals
 System.out.println("Top 10 journals that publish PDB structures:");
 ds.groupBy("journal_abbrev").count().sort(col("count").desc()).show(10, false);
	
 // filter out entries without a PubMed Id (is -1 if PubMed Id is not available)
 ds = ds.filter("pdbx_database_id_PubMed > 0");
 System.out.println("Entries with PubMed Ids: " + ds.count());
 
 // show growth of papers in PubMed
 System.out.println("PubMed Ids per year: ");
 ds.groupBy("year").count().sort(col("year").desc()).show(10, false);

 spark.close();
}
 
Example #29
Source File: Bundles.java    From bunsen with Apache License 2.0
/**
 * Extracts the given resource type from the RDD of bundles and returns
 * it as a Dataset of that type.
 *
 * @param spark the spark session
 * @param bundles an RDD of FHIR Bundles
 * @param resourceClass the type of resource to extract.
 * @return a dataset of the given resource
 */
public Dataset<Row> extractEntry(SparkSession spark,
    JavaRDD<BundleContainer> bundles,
    Class resourceClass) {

  RuntimeResourceDefinition definition = FhirContexts.contextFor(fhirVersion)
      .getResourceDefinition(resourceClass);

  return extractEntry(spark, bundles, definition.getName());
}
 
Example #30
Source File: ConceptMaps.java    From bunsen with Apache License 2.0
/**
 * Returns an empty ConceptMaps instance.
 *
 * @param spark the spark session
 * @return an empty ConceptMaps instance.
 */
public static ConceptMaps getEmpty(SparkSession spark) {

  Dataset<ConceptMap> emptyConceptMaps = spark.emptyDataset(CONCEPT_MAP_ENCODER)
      .withColumn("timestamp", lit(null).cast("timestamp"))
      .as(CONCEPT_MAP_ENCODER);

  return new ConceptMaps(spark,
      spark.emptyDataset(URL_AND_VERSION_ENCODER),
      emptyConceptMaps,
      spark.emptyDataset(MAPPING_ENCODER));
}
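
A minimal usage sketch of the factory method above (nothing is assumed about the ConceptMaps API beyond the call shown):

SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
ConceptMaps emptyMaps = ConceptMaps.getEmpty(spark); // start from an empty container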