Java Code Examples for org.apache.spark.api.java.JavaSparkContext#close()

The following examples show how to use org.apache.spark.api.java.JavaSparkContext#close(). Each example lists the source file along with the open-source project and license it was taken from.
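
Before the examples, here is a minimal, self-contained sketch of the pattern they all share: create a JavaSparkContext, run a job, and call close() when done so the underlying SparkContext is stopped and its resources are released. The class name, app name, and sample data below are placeholders rather than code from any of the projects that follow. Because JavaSparkContext implements java.io.Closeable (close() delegates to stop()), calling it in a finally block, or using try-with-resources, ensures the context is closed even if the job fails.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class CloseExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("CloseExample");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // run a trivial job
            long count = sc.parallelize(Arrays.asList(1, 2, 3)).count();
            System.out.println("count: " + count);
        } finally {
            // always close the context; close() delegates to stop()
            sc.close();
        }
    }
}
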
Example 1
Source File: SparkWordCount.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws Exception {
	System.out.println(System.getProperty("hadoop.home.dir"));
	String inputPath = args[0];
	String outputPath = args[1];
	FileUtils.deleteQuietly(new File(outputPath));

	JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

	JavaRDD<String> rdd = sc.textFile(inputPath);

	JavaPairRDD<String, Integer> counts = rdd
			.flatMap(x -> Arrays.asList(x.split(" ")).iterator())
			.mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
			.reduceByKey((x, y) -> x + y);

	counts.saveAsTextFile(outputPath);
	sc.close();
}
 
Example 2
Source File: Union.java    From SparkDemo with MIT License
static void union(JavaSparkContext sc ) {
    List<String> datas1 = Arrays.asList("张三", "李四");
    List<String> datas2 = Arrays.asList("tom", "gim");

    JavaRDD<String> data1RDD = sc.parallelize(datas1);
    JavaRDD<String> data2RDD = sc.parallelize(datas2);

    /*
     * Merge the two RDDs without removing duplicates.
     * Both RDDs must contain elements of the same type.
     */
    JavaRDD<String> unionRDD = data1RDD
            .union(data2RDD);

    unionRDD.foreach(new VoidFunction<String>() {
		@Override
		public void call(String t) throws Exception {
			System.out.println(t);
		}
	});

    sc.close();
}
 
Example 3
Source File: FilterByRFree.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {

		String path = MmtfReader.getMmtfReducedPath();
	    
	    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByRFree.class.getSimpleName());
	    JavaSparkContext sc = new JavaSparkContext(conf);
		 
	    // here the methods are chained together
	    long count = MmtfReader
	    		.readSequenceFile(path, sc)
	    		.filter(new Resolution(0.0, 2.0))
	    		.count();
	    
	    System.out.println("# structures: " + count);
	    
	    sc.close();
	}
 
Example 4
Source File: PolyPeptideChainStatistics.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {

		SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(PolyPeptideChainStatistics.class.getSimpleName());
		JavaSparkContext sc = new JavaSparkContext(conf);

		JavaDoubleRDD chainLengths = MmtfReader
				.readReducedSequenceFile(sc) // read PDB from MMTF-Hadoop sequence file
				.flatMapToPair(new StructureToPolymerChains(false, true)) // split (flatmap) into unique polymer chains
				.filter(new PolymerComposition(PolymerComposition.AMINO_ACIDS_20)) // only consider chains that contain the 20 standard amino acids
				.mapToDouble(t -> t._2.getNumGroups()); // get the number of groups (residues) in each chain using a lambda expression

		System.out.println("Protein chains length statistics for proteins in the PDB with the 20 standard amino acids:");
		System.out.println(chainLengths.stats());

		sc.close();
	}
 
Example 5
Source File: ReadMmtfFull.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {  
    long start = System.nanoTime();
    
    // instantiate Spark. Each Spark application needs these two lines of code.
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(ReadMmtfFull.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
	 
    // read all PDB entries from a local Hadoop sequence file
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readFullSequenceFile(sc);

    System.out.println("# structures: " + pdb.count());
    
    // close Spark
    sc.close();
    
    long end = System.nanoTime();
    System.out.println((end-start)/1E9 + " sec."); 
}
 
Example 6
Source File: FilterExclusivelyByLProteins.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {

		String path = MmtfReader.getMmtfReducedPath();
	    
	    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterExclusivelyByLProteins.class.getSimpleName());
	    JavaSparkContext sc = new JavaSparkContext(conf);
	    		 
	    boolean exclusive = true;
	    
	    long count = MmtfReader
	    		.readSequenceFile(path, sc) // read MMTF hadoop sequence file
	    		// retain pdb entries that exclusively (flag set to true) contain L-protein chains
	    		.filter(new ContainsLProteinChain(exclusive)) 
	    		.count();
	    
	    System.out.println("# L-proteins: " + count);
	    sc.close();
	}
 
Example 7
Source File: SpecifyFormatLoadSave.java    From SparkDemo with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("SpecifyFormatLoadSave").setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Create a DataFrame by reading a JSON file
    SQLContext sqlContext = new SQLContext(sc);

    DataFrameReader dataFrameReader = sqlContext.read();

    // Parquet is the format used for local data storage
    Dataset<Row> dataset = dataFrameReader.format("json").load(Constant.LOCAL_FILE_PREX + "/data/resources/people.json");

    // Write and save the data through a DataFrameWriter
    DataFrameWriter write = dataset.select("name").write();
    write.format("parquet").save("tmp/people.parquet");

    sc.close();
}
 
Example 8
Source File: MapToBioJava.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {

	    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(MapToBioJava.class.getSimpleName());
	    JavaSparkContext sc = new JavaSparkContext(conf);
	    
	    long count = MmtfReader
	    		.readReducedSequenceFile(sc) // read MMTF-Hadoop sequence file
	    		.flatMapToPair(new StructureToPolymerChains())
	    		.mapValues(new StructureToBioJava())
	    		.count();
	    
	    System.out.println("Number of polymer chains: " + count);
	    
	    sc.close();
	}
 
Example 9
Source File: BuildDataFrameFromScratch2.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName(
      "Build a DataFrame from Scratch").master("local[*]")
      .getOrCreate();

  List<String[]> stringAsList = new ArrayList<>();
  stringAsList.add(new String[] { "bar1.1", "bar2.1" });
  stringAsList.add(new String[] { "bar1.2", "bar2.2" });

  JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());

  JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList).map((
      String[] row) -> RowFactory.create(row));

  // Creates schema
  StructType schema = DataTypes
      .createStructType(new StructField[] { DataTypes.createStructField(
          "foe1", DataTypes.StringType, false),
          DataTypes.createStructField("foe2", DataTypes.StringType, false) });

  Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();

  log.debug("** Schema: ");
  df.printSchema();

  log.debug("** Data: ");
  df.show();

  sparkContext.close();
}
 
Example 10
Source File: BuildDataFrameFromScratch.java    From net.jgp.labs.spark with Apache License 2.0
private void start() {
  SparkSession spark = SparkSession.builder().appName(
      "Build a DataFrame from Scratch").master("local[*]")
      .getOrCreate();

  List<String> stringAsList = new ArrayList<>();
  stringAsList.add("bar");

  JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());

  JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList).map((
      String row) -> RowFactory.create(row));

  // Creates schema
  StructType schema = DataTypes.createStructType(
      new StructField[] { DataTypes.createStructField("foe",
          DataTypes.StringType, false) });

  Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();

  log.debug("** Schema: ");
  df.printSchema();

  log.debug("** Data: ");
  df.show();

  sparkContext.close();
}
 
Example 11
Source File: MmcifToMmtfFull.java    From mmtf-spark with Apache License 2.0
/**
    * Converts a directory containing .cif files into an MMTF-Hadoop Sequence file.
    * The input directory is traversed recursively to find PDB files.
    * 
    * @param args args[0] <input-path-to-cif_files>, args[1] <output-path-to-mmtf-hadoop-file>
    * 
    * @throws FileNotFoundException
    */
public static void main(String[] args) throws FileNotFoundException {  
	
    if (args.length != 2) {
        System.out.println("Usage: MmcifToMmtfFull <input-path-to-cif_files> <output-path-to-mmtf-hadoop-file>");
        System.exit(1);
    }
    
    // path to input directory
    String cifPath = args[0];
    
    // path to output directory
    String mmtfPath = args[1];

    // instantiate Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("MmcifToMmtfFull");
    JavaSparkContext sc = new JavaSparkContext(conf);
	 
    // read cif files recursively starting from the specified top level directory
    JavaPairRDD<String, StructureDataInterface> structures = MmtfImporter.importMmcifFiles(cifPath, sc);
    
    // save as an MMTF-Hadoop Sequence File
    MmtfWriter.writeSequenceFile(mmtfPath, sc, structures);
    
    System.out.println(structures.count() + " structures written to: " + mmtfPath);
    
    // close Spark
    sc.close(); 
}
 
Example 12
Source File: DataFrameOperation.java    From SparkDemo with MIT License
public static void main(String[] args) {
	SparkConf conf = new SparkConf().setAppName("DataFrameOperation").setMaster("local");
	JavaSparkContext sc = new JavaSparkContext(conf);

	SQLContext sqlContext = new SQLContext(sc);

	// Read the data source into a DataFrame, which can be thought of as a table holding both data and schema information
	Dataset<Row> dataset = sqlContext.read().json(Constant.LOCAL_FILE_PREX +"/data/resources/people.json");

	// Print the table in a formatted way
	dataset.show();

	// Print the metadata (schema)
	dataset.printSchema();

	// Select columns and run computations on them
	dataset.select("name").show();
	dataset.select(dataset.col("name"), dataset.col("age").plus(1)).show();

	// Filter rows
	dataset.filter(dataset.col("age").gt(20)).show();

	// Group by a column and count
	dataset.groupBy("age").count().show();

	sc.close();
}
 
Example 13
Source File: WriteMmtfCustom.java    From mmtf-spark with Apache License 2.0
/**
 * @param args
 * @throws FileNotFoundException 
 */
public static void main(String[] args) throws FileNotFoundException {

	String path = MmtfReader.getMmtfFullPath();
    
    long start = System.nanoTime();
    
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WriteMmtfCustom.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
	 
    // read a 20% random sample of the PDB
    double fraction = 0.2;
    long seed = 123;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, fraction, seed, sc);

    // retain high resolution X-ray structures
    pdb = pdb
    		.filter(new ExperimentalMethods(ExperimentalMethods.X_RAY_DIFFRACTION))
    		.filter(new Resolution(0, 2.0))
    		.filter(new Rfree(0, 0.2));
   
    // coalesce this into 8 partitions to avoid creating many small files
    pdb = pdb.coalesce(8);
    
    // save this subset in a Hadoop Sequence file
    MmtfWriter.writeSequenceFile(path +"_xray", sc, pdb);
    
    System.out.println("# structures in custom set: " + pdb.count());
  
    long end = System.nanoTime();
    
    System.out.println("Time: " + (end-start)/1E9 + "sec.");
    
    sc.close();
}
 
Example 14
Source File: JDBCDataSource.java    From SparkDemo with MIT License
public static void main(String[] args) {
//		SparkConf conf = new SparkConf().setAppName("JDBCDataSource").setMaster("local");
		JavaSparkContext sc = SparkUtils.getRemoteSparkContext(JDBCDataSource.class);
		SQLContext sqlContext = new SQLContext(sc);

		Map<String, String> options = new HashMap<String, String>();
		options.put("url", "jdbc:mysql://192.168.2.129:3306/hive");
		options.put("dbtable", "t_user");
		options.put("user", "root");
		options.put("password", "666666");

		// Load the JDBC configuration; this does not connect to the database immediately
		Dataset<Row> dataset1 = sqlContext.read().format("jdbc").options(options).load();

		//		options.put("dbtable", "tb_item");
		//		DataFrame dataFrame2 = sqlContext.read().format("jdbc").options(options).load();

		// Read the JDBC table data
		dataset1.javaRDD().foreach(new VoidFunction<Row>() {
			@Override
			public void call(Row row) throws Exception {
				System.out.println(row);
			}
		});


		// Save the RDD data to MySQL
		saveToMysql( sqlContext, options);

		sc.close();
	}
 
Example 15
Source File: WildTypeQuery.java    From mmtf-spark with Apache License 2.0
/**
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

	SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WildTypeQuery.class.getSimpleName());
	JavaSparkContext sc = new JavaSparkContext(conf);

	boolean includeExpressionTags = true;
	int sequenceCoverage = 95;

	long count = MmtfReader.readReducedSequenceFile(sc)
			.filter(new WildType(includeExpressionTags, sequenceCoverage)).count();

	System.out.println(count);

	sc.close();
}
 
Example 16
Source File: SecondaryStructureElementDemo.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException {    
    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(CustomReportDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    
    List<String> pdbIds = Arrays.asList("1STP"); // single protein chain
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadReducedMmtfFiles(pdbIds, sc).cache();
    
    pdb = pdb
    		.flatMapToPair(new StructureToPolymerChains())
    		.filter(new ContainsLProteinChain());
    
    Dataset<Row> ds = SecondaryStructureElementExtractor.getDataset(pdb, "E", 6);

    // show the top 50 rows of this dataset
    ds.show(50, false);

    long end = System.nanoTime();
    
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end-start) + " sec.");
    
    sc.close();
}
 
Example 17
Source File: InteractionAnalysisAdvanced.java    From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

	String path = MmtfReader.getMmtfFullPath();
     
    long start = System.nanoTime();
    
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(InteractionAnalysisAdvanced.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    
    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);
   
    // get non-redundant subset
    pdb = pdb.filter(new Pisces(40, 2.5));
    
    // find Zinc interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ZN", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();
    
    // show the data schema of the dataset and some data
       interactions.printSchema();
       interactions.show(20);
       
       long n = interactions.count();
       System.out.println("# interactions: " + n);
       
       System.out.println("Top interacting groups");

       Dataset<Row> topGroups = interactions
       		.groupBy("residue2")
       		.count();
       
       topGroups
       .sort(col("count").desc()) // sort descending by count
       .show(10);
       
       System.out.println("Top interacting group/atoms types");

       Dataset<Row> topGroupsAndAtoms = interactions
       		.filter("element2 != 'C'") // exclude carbon interactions
       		.groupBy("residue2","atom2")
       		.count();

       topGroupsAndAtoms
       .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
       .filter("frequency > 0.01") // filter out occurrences < 1 %
       .sort(col("frequency").desc()) // sort descending
       .show(20);

       // print the top 10 interacting elements
       System.out.println("Top interacting elements");
       Dataset<Row> topElements = interactions
       		.filter("element2 != 'C'") // exclude carbon interactions
       		.groupBy("element2")
       		.count();
       
       topElements.withColumn("frequency", col("count").divide(n))
       .filter("frequency > 0.01") // filter out occurrences < 1 %
       .sort(col("frequency").desc()) // sort descending
       .show(10);

       interactions
       .groupBy("element2")
       .avg("distance")
       .sort("avg(distance)")
       .show(10);

       // Aggregate multiple statistics
       // Note: import static org.apache.spark.sql.functions.* required!
       // e.g. org.apache.spark.sql.functions.avg
       // for a list of all available functions
       interactions
       .groupBy("element2")
       .agg(count("distance"),avg("distance"),min("distance"),max("distance"),kurtosis("distance"))
       .show(10);
       
       long end = System.nanoTime();
    
    System.out.println("Time:     " + (end-start)/1E9 + "sec.");
    
    sc.close();
}
 
Example 18
Source File: FilterByReleaseDate.java    From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {

		String path = MmtfReader.getMmtfReducedPath();
	   
	    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByReleaseDate.class.getSimpleName());
	    JavaSparkContext sc = new JavaSparkContext(conf);
		 
	    long count = MmtfReader
	    		.readSequenceFile(path, sc)
	    		.filter(new ReleaseDate("2000-01-28","2017-02-28"))
	    		.count();
	    
	    System.out.println("Structures released between 2000-01-28 and 2017-02-28 " + count);
	    
	    sc.close();
	}
 
Example 19
Source File: SparkGraphXKickoff.java    From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException {

        System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

        final SparkConf conf = new SparkConf()
            .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
            .setAppName(APPLICATION_NAME);

        JavaSparkContext javaSparkContext = new JavaSparkContext(conf);        
        
        List<Tuple2<Object, String>> listOfVertex = new ArrayList<>();
        listOfVertex.add(new Tuple2<>(1L, "James"));
        listOfVertex.add(new Tuple2<>(2L, "Andy"));
        listOfVertex.add(new Tuple2<>(3L, "Ed"));
        listOfVertex.add(new Tuple2<>(4L, "Roger"));
        listOfVertex.add(new Tuple2<>(5L, "Tony"));

        List<Edge<String>> listOfEdge = new ArrayList<>();
        listOfEdge.add(new Edge<>(2, 1, "Friend"));
        listOfEdge.add(new Edge<>(3, 1, "Friend"));
        listOfEdge.add(new Edge<>(3, 2, "Colleague"));    
        listOfEdge.add(new Edge<>(3, 5, "Partner"));
        listOfEdge.add(new Edge<>(4, 3, "Boss"));        
        listOfEdge.add(new Edge<>(5, 2, "Partner"));       
    
        JavaRDD<Tuple2<Object, String>> vertexRDD = javaSparkContext.parallelize(listOfVertex);
        JavaRDD<Edge<String>> edgeRDD = javaSparkContext.parallelize(listOfEdge);

        ClassTag<String> stringTag = scala.reflect.ClassTag$.MODULE$.apply(String.class);
		
        Graph<String, String> graph = Graph.apply(
            vertexRDD.rdd(),
            edgeRDD.rdd(),
            "",
            StorageLevel.MEMORY_ONLY(),
            StorageLevel.MEMORY_ONLY(),
            stringTag,
            stringTag);

        //apply specific algorithms, such as PageRank

        graph.vertices()
            .saveAsTextFile(VERTICES_FOLDER_PATH);

        graph.edges()
            .saveAsTextFile(EDGES_FOLDER_PATH);

        javaSparkContext.close();
    }
 
Example 20
Source File: ReducedRedundancyLocatorExampleMain.java    From s3-inventory-usage-examples with Apache License 2.0
public static void main(String[] args) throws Exception{
    String srcBucketName;
    String scrBucketKey;
    String destBucketName;
    String destPrefix;
    ArgumentParser argumentParser = new ArgumentParser();
    AmazonS3 s3Client = new AmazonS3Client();

    try {
        BucketKey location = argumentParser.parseArguments(args);
        srcBucketName = location.getSrcBucket();
        scrBucketKey = location.getSrcKey();
        destBucketName = location.getDestBucket();
        destPrefix = location.getDestPrefix();
    } catch (ParseException e) {
        LOG.info(PARSE_ERROR_MSG);
        throw new IllegalArgumentException("Parser threw a ParseException", e);
    }

    // Obtain the original manifest files
    InventoryManifestRetriever inventoryManifestRetriever =
            new InventoryManifestRetriever(s3Client, srcBucketName, scrBucketKey);
    InventoryManifest manifest = inventoryManifestRetriever.getInventoryManifest();

    // Check if the inventory report includes the StorageClass column
    String fileSchema = manifest.getFileSchema();
    String filterColumn = "storageClass";
    if (!StringUtils.containsIgnoreCase(fileSchema, filterColumn)) {
        throw new StorageClassNotIncludedException();
    }

    //Create Spark Context
    SparkConf sparkConf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    Broadcast<CachedS3ClientFactory> clientFactory = sc.broadcast(new CachedS3ClientFactory());

    // Get the inventory report, split it into lines, parse each line to a POJO,
    // Filter, and write new csv file to S3
    JavaRDD<InventoryManifest.Locator> locatorRDD = sc.parallelize(manifest.getLocators());
    List<InventoryManifest.Locator> newLocatorList = locatorRDD
            .map(new InventoryReportLineRetriever(clientFactory, manifest))
            .flatMap(new InventoryReportMapper(manifest))
            .filter(new ReducedRedundancyStorageClassFilter())
            .mapPartitions(new WriteNewInventoryReportFunc(clientFactory, srcBucketName, manifest,
                    destBucketName, destPrefix))
            .collect();

    // Generate new manifest files including new locators, and send them back to S3
    new ManifestWriter(s3Client, destBucketName, destPrefix, srcBucketName, manifest)
            .writeManifest(newLocatorList);

    sc.close();
}