Java Code Examples for org.apache.spark.api.java.JavaSparkContext.close()

The following Java code examples show how to use the close() method of the org.apache.spark.api.java.JavaSparkContext class. Calling close() stops the underlying SparkContext and releases its resources; because JavaSparkContext implements java.io.Closeable, it can also be managed with a try-with-resources statement.
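As a minimal sketch of that pattern (the master URL, app name, and sample data here are illustrative, not from any of the projects below), close() is invoked automatically when the try block exits:

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class CloseExample {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("CloseExample");
        // JavaSparkContext implements java.io.Closeable, so try-with-resources
        // calls close() even if the job throws.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            long count = sc.parallelize(Arrays.asList(1, 2, 3, 4)).count();
            System.out.println("count = " + count);
        }
    }
}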
Example 1
Project: Apache-Spark-2x-for-Java-Developers   File: SparkWordCount.java
A word count over a local text file; the context is closed after the counts are saved.
public static void main(String[] args) throws Exception {
	System.out.println(System.getProperty("hadoop.home.dir"));
	String inputPath = args[0];
	String outputPath = args[1];
	FileUtils.deleteQuietly(new File(outputPath));

	JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

	JavaRDD<String> rdd = sc.textFile(inputPath);

	JavaPairRDD<String, Integer> counts = rdd
			.flatMap(x -> Arrays.asList(x.split(" ")).iterator()) // split each line into words
			.mapToPair(x -> new Tuple2<>(x, 1))                   // pair each word with a count of 1
			.reduceByKey((x, y) -> x + y);                        // sum the counts per word

	counts.saveAsTextFile(outputPath);
	sc.close();
}
 
Example 2
Project: Java-Data-Science-Cookbook   File: ScalaTest.java
Counts empty lines in a cached RDD, then closes the context before printing the result.
public static void main( String[] args ){
	String inputFile = "data/dummy.txt";
	SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("My App");
	JavaSparkContext sparkContext = new JavaSparkContext(configuration);
	JavaRDD<String> logData = sparkContext.textFile(inputFile).cache();

	// Count the empty lines (length zero) with an anonymous Function.
	long numberA = logData.filter(new Function<String, Boolean>() {
		private static final long serialVersionUID = 1L;
		public Boolean call(String s){
			return s.length() == 0;
		}
	}).count();
	sparkContext.close();
	System.out.println("Empty Lines: " + numberA);
}
 
Example 3
Project: Apache-Spark-2x-for-Java-Developers   File: LFSExample.java
Reads files from the local file system via a wildcard path, then closes the context.
public static void main(String[] args) {
	System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
	SparkConf conf = new SparkConf().setMaster("local").setAppName("Local File System Example");

	JavaSparkContext jsc = new JavaSparkContext(conf);

	// Optional block-size tuning, commented out in the original source:
	// jsc.hadoopConfiguration().setLong("dfs.block.size", 20000);
	// jsc.hadoopConfiguration().setLong("fs.local.block.size", 20000);

	// The original source also carries commented-out variants that read a single
	// file, a comma-separated list of paths, and whole files via wholeTextFiles(),
	// each feeding the same word count saved with saveAsTextFile().

	// Read every file in the directory via a wildcard path.
	JavaRDD<String> localFile2 = jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\*");
	System.out.println(localFile2.getNumPartitions());

	jsc.close();
}
 
Example 4
Project: mutantpdb   File: App.java
Broadcasts a collected Dataset, runs an MMTF processing pipeline, and closes the context after saving the output.
public static void main( String[] args )
{
    Dataset<Row> mutations = DataProvider.getMutationsToStructures();
    List<String> pdbIds = mutations.select(col("pdbId"))
            .distinct().toJavaRDD().map(t -> t.getString(0)).collect();

    List<Row> broadcasted = mutations.select("pdbId", "chainId", "pdbAtomPos").collectAsList();
    SaprkUtils.stopSparkSession();

    JavaSparkContext sc = SaprkUtils.getSparkContext();
    Broadcast<List<Row>> bcmut = sc.broadcast(broadcasted);

    MmtfReader // alternative reader: .readSequenceFile("/pdb/2017/full", pdbIds, sc)
            .downloadMmtfFiles(Arrays.asList("5IRC"), sc)
            .flatMapToPair(new StructureToPolymerChains())
            .flatMapToPair(new AddResidueToKey(bcmut))
            .mapValues(new StructureToBioJava())
            .mapToPair(new FilterResidue())
            .filter(t -> t._2 != null).keys()
            .map(t -> t.replace(".", ","))
            .saveAsTextFile("/Users/yana/git/mutantpdb/src/main/resources/pdb_residues");
    sc.close();
}
 
Example 5
Project: s3-inventory-usage-examples   File: ReducedRedundancyLocatorExampleMain.java
Filters an S3 inventory report for reduced-redundancy objects and closes the context once the new manifest is written.
public static void main(String[] args) throws Exception{
    String srcBucketName;
    String srcBucketKey;
    String destBucketName;
    String destPrefix;
    ArgumentParser argumentParser = new ArgumentParser();
    AmazonS3 s3Client = new AmazonS3Client();

    try {
        BucketKey location = argumentParser.parseArguments(args);
        srcBucketName = location.getSrcBucket();
        srcBucketKey = location.getSrcKey();
        destBucketName = location.getDestBucket();
        destPrefix = location.getDestPrefix();
    } catch (ParseException e) {
        LOG.info(PARSE_ERROR_MSG);
        throw new IllegalArgumentException("Parser throw a parse Exception", e);
    }

    // Obtain the original manifest files
    InventoryManifestRetriever inventoryManifestRetriever =
            new InventoryManifestRetriever(s3Client, srcBucketName, srcBucketKey);
    InventoryManifest manifest = inventoryManifestRetriever.getInventoryManifest();

    // Check if the inventory report includes the StorageClass column
    String fileSchema = manifest.getFileSchema();
    String filterColumn = "storageClass";
    if (!StringUtils.containsIgnoreCase(fileSchema, filterColumn)) {
        throw new StorageClassNotIncludedException();
    }

    // Create the Spark context
    SparkConf sparkConf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    Broadcast<CachedS3ClientFactory> clientFactory = sc.broadcast(new CachedS3ClientFactory());

    // Get the inventory report, split it into lines, parse each line into a POJO,
    // filter it, and write a new CSV file to S3
    JavaRDD<InventoryManifest.Locator> locatorRDD = sc.parallelize(manifest.getLocators());
    List<InventoryManifest.Locator> newLocatorList = locatorRDD
            .map(new InventoryReportLineRetriever(clientFactory, manifest))
            .flatMap(new InventoryReportMapper(manifest))
            .filter(new ReducedRedundancyStorageClassFilter())
            .mapPartitions(new WriteNewInventoryReportFunc(clientFactory, srcBucketName, manifest,
                    destBucketName, destPrefix))
            .collect();

    // Generate new manifest files including new locators, and send them back to S3
    new ManifestWriter(s3Client, destBucketName, destPrefix, srcBucketName, manifest)
            .writeManifest(newLocatorList);

    sc.close();
}
 
Example 6
Project: hadoop-logfile-inputformat   File: Sample.java
Samples records from log files via a custom Hadoop InputFormat and closes the context after writing the sample.
public static void main(String[] args) throws IOException {
    boolean argumentsValid = parseArguments(args);

    if (!argumentsValid) {
        printHelp();
        System.exit(1);
    }

    final Configuration hadoopConfig = new Configuration(true);
    final FileSystem hdfs = FileSystem.get(hadoopConfig);

    if (hdfs.exists(outputPath)) {
        System.out.printf("output path '%s' already exists in HDFS!%n", outputPath);
        System.exit(1);
    }

    System.out.printf("reading from:     %s%n", inputPath);
    System.out.printf("writing to:       %s%n", outputPath);
    System.out.printf("pattern:          %s%n", pattern.pattern());
    System.out.printf("sample fraction:  %f%n", sampleFraction);
    System.out.printf("...%n");
    System.out.printf("%n");

    SparkConf sparkConfig = new SparkConf().setAppName(String.format("Reading sample (fraction %f) from '%s'", sampleFraction, inputPath));
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConfig);

    LogfileInputFormat.setPattern(hadoopConfig, pattern);

    JavaPairRDD<Tuple2<Path, Long>, Text> rdd = sparkContext.newAPIHadoopFile(
            inputPath.toString(),
            LogfileInputFormat.class,
            LogfileInputFormat.KEY_CLASS,
            Text.class,
            hadoopConfig);

    rdd.sample(false, sampleFraction)
            .map(tuple -> String.format("%[email protected]%016d:%n%n%s%n%n", tuple._1._1.toString(), tuple._1._2, tuple._2.toString()))
            .repartition(1)
            .saveAsTextFile(outputPath.toString());

    sparkContext.close();
}
 
Example 7
Project: hadoop-logfile-inputformat   File: Test.java
Compares record counts from plain and gzipped log files against expected totals, then closes the context.
public static void main(String[] args) throws IOException {

        final Configuration hadoopConfig = new Configuration(true);
        hdfs = FileSystem.get(hadoopConfig);

        if (!parseArguments(args)) {
            printHelp();
            System.exit(1);
        }

        if (hdfs.exists(directory)) {
            if (!hdfs.isDirectory(directory)) {
                System.out.printf("'%s' exists in HDFS, but is not a directory!%n", directory);
                System.exit(1);
            }
            FileStatus[] fileStatus = hdfs.listStatus(directory);
            if (fileStatus.length > 0) {
                System.out.printf("'%s' exists in HDFS, but is not empty!%n", directory);
                System.exit(1);
            }
        }

        createDirectories();

        System.out.printf("Creating test data in '%s'. This may take a while...%n", directory.toString());

        Map<String, LogfileType> logfileTypeByPath = new HashMap<>();

        LogfileSummary summary = writeLogFiles(logfileTypeByPath);

        SparkConf sparkConfig = new SparkConf().setAppName("Testing LogfileInputFormat.");
        JavaSparkContext sparkContext = new JavaSparkContext(sparkConfig);

        logfileTypeByPath.forEach((path, type) -> {
            LogfileInputFormat.setPattern(hadoopConfig, path, type.getFirstlinePattern());
        });
        LogfileInputFormat.setPattern(hadoopConfig, LogfileType.A.getFirstlinePattern());

        JavaPairRDD<Tuple2<Path, Long>, Text> rdd;
        JavaRDD<Tuple2<LocalDateTime, LogLevel>> logRecords;

        // First pass: read the uncompressed *.log files.
        rdd = sparkContext.newAPIHadoopFile(logDir + "/*" + FILE_EXT_LOG, LogfileInputFormat.class, LogfileInputFormat.KEY_CLASS, Text.class, hadoopConfig);

        Function<Tuple2<Tuple2<Path, Long>, Text>, Tuple2<LocalDateTime, LogLevel>> mappingFunction = mappingFunction(logfileTypeByPath);

        logRecords = rdd.map(mappingFunction).cache();
        long totalCountLog = logRecords.count();
        long infoCountLog = logRecords.filter(tuple -> tuple._2 == LogLevel.INFO).count();
        long warnCountLog = logRecords.filter(tuple -> tuple._2 == LogLevel.WARN).count();
        long errorCountLog = logRecords.filter(tuple -> tuple._2 == LogLevel.ERROR).count();

        // Second pass: read the gzip-compressed *.log.gz files; the counts must match.
        rdd = sparkContext.newAPIHadoopFile(logDirGz + "/*" + FILE_EXT_GZ, LogfileInputFormat.class, LogfileInputFormat.KEY_CLASS, Text.class, hadoopConfig);

        logRecords = rdd.map(mappingFunction).cache();
        long totalCountGz = logRecords.count();
        long infoCountGz = logRecords.filter(tuple -> tuple._2 == LogLevel.INFO).count();
        long warnCountGz = logRecords.filter(tuple -> tuple._2 == LogLevel.WARN).count();
        long errorCountGz = logRecords.filter(tuple -> tuple._2 == LogLevel.ERROR).count();

        long totalCountExpected = summary.getRecordCount();
        long infoCountExpected = summary.getRecordCount(LogLevel.INFO);
        long warnCountExpected = summary.getRecordCount(LogLevel.WARN);
        long errorCountExpected = summary.getRecordCount(LogLevel.ERROR);

        System.out.printf("%n%n%n%30s %15s %15s %15s %15s%n%n", "", "expected", "from *.log", "from *.log.gz", "test result");
        System.out.printf("%30s %15d %15d %15d %15s%n", "total # of log records",
                totalCountExpected, totalCountLog, totalCountGz,
                ((totalCountExpected == totalCountLog && totalCountLog == totalCountGz) ? "SUCCESS" : "FAILURE"));
        System.out.printf("%30s %15d %15d %15d %15s%n", "# of INFO level records",
                infoCountExpected, infoCountLog, infoCountGz,
                ((infoCountExpected == infoCountLog && infoCountLog == infoCountGz) ? "SUCCESS" : "FAILURE"));
        System.out.printf("%30s %15d %15d %15d %15s%n", "# of WARN level records",
                warnCountExpected, warnCountLog, warnCountGz,
                ((warnCountExpected == warnCountLog && warnCountLog == warnCountGz) ? "SUCCESS" : "FAILURE"));
        System.out.printf("%30s %15d %15d %15d %15s%n%n%n", "# of ERROR level records",
                errorCountExpected, errorCountLog, errorCountGz,
                ((errorCountExpected == errorCountLog && errorCountLog == errorCountGz) ? "SUCCESS" : "FAILURE"));

        sparkContext.close();
    }