Java Code Examples for org.apache.spark.api.java.JavaSparkContext.textFile()

The following are Java code examples showing how to use the textFile() method of the org.apache.spark.api.java.JavaSparkContext class. You can vote up the examples you like. Your votes will be used in our system to surface more good examples.
Example 1
Project: Apache-Spark-2x-for-Java-Developers   File: S3Example.java   Source Code and License Vote up 17 votes
/**
 * Reads {@code MOCK_DATA.csv} from the {@code trust} bucket through the {@code s3a} scheme,
 * counts how often each comma-separated token occurs and saves the counts to {@code out.txt}
 * in the same bucket.
 *
 * @param args ignored
 */
public static void main(String[] args) {
	System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
	SparkConf conf = new SparkConf().setMaster("local").setAppName("S3 Example");

	// Only report whether the key id is present; credential material should not be echoed
	// to stdout where it can end up in captured logs.
	System.out.println("AWS_ACCESS_KEY_ID set: " + (System.getenv("AWS_ACCESS_KEY_ID") != null));

	// try-with-resources stops the context even when the job throws.
	try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
		// Credentials could alternatively be set on the Hadoop configuration
		// (disabled in the original, and written for the s3n scheme):
		// jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "Your awsAccessKeyId");
		// jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "your awsSecretAccessKey");

		JavaRDD<String> textFile = jsc.textFile("s3a://" + "trust" + "/" + "MOCK_DATA.csv");

		textFile.flatMap(x -> Arrays.asList(x.split(",")).iterator())
				.mapToPair(x -> new Tuple2<String, Integer>(x, 1))
				.reduceByKey((x, y) -> x + y)
				.saveAsTextFile("s3a://" + "trust" + "/" + "out.txt");
	}
}
 
Example 2
Project: big-data-benchmark   File: SparkWordCount.java   Source Code and License Vote up 12 votes
/**
 * Counts the items produced by {@code LineIterator} for every line of the source file and
 * saves the counts to {@code <targetFile>_<startMillis>}, printing the elapsed time of the
 * save in milliseconds.
 *
 * @param args {@code <sourceFile> <targetFile>}; the process exits with status 1 when the
 *             argument count is not exactly two
 */
public static void main(String[] args) {
    if (args.length != 2) {
        System.err.println("Usage:");
        System.err.println("  SparkWordCount <sourceFile> <targetFile>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf()
            .setAppName("Word Count");

    // try-with-resources stops the context even when the job throws.
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        JavaRDD<String> textFile = sc.textFile(args[0]);
        JavaRDD<String> words = textFile.flatMap(LineIterator::new);
        JavaPairRDD<String, Long> pairs =
                words.mapToPair(s -> new Tuple2<>(s, 1L));
        JavaPairRDD<String, Long> counts =
                pairs.reduceByKey((Function2<Long, Long, Long>) (a, b) -> a + b);

        System.out.println("Starting task..");
        // The start timestamp doubles as a suffix that keeps output paths of repeated runs apart.
        long t = System.currentTimeMillis();
        counts.saveAsTextFile(args[1] + "_" + t);
        System.out.println("Time=" + (System.currentTimeMillis() - t));
    }
}
 
Example 3
Project: stonk   File: SparkHDFSTest.java   Source Code and License Vote up 8 votes
/**
 * Reads a text file from a fixed HDFS location with a three-thread local master and prints
 * every line to stdout.
 */
@Test
public void test() {
    String hdfsPath = "hdfs://10.196.83.90:9000/stonk/spark/aa/spark-task--aa-b5x59zpv/out3";

    SparkConf conf = new SparkConf().setAppName("111").setMaster("local[3]");

    // Close the context when the test ends (pass or fail) so it does not leak into
    // whatever runs next in the same JVM.
    try (JavaSparkContext context = new JavaSparkContext(conf)) {
        JavaRDD<String> rdd = context.textFile(hdfsPath);
        // Kept as a lambda: a bound method reference (System.out::println) would capture
        // the PrintStream instance, which is not serializable.
        rdd.foreach((str) -> System.out.println(str));
    }
}
 
Example 4
Project: Apache-Spark-2x-for-Java-Developers   File: WordCount.java   Source Code and License Vote up 7 votes
/**
 * Counts the occurrences of each space-separated word in a text file using a local Spark
 * context and saves the {@code (word, count)} pairs to the {@code output} path.
 *
 * @param filename path of the text file to read
 */
public static void wordCountJava8( String filename )
{
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Work Count App");

    // Create a Java version of the Spark Context from the configuration;
    // try-with-resources stops it even when the job throws.
    try ( JavaSparkContext sc = new JavaSparkContext(conf) )
    {
        // Load the input data, which is a text file read from the command line
        JavaRDD<String> input = sc.textFile( filename );

        // Split each line into words (flatMap expects an Iterator here)
        JavaRDD<String> words = input.flatMap( s -> Arrays.asList( s.split( " " ) ).iterator() );

        // Pair every word with 1; typed pairs avoid the raw Tuple2 and the (int) casts
        JavaPairRDD<String, Integer> ones = words.mapToPair( w -> new Tuple2<>( w, 1 ) );

        // Sum the ones per word
        JavaPairRDD<String, Integer> counts = ones.reduceByKey( (x, y) -> x + y );

        // Save the word count back out to a text file, causing evaluation.
        counts.saveAsTextFile( "output" );
    }
}
 
Example 5
Project: Apache-Spark-2x-for-Java-Developers   File: SparkWordCount.java   Source Code and License Vote up 7 votes
/**
 * Counts the occurrences of each space-separated word in the input file with a local Spark
 * context and saves the counts to the output path, deleting any previous output first.
 *
 * @param args {@code <inputPath> <outputPath>}
 * @throws IllegalArgumentException if fewer than two arguments are supplied
 * @throws Exception if the job fails
 */
public static void main(String[] args) throws Exception {
	if (args.length < 2) {
		throw new IllegalArgumentException(
				"Usage: SparkWordCount <inputPath> <outputPath> (got " + args.length + " argument(s))");
	}

	System.out.println(System.getProperty("hadoop.home.dir"));
	String inputPath = args[0];
	String outputPath = args[1];
	// Remove any previous output before the job writes to the same path.
	FileUtils.deleteQuietly(new File(outputPath));

	// try-with-resources closes the context even when the job throws.
	try (JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount")) {
		JavaRDD<String> rdd = sc.textFile(inputPath);

		JavaPairRDD<String, Integer> counts = rdd
				.flatMap(x -> Arrays.asList(x.split(" ")).iterator())
				.mapToPair(x -> new Tuple2<String, Integer>(x, 1))
				.reduceByKey((x, y) -> x + y);

		counts.saveAsTextFile(outputPath);
	}
}
 
Example 6
Project: ViraPipe   File: SplitFasta.java   Source Code and License Vote up 6 votes
/**
 * Reads the input with {@code >} as the record delimiter, puts the {@code >} prefix back in
 * front of every trimmed record, optionally repartitions, and saves the result as text.
 *
 * <p>Options: {@code -in <path>} (required), {@code -out <path>} (required),
 * {@code -partitions <n>} (optional). The process exits with status 1 when the command line
 * cannot be parsed or a required option is missing.
 *
 * @param args command-line arguments
 * @throws IOException declared by the original signature
 * @throws NumberFormatException if the {@code -partitions} value is not an integer
 */
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
    Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
    // "partitions" carries a value, so it must be declared with hasArg=true; without it
    // getOptionValue("partitions") is always null.
    options.addOption(  new Option( "partitions", true, "Divide or merge to n partitions" ) );
    options.addOption( pathOpt );
    options.addOption( opOpt );

    CommandLineParser parser = new BasicParser();
    CommandLine cmd;
    try {
        // parse the command line arguments
        cmd = parser.parse( options, args );
    }
    catch( ParseException exp ) {
        // Without a parsed command line there is nothing to run; stop instead of
        // dereferencing a null CommandLine below.
        System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
        System.exit(1);
        return;
    }

    // getOptionValue returns null for an option that was not supplied.
    String out = cmd.getOptionValue("out");
    String in = cmd.getOptionValue("in");
    String partitions = cmd.getOptionValue("partitions");

    if (in == null || out == null) {
        System.err.println( "Both -in and -out must be given." );
        System.exit(1);
        return;
    }

    // Parse before starting Spark so a malformed value fails fast.
    Integer numPartitions = (partitions == null) ? null : Integer.valueOf(partitions);

    SparkConf conf = new SparkConf().setAppName("SplitFasta");
    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
        sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

        JavaRDD<String> rdd = sc.textFile(in);
        // The delimiter is consumed while reading, so it is re-added to each record.
        JavaRDD<String> crdd = rdd.map(v->">"+v.trim());
        if (numPartitions != null) {
            crdd = crdd.repartition(numPartitions);
        }

        crdd.saveAsTextFile(out);
    } finally {
        // Stop the context even when the job throws.
        sc.stop();
    }
}
 
Example 7
Project: Apache-Spark-2x-for-Java-Developers   File: LFSExample.java   Source Code and License Vote up 5 votes
/**
 * Loads every file matching a wildcard path on the local file system into an RDD and prints
 * the number of partitions of that RDD.
 *
 * <p>The commented-out snippets are alternative path forms that were disabled in the
 * original example: a single file, a comma-separated list of files, a comma-separated list of
 * wildcards, and {@code wholeTextFiles}.
 *
 * @param args ignored
 */
public static void main(String[] args) {
	System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
	SparkConf conf = new SparkConf().setMaster("local").setAppName("Local File System Example");

	// try-with-resources closes the context even when reading the files throws.
	try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
		// Block-size overrides, disabled:
		// jsc.hadoopConfiguration().setLong("dfs.block.size",20000);
		// jsc.hadoopConfiguration().setLong("fs.local.block.size",20000);

		// Variant: a single file.
		// JavaRDD<String> localFile=jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\a.txt");
		// localFile.flatMap(x -> Arrays.asList(x.split(" ")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
		// .reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path");

		// Variant: a comma-separated list of files.
		// JavaRDD<String> localFile1 = jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\a.txt,C:\\Users\\sgulati\\Documents\\Result\\test\\b.txt");
		// System.out.println(localFile1.getNumPartitions());
		// localFile1.flatMap(x -> Arrays.asList(x.split(" ")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
		// .reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path1");

		// Active variant: a wildcard over one directory.
		JavaRDD<String> localFile2 = jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\*");
		System.out.println(localFile2.getNumPartitions());
		// localFile2.flatMap(x -> Arrays.asList(x.split(" ")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
		// .reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path2");

		// Variant: a comma-separated list of wildcards.
		// JavaRDD<String> localFile3 =jsc.textFile("C:\\Users\\sgulati\\Documents\\Result\\test\\*,C:\\Users\\sgulati\\Documents\\Result\\test5\\*");
		// localFile3.flatMap(x -> Arrays.asList(x.split(" ")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
		// .reduceByKey((x, y) -> x + y).saveAsTextFile("C:\\Users\\sgulati\\Documents\\Result\\out_path3");

		// Variant: whole files as (path, content) pairs.
		// JavaPairRDD<String, String> localFileWhole = jsc.wholeTextFiles("C:\\Users\\sgulati\\Documents\\Result\\test\\a.txt,C:\\Users\\sgulati\\Documents\\Result\\test\\b.txt");
		// System.out.println(localFileWhole.collect());
	}
}
 
Example 8
Project: metadata-qa-marc   File: ParallelValidator.java   Source Code and License Vote up 5 votes
/**
 * Validates MARC records in parallel: every line of the input file is read as one MARC
 * record, run through the validator, and the formatted validation errors of all records are
 * saved as text under the file name taken from the validator parameters.
 *
 * @param args command-line arguments handed to {@code Validator}
 * @throws ParseException if {@code Validator} rejects the arguments
 * @throws IllegalArgumentException if no input file is left after option parsing
 */
public static void main(String[] args) throws ParseException {

		final Validator validator = new Validator(args);
		ValidatorParameters params = validator.getParameters();
		validator.setDoPrintInProcessRecord(false);

		String[] inputFiles = params.getArgs();
		if (inputFiles.length == 0) {
			throw new IllegalArgumentException("No input file given");
		}

		// Join the array explicitly: concatenating a String[] would log its identity
		// ("[Ljava.lang.String;@...") instead of the file names.
		logger.info("Input file is " + String.join(", ", inputFiles));
		SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");

		// try-with-resources stops the context even when the job throws.
		try (JavaSparkContext context = new JavaSparkContext(conf)) {

			System.err.println(params.formatParameters());

			// Only the first remaining argument is used as input.
			JavaRDD<String> inputFile = context.textFile(inputFiles[0]);

			JavaRDD<String> baseCountsRDD = inputFile
				.flatMap(content -> {
					MarcReader reader = ReadMarc.getMarcStringReader(content);
					Record marc4jRecord = reader.next();
					MarcRecord marcRecord = MarcFactory.createFromMarc4j(
						marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
					validator.processRecord(marcRecord, 1);
					return ValidationErrorFormatter
						.formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
						.iterator();
				}
			);
			baseCountsRDD.saveAsTextFile(params.getFileName());
		}
	}
 
Example 9
Project: oryx2   File: ALSUpdate.java   Source Code and License Vote up 5 votes
/**
 * Loads a feature RDD from text files. Each line is parsed as a JSON list: element 0 becomes
 * the key (through {@code toString()}) and element 1 is converted to a {@code float[]}.
 *
 * @param sparkContext context used to read the text files
 * @param path location of the feature lines
 * @return pairs of key and feature vector, one per input line
 */
private static JavaPairRDD<String,float[]> readFeaturesRDD(JavaSparkContext sparkContext, Path path) {
  log.info("Loading features RDD from {}", path);
  return sparkContext.textFile(path.toString()).mapToPair(json -> {
    List<?> idAndVector = TextUtils.readJSON(json, List.class);
    String id = idAndVector.get(0).toString();
    float[] features = TextUtils.convertViaJSON(idAndVector.get(1), float[].class);
    return new Tuple2<>(id, features);
  });
}
 
Example 10
Project: ViraPipe   File: RenameContigsUniq.java   Source Code and License Vote up 4 votes
/**
 * Reads FASTA records (split on {@code >}), drops records with an empty header line, gives
 * every remaining record a unique id of the form {@code <firstHeaderToken>_<uuid>}, joins
 * its sequence lines into one line, and saves the result as text.
 *
 * <p>Options: {@code -in <path>} (required), {@code -out <path>} (required),
 * {@code -fa <extension>} (optional, read only {@code <in>/*.<extension>}),
 * {@code -partitions <n>} (optional). The process exits with status 1 when the command line
 * cannot be parsed or a required option is missing.
 *
 * @param args command-line arguments
 * @throws IOException declared by the original signature
 * @throws NumberFormatException if the {@code -partitions} value is not an integer
 */
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
    Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
    options.addOption(  new Option( "partitions", true,"Divide or merge to n partitions" ) );
    options.addOption(new Option( "fa", true, "Include only files with extension given " ));
    options.addOption( pathOpt );
    options.addOption( opOpt );

    CommandLineParser parser = new BasicParser();
    CommandLine cmd;
    try {
        cmd = parser.parse( options, args );
    }
    catch( ParseException exp ) {
        // Without a parsed command line there is nothing to run; stop instead of
        // dereferencing a null CommandLine below.
        System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
        System.exit(1);
        return;
    }

    // getOptionValue returns null for an option that was not supplied.
    String out = cmd.getOptionValue("out");
    String in = cmd.getOptionValue("in");
    String fastaonly = cmd.getOptionValue("fa");
    String partitions = cmd.getOptionValue("partitions");

    if (in == null || out == null) {
        System.err.println( "Both -in and -out must be given." );
        System.exit(1);
        return;
    }

    // Parse before starting Spark so a malformed value fails fast.
    Integer numPartitions = (partitions == null) ? null : Integer.valueOf(partitions);

    SparkConf conf = new SparkConf().setAppName("RenameContigsUniq");
    JavaSparkContext sc = new JavaSparkContext(conf);
    try {
        sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

        JavaRDD<String> rdd;
        if(fastaonly!=null)
            rdd = sc.textFile(in+"/*."+fastaonly);
        else
            rdd = sc.textFile(in); //take whole directory as input

        JavaRDD<String> crdd = rdd.filter(f -> f.trim().split("\n")[0].length()!=0).map(fasta->{

            String[] fseq = fasta.trim().split("\n");
            String id = fseq[0].split(" ")[0];

            //Give unique id for sequence
            String seq_id = id+"_"+UUID.randomUUID().toString();
            // Join the sequence lines directly; going through Arrays.toString and stripping
            // "[", "]" and ", " would also remove those characters from the data itself.
            String seq = String.join("", Arrays.copyOfRange(fseq, 1, fseq.length));

            return ">"+seq_id+"\n"+seq;
        });

        if(numPartitions!=null)
            crdd.repartition(numPartitions).saveAsTextFile(out);
        else
            crdd.saveAsTextFile(out);
    } finally {
        // Stop the context even when the job throws.
        sc.stop();
    }
}
 
Example 11
Project: ytk-learn   File: SparkTrainWorker.java   Source Code and License Vote up 4 votes
/**
 * Builds a {@code SparkTrainWorker} from ten positional arguments, loads the worker's
 * training data path as a text RDD and runs {@code sparkTrain} on it. Exits with status 0
 * after a successful run.
 *
 * @param args {@code modelName configPath configFile pyTransformScript needPyTransform
 *             loginName hostName hostPort slaveNum threadNum}
 * @throws IllegalArgumentException if fewer than ten arguments are supplied
 * @throws NumberFormatException if {@code hostPort}, {@code slaveNum} or {@code threadNum}
 *                               is not an integer
 * @throws IllegalStateException if {@code sparkTrain} reports failure
 * @throws Exception declared by the original signature
 */
public static void main(String []args) throws Exception {

        if (args.length < 10) {
            throw new IllegalArgumentException(
                    "Expected 10 arguments: <modelName> <configPath> <configFile> <pyTransformScript> "
                            + "<needPyTransform> <loginName> <hostName> <hostPort> <slaveNum> <threadNum>, got "
                            + args.length);
        }

        String modelName = args[0];
        String configPath = args[1];
        String configFile = args[2];
        String pyTransformScript = args[3];
        boolean needPyTransform = Boolean.parseBoolean(args[4]);
        String loginName = args[5];
        String hostName = args[6];
        int hostPort = Integer.parseInt(args[7]);
        int slaveNum = Integer.parseInt(args[8]);
        int threadNum = Integer.parseInt(args[9]);

        LOG.info("configFile:" + configFile);
        LOG.info("loginName:" + loginName);
        LOG.info("hostName:" + hostName + ", hostPort:" + hostPort);
        LOG.info("slaveNum:" + slaveNum + ", threadNum:" + threadNum);
        LOG.info("modelName:" + modelName);

        SparkConf conf = new SparkConf();
        // The worker receives the SparkConf before the context is created from it.
        SparkTrainWorker worker = new SparkTrainWorker(
                conf,
                modelName,
                configPath,
                configFile,
                pyTransformScript,
                needPyTransform,
                loginName,
                hostName,
                hostPort,
                slaveNum,
                threadNum);

        // try-with-resources stops the context on both the success and the failure path.
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            String trainDataPath = worker.getTrainDataPath();
            JavaRDD<String> trainRDD = sc.textFile(trainDataPath);
            LOG.info("trainDataPath:" + trainDataPath);

            if (!worker.sparkTrain(trainRDD)) {
                throw new IllegalStateException("spark train exception!");
            }
        }

        System.exit(0);
    }
 
Example 12
Project: Apache-Spark-2x-for-Java-Developers   File: HdfsExample.java   Source Code and License Vote up 3 votes
/**
 * Opens a file on HDFS as an RDD, passing 2 as the second argument of {@code textFile}, and
 * prints the resulting number of partitions.
 *
 * @param args ignored
 */
public static void main(String[] args) {

	// NOTE(review): the app name still reads "S3 Example" although this example targets
	// HDFS; it looks copied from the S3 example. Left unchanged here, please confirm.
	SparkConf conf = new SparkConf().setMaster("local").setAppName("S3 Example");

	// try-with-resources stops the context even when reading from HDFS throws.
	try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
		jsc.hadoopConfiguration().setLong("dfs.blocksize", 2);
		//jsc.hadoopConfiguration().setLong("fs.local.block.size",2);

		JavaRDD<String> hadoopRdd = jsc.textFile("hdfs://ch3lxesgdi02.corp.equinix.com:8020/user/gse/packt/ch01/test1", 2);

		System.out.println(hadoopRdd.getNumPartitions());
		//hadoopRdd.saveAsTextFile("hdfs://ch3lxesgdi02.corp.equinix.com:8020/user/gse/packt/ch01/testout");
	}
}