Java Code Examples for org.apache.spark.api.java.JavaSparkContext#textFile()

The following examples show how to use org.apache.spark.api.java.JavaSparkContext#textFile(). Each example notes its original project and license.
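Before the examples, a minimal sketch of the call itself may help (the "input.txt" path, the local master, and the app name are placeholders, not taken from any of the projects below): textFile() returns a JavaRDD<String> with one element per line of the file, and the optional second argument is a hint for the minimum number of partitions.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class TextFileSketch {
    public static void main(String[] args) {
        // Placeholder configuration: local master, arbitrary app name.
        SparkConf conf = new SparkConf().setMaster("local").setAppName("textFile sketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // One RDD element per line; "input.txt" is a placeholder path.
        JavaRDD<String> lines = sc.textFile("input.txt");

        // The overload with a second argument suggests a minimum number of partitions.
        JavaRDD<String> partitioned = sc.textFile("input.txt", 4);

        System.out.println(lines.count() + " lines, " + partitioned.getNumPartitions() + " partitions");
        sc.stop();
    }
}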
Example 1
Source File: WordCount.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void wordCountJava8( String filename )
{
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Word Count App");

    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile( filename );

    // Java 8 with lambdas: split the input string into words
    JavaRDD<String> words = input.flatMap( s -> Arrays.asList( s.split( " " ) ).iterator() );

    // Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them
    JavaPairRDD<String, Integer> counts = words.mapToPair( t -> new Tuple2<>( t, 1 ) ).reduceByKey( (x, y) -> x + y );

    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile( "output" );
}
 
Example 2
Source File: S3Example.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
		System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
		SparkConf conf =new SparkConf().setMaster("local").setAppName("S3 Example");
		JavaSparkContext jsc=new JavaSparkContext(conf);
		//jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", "Your awsAccessKeyId");
		//jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", "your awsSecretAccessKey");
		
		
		System.out.println(System.getenv("AWS_ACCESS_KEY_ID"));
		JavaRDD<String> textFile = jsc.textFile("s3a://"+"trust"+"/"+"MOCK_DATA.csv");
		
//		textFile.flatMap(x -> Arrays.asList(x.split(",")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
//		.reduceByKey((x, y) -> x + y).saveAsTextFile("s3n://"+"trust"+"/"+"out.txt");
		
		textFile.flatMap(x -> Arrays.asList(x.split(",")).iterator()).mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
		.reduceByKey((x, y) -> x + y).saveAsTextFile("s3a://"+"trust"+"/"+"out.txt");
	}
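Note that the commented-out lines above set s3n credential properties while the live paths use the s3a scheme. For a non-public bucket, a minimal sketch of supplying S3A credentials through the Hadoop configuration is shown below; fs.s3a.access.key and fs.s3a.secret.key are the standard Hadoop S3A property names, the values are placeholders, and the hadoop-aws module is assumed to be on the classpath.

// Sketch only: placeholder credential values; in practice prefer environment variables or IAM roles.
static void configureS3a(JavaSparkContext jsc) {
    jsc.hadoopConfiguration().set("fs.s3a.access.key", "YOUR_ACCESS_KEY");
    jsc.hadoopConfiguration().set("fs.s3a.secret.key", "YOUR_SECRET_KEY");
}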
 
Example 3
Source File: SparkWordCount.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) throws Exception {
	System.out.println(System.getProperty("hadoop.home.dir"));
	String inputPath = args[0];
	String outputPath = args[1];
	FileUtils.deleteQuietly(new File(outputPath));

	JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

	JavaRDD<String> rdd = sc.textFile(inputPath);

	JavaPairRDD<String, Integer> counts = rdd
			.flatMap(x -> Arrays.asList(x.split(" ")).iterator())
			.mapToPair(x -> new Tuple2<String, Integer>((String) x, 1))
			.reduceByKey((x, y) -> x + y);

	counts.saveAsTextFile(outputPath);
	sc.close();
}
 
Example 4
Source File: SplitFasta.java    From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
    Options options = new Options();
    Option pathOpt = new Option( "in", true, "Path to fastq file in hdfs." );
    Option opOpt = new Option( "out", true, "HDFS path for output files. If not present, the output files are not moved to HDFS." );
    options.addOption(  new Option( "partitions", "Divide or merge to n partitions" ) );
    options.addOption( pathOpt );
    options.addOption( opOpt );

    CommandLineParser parser = new BasicParser();
    CommandLine cmd = null;
    try {
        // parse the command line arguments
        cmd = parser.parse( options, args );

    }
    catch( ParseException exp ) {
        // parsing failed: report the problem and exit so cmd is never used while null
        System.err.println( "Parsing failed.  Reason: " + exp.getMessage() );
        System.exit(1);
    }

    String out = (cmd.hasOption("out")==true)? cmd.getOptionValue("out"):null;
    String in = (cmd.hasOption("in")==true)? cmd.getOptionValue("in"):null;
    String partitions = (cmd.hasOption("partitions")==true)? cmd.getOptionValue("partitions"):null;

    SparkConf conf = new SparkConf().setAppName("SplitFasta");
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.hadoopConfiguration().set("textinputformat.record.delimiter", ">");

    JavaRDD<String> rdd = sc.textFile(in);
    JavaRDD<String> crdd = rdd.map(v->">"+v.trim()).repartition(Integer.valueOf(partitions));

    crdd.saveAsTextFile(out);
    sc.stop();
}
 
Example 5
Source File: TransformationRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Element sampling.
 * With withReplacement = true, an element may be sampled more than once.
 *
 * @since hui_project 1.0.0
 */
public void testSample() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<String> textRDD = sparkContext.textFile(FILE_PATH);
    // with replacement: an element may be selected more than once
    JavaRDD<String> sample = textRDD
            .sample(true, 0.001, 100);
    checkResult(sample.collect());
}
 
Example 6
Source File: PersistenceRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Read a text file.
 *
 * @throws Exception
 */
public void testReadFile() throws Exception {

    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);

    JavaRDD<String> stringJavaRDD = sparkContext.textFile(FILE_PATH);

    List<String> collect = stringJavaRDD.collect();
    checkResult(collect);
}
 
Example 7
Source File: TransformationRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Aggregation.
 * Purpose of the demo: count how many times each subway station name appears.
 *
 * @since hui_project 1.0.0
 */
public void testReduceByKey() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<String> textRDD = sparkContext.textFile(FILE_PATH);
    JavaPairRDD<String, Integer> rdd = textRDD
            .map(x -> Arrays.asList(x.split(",")).get(0))
            .mapToPair(x -> new Tuple2<>(x, 1))
            .reduceByKey((x, y) -> x + y);
    checkResult(rdd.collect());

}
 
Example 8
Source File: TransformationRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Element transformation: one input element to many output elements.
 * Purpose of the demo: split each subway record into its fields: 1. departure station, 2. destination station, 3. number of stops, 4. distance.
 *
 * @since hui_project 1.0.0
 */
public void testFlatMap() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<String> textRDD = sparkContext.textFile(FILE_PATH);
    JavaRDD<String> splitRDD = textRDD
            .flatMap(x -> Arrays.asList(x.split(",")).iterator());
    checkResult(splitRDD.collect());
}
 
Example 9
Source File: JavaALS.java    From SparkDemo with MIT License
public static void main(String[] args) {

    if (args.length < 4) {
      System.err.println(
        "Usage: JavaALS <ratings_file> <rank> <iterations> <output_dir> [<blocks>]");
      System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("JavaALS");
    int rank = Integer.parseInt(args[1]);
    int iterations = Integer.parseInt(args[2]);
    String outputDir = args[3];
    int blocks = -1;
    if (args.length == 5) {
      blocks = Integer.parseInt(args[4]);
    }

    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = sc.textFile(args[0]);

    JavaRDD<Rating> ratings = lines.map(new ParseRating());

    MatrixFactorizationModel model = ALS.train(ratings.rdd(), rank, iterations, 0.01, blocks);

    model.userFeatures().toJavaRDD().map(new FeaturesToString()).saveAsTextFile(
        outputDir + "/userFeatures");
    model.productFeatures().toJavaRDD().map(new FeaturesToString()).saveAsTextFile(
        outputDir + "/productFeatures");
    System.out.println("Final user/product features written to " + outputDir);

    sc.stop();
  }
 
Example 10
Source File: ActionRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Iterate over every element.
 *
 * @since hui_project 1.0.0
 */
public void testForEach(){
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<String> stringJavaRDD = sparkContext.textFile(FILE_PATH);
    stringJavaRDD.foreach(x->{
        System.out.println(x);
    });
}
 
Example 11
Source File: ActionRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Take the first element.
 *
 * @since hui_project 1.0.0
 */
public void testFirst() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<String> stringJavaRDD = sparkContext.textFile(FILE_PATH);
    String first = stringJavaRDD.first();
    System.out.println(first);
}
 
Example 12
Source File: ParallelValidator.java    From metadata-qa-marc with GNU General Public License v3.0
public static void main(String[] args) throws ParseException {

    final Validator validator = new Validator(args);
    ValidatorParameters params = validator.getParameters();
    validator.setDoPrintInProcessRecord(false);

    logger.info("Input file is " + params.getDetailsFileName());
    SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");
    JavaSparkContext context = new JavaSparkContext(conf);

    System.err.println(validator.getParameters().formatParameters());

    JavaRDD<String> inputFile = context.textFile(validator.getParameters().getArgs()[0]);

    JavaRDD<String> baseCountsRDD = inputFile
      .flatMap(content -> {
        MarcReader reader = ReadMarc.getMarcStringReader(content);
        Record marc4jRecord = reader.next();
        MarcRecord marcRecord = MarcFactory.createFromMarc4j(
          marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
        validator.processRecord(marcRecord, 1);
        return ValidationErrorFormatter
          .formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
          .iterator();
      }
    );
    baseCountsRDD.saveAsTextFile(validator.getParameters().getDetailsFileName());
  }
 
Example 13
Source File: SparkSessionRollup.java    From aerospike-hadoop with Apache License 2.0
public static void main(String[] args) {
    com.aerospike.client.Log.setCallback(new AerospikeLogger());
    com.aerospike.client.Log.setLevel(com.aerospike.client.Log.Level.DEBUG);
    
    SparkConf conf = new SparkConf()
        .setAppName(appName)
        .set("spark.executor.memory", "2g")
        .setMaster(master);
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.addJar("build/libs/spark_session_rollup.jar");

    JavaRDD<String> entries = sc.textFile("hdfs://localhost:54310/tmp/input");

    JavaPairRDD<Long, Iterable<Long>> userhits =
        entries.mapToPair(new ExtractHits()).groupByKey();

    JavaPairRDD<String, Session> sessions =
        userhits.flatMapToPair(new FindSessions());

    System.err.println(sessions.count());

    JobConf job = new JobConf();
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(Session.class);
    job.setOutputFormat(SessionOutputFormat.class);

    AerospikeConfigUtil.setOutputHost(job, "localhost");
    AerospikeConfigUtil.setOutputPort(job, 3000);
    AerospikeConfigUtil.setOutputNamespace(job, "test");
    AerospikeConfigUtil.setOutputSetName(job, "sessions3");

    sessions.saveAsHadoopDataset(job);
}
 
Example 14
Source File: PageRankSpark.java    From graphify with Apache License 2.0
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }
    SparkConf sparkConf = new SparkConf().setAppName("Graphify");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);


    JavaRDD<String> lines = ctx.textFile(args[0], 1);


    // Loads all URLs from the input file and initializes their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(s -> {
        String[] parts = SPACES.split(s);
        return new Tuple2<>(parts[0], parts[1]);
    }).distinct().groupByKey().cache();


    // Initializes the rank of every URL to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(rs -> 1.0);

    // Iteratively calculates and updates URL ranks using the PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                .flatMapToPair(s -> {
                    int urlCount = Iterables.size(s._1());
                    List<Tuple2<String, Double>> results = new ArrayList<>();
                    for (String n : s._1()) {
                        results.add(new Tuple2<>(n, s._2() / urlCount));
                    }
                    return results;
                });
        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(sum -> 0.15 + sum * 0.85);
    }

    // Collects all URL ranks and dumps them to the console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?,?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }
    ctx.stop();
}
 
Example 15
Source File: JavaLinearRegressionWithSGDExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("JavaLinearRegressionWithSGDExample");
  JavaSparkContext sc = new JavaSparkContext(conf);

  // $example on$
  // Load and parse the data
  String path = "data/mllib/ridge-data/lpsa.data";
  JavaRDD<String> data = sc.textFile(path);
  JavaRDD<LabeledPoint> parsedData = data.map(
    new Function<String, LabeledPoint>() {
      public LabeledPoint call(String line) {
        String[] parts = line.split(",");
        String[] features = parts[1].split(" ");
        double[] v = new double[features.length];
        for (int i = 0; i < features.length - 1; i++) {
          v[i] = Double.parseDouble(features[i]);
        }
        return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v));
      }
    }
  );
  parsedData.cache();

  // Building the model
  int numIterations = 100;
  double stepSize = 0.00000001;
  final LinearRegressionModel model =
    LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations, stepSize);

  // Evaluate model on training examples and compute training error
  JavaRDD<Tuple2<Double, Double>> valuesAndPreds = parsedData.map(
    new Function<LabeledPoint, Tuple2<Double, Double>>() {
      public Tuple2<Double, Double> call(LabeledPoint point) {
        double prediction = model.predict(point.features());
        return new Tuple2<>(prediction, point.label());
      }
    }
  );
  double MSE = new JavaDoubleRDD(valuesAndPreds.map(
    new Function<Tuple2<Double, Double>, Object>() {
      public Object call(Tuple2<Double, Double> pair) {
        return Math.pow(pair._1() - pair._2(), 2.0);
      }
    }
  ).rdd()).mean();
  System.out.println("training Mean Squared Error = " + MSE);

  // Save and load model
  model.save(sc.sc(), "target/tmp/javaLinearRegressionWithSGDModel");
  LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(),
    "target/tmp/javaLinearRegressionWithSGDModel");
  // $example off$

  sc.stop();
}
 
Example 16
Source File: BoxClient.java    From render with GNU General Public License v2.0
/**
 * Look for prior run data and clean-up potentially corrupted images from the most recent failed prior run.
 * Leave as much existing data as possible in place so that it does not need to be regenerated.
 *
 * @param  sparkContext  context for current run.
 *
 * @return true if prior data was found; otherwise false.
 */
private boolean cleanUpPriorRun(final JavaSparkContext sparkContext) {

    List<String> removedBoxPaths = new ArrayList<>();

    JavaRDD<String> priorRunBoxDataStringsRdd = null;

    final File levelZeroDirectory = new File(boxGenerator.getBaseBoxPath(), "0");
    if (levelZeroDirectory.exists() && boxDataParentDirectory.exists()) {

        final FilenameFilter numberedDirFilter = (dir, name) -> name.matches("^\\d++$");
        final File[] zDirectories = levelZeroDirectory.listFiles(numberedDirFilter);
        if ((zDirectories != null) && (zDirectories.length > 0)) {

            LOG.info("cleanUpPriorRun: found materialized data in {}", levelZeroDirectory);

            // at least one z directory exists, so look for and load partition data from the last run
            final List<File> partitionDirectories =
                    Arrays.asList(Objects.requireNonNull(boxDataParentDirectory.listFiles(File::isDirectory)));

            // reverse sort the list so that the last run is first
            partitionDirectories.sort((o1, o2) -> o2.getName().compareTo(o1.getName()));

            if (partitionDirectories.size() > 0) {
                final File latestPartitionDirectory = partitionDirectories.get(0);
                LOG.info("cleanUpPriorRun: found prior run partition directory {}", latestPartitionDirectory);
                priorRunBoxDataStringsRdd = sparkContext.textFile(latestPartitionDirectory.getAbsolutePath());
            }

        } else {
            LOG.warn("cleanUpPriorRun: skipping because no materialized data was found in {}",
                     levelZeroDirectory);
        }

    } else {
        LOG.warn("cleanUpPriorRun: skipping because {} and/or {} are missing",
                 levelZeroDirectory, boxDataParentDirectory);
    }

    if (priorRunBoxDataStringsRdd != null) {

        final String baseBoxPath = boxGenerator.getBaseBoxPath();
        final String pathSuffix = boxGenerator.getBoxPathSuffix();

        final JavaRDD<String> removedBoxPathsRdd = priorRunBoxDataStringsRdd.mapPartitions(

                (FlatMapFunction<Iterator<String>, String>) stringIterator -> {

                    final List<String> removedPaths = new ArrayList<>();

                    BoxData lastBoxData = null;
                    BoxData boxData;
                    File boxFile;
                    while (stringIterator.hasNext()) {
                        boxData = BoxData.fromString(stringIterator.next());
                        boxFile = boxData.getAbsoluteLevelFile(baseBoxPath, pathSuffix);
                        if (boxFile.exists()) {
                            lastBoxData = boxData;
                        } else {
                            break;
                        }
                    }

                    if (lastBoxData != null) {
                        removeBoxFileAndParentFiles(lastBoxData,
                                                    baseBoxPath,
                                                    pathSuffix,
                                                    removedPaths,
                                                    parameters.box.maxLevel);
                    }

                    return removedPaths.iterator();
                }
        );

        removedBoxPaths = new ArrayList<>(removedBoxPathsRdd.collect());
        Collections.sort(removedBoxPaths);

        LOG.info(""); // empty statement adds newline to lengthy unterminated stage progress lines in log
        LOG.info("cleanUpPriorRun: removed {} box images: {}", removedBoxPaths.size(), removedBoxPaths);
    }

    return (removedBoxPaths.size() > 0);
}
 
Example 17
Source File: RDD2DataFrameReflection.java    From SparkDemo with MIT License
public static void main(String[] args) {
	JavaSparkContext sc = SparkUtils.getLocalSparkContext(RDD2DataFrameReflection.class);

	SQLContext sqlContext = new SQLContext(sc);

	JavaRDD<String> lineRDD = sc.textFile(Constant.LOCAL_FILE_PREX +"/data/resources/people.txt");

	JavaRDD<Row> rowsRDD = lineRDD.map(new Function<String, Row>() {

		@Override
		public Row call(String line) throws Exception {
			String[] lineSplited = line.split(",");

			return RowFactory.create(lineSplited[0], Integer.valueOf(lineSplited[1]));
		}
	});

	// Dynamically construct the metadata (schema) here.
	// If the columns are not known in advance, they need to be loaded from a database or a configuration file.
	List<StructField> fields = new ArrayList<StructField>();
	fields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
	fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));

	StructType schema = DataTypes.createStructType(fields);

	// Create a temporary table from the row data and the schema.
	// Since Spark 2.0, DataFrame and Dataset are unified into Dataset, which has two API flavors:
	// 1. untyped: Dataset[Row] is a collection of generic Row objects; its alias is DataFrame;
	// 2. strongly typed: Dataset[T] is a collection of specific objects, such as classes defined in Scala or Java.
	Dataset<Row> dataset = sqlContext.createDataFrame(rowsRDD, schema);
	dataset.registerTempTable("person");

	Dataset<Row> personDataSet = sqlContext.sql("select * from person");

	List<Row> list = personDataSet.javaRDD().collect();

	// Print each row of the result
	for (Row r : list) {
		System.out.println(r);
	}

	sc.close();
}
 
Example 18
Source File: WordCountJava.java    From BigDataArchitect with Apache License 2.0
public static void main(String[] args) throws FileNotFoundException {

        SparkConf conf = new SparkConf();
        conf.setAppName("java-wordcount");
        conf.setMaster("local");

        JavaSparkContext jsc = new JavaSparkContext(conf);

        JavaRDD<String> fileRDD = jsc.textFile("bigdata-spark/data/testdata.txt");

        JavaRDD<String> words = fileRDD.flatMap(new FlatMapFunction<String, String>() {
            public Iterator<String> call(String line) throws Exception {
                return Arrays.asList(line.split(" ")).iterator();
            }
        });

        JavaPairRDD<String, Integer> pairWord = words.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });

        JavaPairRDD<String, Integer> res = pairWord.reduceByKey(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer oldV, Integer v) throws Exception {
                return oldV + v;
            }
        });

        res.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            public void call(Tuple2<String, Integer> value) throws Exception {
                System.out.println(value._1+"\t"+value._2);
            }
        });

    }
 
Example 19
Source File: SaprkFile.java    From sparkResearch with Apache License 2.0
public static void textFile(JavaSparkContext sparkContext) {

        // Read and write a text file ("url" is a placeholder path)
        JavaRDD<String> rdd = sparkContext.textFile("url");
        rdd.saveAsTextFile("url");
    }