Java Code Examples for org.apache.spark.api.java.JavaSparkContext#parallelize()

The following examples show how to use org.apache.spark.api.java.JavaSparkContext#parallelize(). Each example is drawn from an open-source project; the source file, project, and license are noted above each snippet.
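Before the project examples, here is a minimal, self-contained sketch of the call itself; the class name ParallelizeSketch and the local[2] master URL are illustrative and not taken from any project below. parallelize distributes an in-memory java.util.List as a JavaRDD, and an optional second argument sets the number of partitions (slices).

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ParallelizeSketch {
  public static void main(String[] args) {
    // Local mode so the sketch can run without a cluster.
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("ParallelizeSketch");
    JavaSparkContext sc = new JavaSparkContext(conf);

    List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);

    // Distribute the local collection as an RDD split into 2 partitions.
    JavaRDD<Integer> rdd = sc.parallelize(data, 2);

    // A simple action to materialize the RDD.
    System.out.println("sum = " + rdd.reduce((a, b) -> a + b));

    sc.stop();
  }
}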
Example 1
Source File: JavaAssociationRulesExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf().setAppName("JavaAssociationRulesExample");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // $example on$
    JavaRDD<FreqItemset<String>> freqItemsets = sc.parallelize(Arrays.asList(
      new FreqItemset<String>(new String[] {"a"}, 15L),
      new FreqItemset<String>(new String[] {"b"}, 35L),
      new FreqItemset<String>(new String[] {"a", "b"}, 12L)
    ));

    AssociationRules arules = new AssociationRules()
      .setMinConfidence(0.8);
    JavaRDD<AssociationRules.Rule<String>> results = arules.run(freqItemsets);

    for (AssociationRules.Rule<String> rule : results.collect()) {
      System.out.println(
        rule.javaAntecedent() + " => " + rule.javaConsequent() + ", " + rule.confidence());
    }
    // $example off$

    sc.stop();
  }
 
Example 2
Source File: InterleaveMulti.java    From ViraPipe with MIT License
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
  Path fqpath = new Path(fqPath);
  String fqname = fqpath.getName();
  String[] ns = fqname.split("\\.");
  // TODO: Also handle compressed files
  List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

  JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

  splitRDD.foreach( split ->  {

    FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
    writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);

   });
}
 
Example 3
Source File: JavaKernelDensityEstimationExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // an RDD of sample data
    JavaRDD<Double> data = jsc.parallelize(
      Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0));

    // Construct the density estimator with the sample data
    // and a standard deviation for the Gaussian kernels
    KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0);

    // Find density estimates for the given values
    double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0});

    System.out.println(Arrays.toString(densities));
    // $example off$

    jsc.stop();
  }
 
Example 4
Source File: Union.java    From SparkDemo with MIT License
static void union(JavaSparkContext sc) {
    List<String> datas1 = Arrays.asList("张三", "李四");
    List<String> datas2 = Arrays.asList("tom", "gim");

    JavaRDD<String> data1RDD = sc.parallelize(datas1);
    JavaRDD<String> data2RDD = sc.parallelize(datas2);

    /**
     * Merge the two RDDs without removing duplicates (union does not deduplicate);
     * both RDDs must contain elements of the same type.
     */
    JavaRDD<String> unionRDD = data1RDD
            .union(data2RDD);

    unionRDD.foreach(new VoidFunction<String>() {
		@Override
		public void call(String t) throws Exception {
			System.out.println(t);
		}
	});

    sc.close();
}
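With Spark 2.x and Java 8 or later, the anonymous VoidFunction above can be replaced by a lambda. A minimal equivalent of the print loop, assuming the same unionRDD variable:

    unionRDD.foreach(t -> System.out.println(t));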
 
Example 5
Source File: ReadsSparkSourceUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Loads Reads using samReaderFactory, then calls ctx.parallelize.
 * @param bam file to load
 * @return RDD of (SAMRecord-backed) GATKReads from the file.
 */
public JavaRDD<GATKRead> getSerialReads(final JavaSparkContext ctx, final String bam, final GATKPath referencePath, final ValidationStringency validationStringency) {
    final SAMFileHeader readsHeader = new ReadsSparkSource(ctx, validationStringency).getHeader(new GATKPath(bam), referencePath);

    final SamReaderFactory samReaderFactory;
    if (referencePath != null) {
        samReaderFactory = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(referencePath.toPath());
    } else {
        samReaderFactory = SamReaderFactory.makeDefault().validationStringency(validationStringency);
    }

    ReadsDataSource bam2 = new ReadsPathDataSource(IOUtils.getPath(bam), samReaderFactory);
    List<GATKRead> records = Lists.newArrayList();
    for ( GATKRead read : bam2 ) {
        records.add(read);
    }
    return ctx.parallelize(records);
}
 
Example 6
Source File: JavaPrefixSpanExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf().setAppName("JavaPrefixSpanExample");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // $example on$
    JavaRDD<List<List<Integer>>> sequences = sc.parallelize(Arrays.asList(
      Arrays.asList(Arrays.asList(1, 2), Arrays.asList(3)),
      Arrays.asList(Arrays.asList(1), Arrays.asList(3, 2), Arrays.asList(1, 2)),
      Arrays.asList(Arrays.asList(1, 2), Arrays.asList(5)),
      Arrays.asList(Arrays.asList(6))
    ), 2);
    PrefixSpan prefixSpan = new PrefixSpan()
      .setMinSupport(0.5)
      .setMaxPatternLength(5);
    PrefixSpanModel<Integer> model = prefixSpan.run(sequences);
    for (PrefixSpan.FreqSequence<Integer> freqSeq: model.freqSequences().toJavaRDD().collect()) {
      System.out.println(freqSeq.javaSequence() + ", " + freqSeq.freq());
    }
    // $example off$

    sc.stop();
  }
 
Example 7
Source File: SparkConverter.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * @param sc Never {@code null}
 * @param realMat Apache Commons RealMatrix.  Never {@code null}
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
    logger.info("Converting matrix to distributed Spark matrix...");
    final double [][] dataArray = realMat.getData();
    final LinkedList<Vector> rowsList = new LinkedList<>();
    for (final double [] i : dataArray) {
        final Vector currentRow = Vectors.dense(i);
        rowsList.add(currentRow);
    }

    // We may want to swap out this static value for something dynamic (as shown below), but this seems to slow it down.
    // final int totalSpace = realMat.getColumnDimension() * realMat.getRowDimension() * Double.BYTES;
    // // Want the partitions to be ~100KB of space
    // final int slices = totalSpace/100000;
    final JavaRDD<Vector> rows = sc.parallelize(rowsList, numSlices);

    // Create a RowMatrix from JavaRDD<Vector>.
    final RowMatrix mat = new RowMatrix(rows.rdd());
    logger.info("Done converting matrix to distributed Spark matrix...");
    return mat;
}
 
Example 8
Source File: DependencyParser.java    From vn.vitk with GNU General Public License v3.0
/**
 * Evaluates a parser on a manually parsed corpus and prints the
 * token- and sentence-level attachment scores (UAS and LAS).
 * @param jsc the Java Spark context
 * @param graphs a list of manually parsed dependency graphs
 */
public void evaluate(JavaSparkContext jsc, List<DependencyGraph> graphs) {
	List<Sentence> sentences = new LinkedList<Sentence>();
	for (DependencyGraph graph : graphs) {
		sentences.add(graph.getSentence());
	}
	JavaRDD<Sentence> jrdd = jsc.parallelize(sentences);
	List<DependencyGraph> proposedGraphs = jrdd.map(new ParsingFunction()).collect();
	Evaluator evaluator = new Evaluator();
	evaluator.evaluate(graphs, proposedGraphs);
	System.out.println("UAS(token) = " + evaluator.getUASToken());
	System.out.println("UAS(sentence) = " + evaluator.getUASSentence());
	System.out.println("LAS(token) = " + evaluator.getLASToken());
	System.out.println("LAS(sentence) = " + evaluator.getLASSentence());
}
 
Example 9
Source File: PiComputeLambdaWithRddApp.java    From net.jgp.labs.spark with Apache License 2.0
/**
 * The processing code.
 */
private void start(int slices) {
  SparkSession spark = SparkSession
      .builder()
      .appName("JavaSparkPi")
      .getOrCreate();

  JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

  int n = 100000 * slices;
  List<Integer> l = new ArrayList<>(n);
  for (int i = 0; i < n; i++) {
    l.add(i);
  }

  JavaRDD<Integer> dataSet = jsc.parallelize(l, slices);

  int count = dataSet.map(integer -> {
    double x = Math.random() * 2 - 1;
    double y = Math.random() * 2 - 1;
    return (x * x + y * y <= 1) ? 1 : 0;
  }).reduce((integer, integer2) -> integer + integer2);

  System.out.println("Pi is roughly " + 4.0 * count / n);

  spark.stop();
}
 
Example 10
Source File: TestLineRecordReaderFunction.java    From deeplearning4j with Apache License 2.0
@Test
public void testLineRecordReader() throws Exception {

    File dataFile = new ClassPathResource("iris.dat").getFile();
    List<String> lines = FileUtils.readLines(dataFile);

    JavaSparkContext sc = getContext();
    JavaRDD<String> linesRdd = sc.parallelize(lines);

    CSVRecordReader rr = new CSVRecordReader(0, ',');

    JavaRDD<List<Writable>> out = linesRdd.map(new LineRecordReaderFunction(rr));
    List<List<Writable>> outList = out.collect();


    CSVRecordReader rr2 = new CSVRecordReader(0, ',');
    rr2.initialize(new FileSplit(dataFile));
    Set<List<Writable>> expectedSet = new HashSet<>();
    int totalCount = 0;
    while (rr2.hasNext()) {
        expectedSet.add(rr2.next());
        totalCount++;
    }

    assertEquals(totalCount, outList.size());

    for (List<Writable> line : outList) {
        assertTrue(expectedSet.contains(line));
    }
}
 
Example 11
Source File: TransformationRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Element transformation performed within each partition.
 * Purpose of the demo: compute squares. (The effect of partitioning is hard to see in a unit test.)
 *
 * @since hui_project 1.0.0
 */
public void testMapPartitions() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<Integer> parallelize = sparkContext.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 3);
    JavaRDD<Tuple2<Integer, Integer>> rdd = parallelize
            .mapPartitions(x -> getSquare(x));
    checkResult(rdd.collect());

}
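The getSquare helper referenced above is not part of this excerpt. Purely to illustrate its likely shape (assuming the Spark 2.x mapPartitions signature, where the supplied function returns an Iterator, and assuming the output pairs each value with its square), it could look like the sketch below; the project's real code may differ.

    // Hypothetical helper, not the project's actual code.
    // Assumes imports of java.util.ArrayList, java.util.Iterator, java.util.List and scala.Tuple2.
    private static Iterator<Tuple2<Integer, Integer>> getSquare(Iterator<Integer> it) {
        List<Tuple2<Integer, Integer>> result = new ArrayList<>();
        while (it.hasNext()) {
            int x = it.next();
            // Pair each element with its square.
            result.add(new Tuple2<>(x, x * x));
        }
        return result.iterator();
    }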
 
Example 12
Source File: TransformationRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Element transformation performed within each partition.
 * Purpose of the demo: compute squares. (The first argument is the partition index.)
 *
 * @since hui_project 1.0.0
 */
public void testMapPartitionsWithIndex() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<Integer> parallelize = sparkContext.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 3);
    JavaRDD<Tuple2<Integer, Integer>> rdd = parallelize.mapPartitionsWithIndex((x, y) -> getSquareWithIndex(x, y), false);
    checkResult(rdd.collect());
}
 
Example 13
Source File: TestLineRecordReaderFunction.java    From DataVec with Apache License 2.0
@Test
public void testLineRecordReader() throws Exception {

    File dataFile = new ClassPathResource("iris.dat").getFile();
    List<String> lines = FileUtils.readLines(dataFile);

    JavaSparkContext sc = getContext();
    JavaRDD<String> linesRdd = sc.parallelize(lines);

    CSVRecordReader rr = new CSVRecordReader(0, ',');

    JavaRDD<List<Writable>> out = linesRdd.map(new LineRecordReaderFunction(rr));
    List<List<Writable>> outList = out.collect();


    CSVRecordReader rr2 = new CSVRecordReader(0, ',');
    rr2.initialize(new FileSplit(dataFile));
    Set<List<Writable>> expectedSet = new HashSet<>();
    int totalCount = 0;
    while (rr2.hasNext()) {
        expectedSet.add(rr2.next());
        totalCount++;
    }

    assertEquals(totalCount, outList.size());

    for (List<Writable> line : outList) {
        assertTrue(expectedSet.contains(line));
    }
}
 
Example 14
Source File: CoveragePoNQCUtils.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 *  Split a read count collection into a separate read count collection for each sample.
 *
 * @param readCountCollection input read count collection, assumed to have one or more samples
 * @param ctx Use {@code null} if no Spark context is available. Result serialization is the main bottleneck, so it may be worthwhile to skip Spark when few cores are available.
 * @return never {@code null}
 */
@VisibleForTesting
static JavaRDD<ReadCountCollection> createParallelIndividualReadCountCollections(final ReadCountCollection readCountCollection, final JavaSparkContext ctx) {
    final List<String> sampleNames = readCountCollection.columnNames();
    Broadcast<ReadCountCollection> broadcastedReadCountCollection = ctx.broadcast(readCountCollection);
    JavaRDD<String> parallelSampleNames = ctx.parallelize(sampleNames, Math.max(sampleNames.size()/10, 4));
    return parallelSampleNames.map(s -> broadcastedReadCountCollection.value().subsetColumns(Collections.singleton(s)));
}
 
Example 15
Source File: MarkDuplicatesSparkUtilsUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Test(expectedExceptions = UserException.BadInput.class)
public void testHeaderMissingReadGroupFilds() {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();

    SAMRecordSetBuilder samRecordSetBuilder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.queryname,
            true, SAMRecordSetBuilder.DEFAULT_CHROMOSOME_LENGTH, SAMRecordSetBuilder.DEFAULT_DUPLICATE_SCORING_STRATEGY);

    JavaRDD<GATKRead> reads = ctx.parallelize(new ArrayList<>(), 2);
    SAMFileHeader header = samRecordSetBuilder.getHeader();
    header.setReadGroups(new ArrayList<>());

    MarkDuplicatesSparkUtils.transformToDuplicateNames(header, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES, null, reads, 2, false).collect();
}
 
Example 16
Source File: TestCompareParameterAveragingSparkVsSingleMachine.java    From deeplearning4j with Apache License 2.0
@Test
public void testOneExecutorGraph() {
    //Idea: single worker/executor on Spark should give identical results to a single machine

    int miniBatchSize = 10;
    int nWorkers = 1;

    for (boolean saveUpdater : new boolean[] {true, false}) {
        JavaSparkContext sc = getContext(nWorkers);

        try {
            //Do training locally, for 3 minibatches
            int[] seeds = {1, 2, 3};

            ComputationGraph net = new ComputationGraph(getGraphConf(12345, new RmsProp(0.5)));
            net.init();
            INDArray initialParams = net.params().dup();

            for (int i = 0; i < seeds.length; i++) {
                DataSet ds = getOneDataSet(miniBatchSize, seeds[i]);
                if (!saveUpdater)
                    net.setUpdater(null);
                net.fit(ds);
            }
            INDArray finalParams = net.params().dup();

            //Do training on Spark with one executor, for 3 separate minibatches
            TrainingMaster tm = getTrainingMaster(1, miniBatchSize, saveUpdater);
            SparkComputationGraph sparkNet =
                            new SparkComputationGraph(sc, getGraphConf(12345, new RmsProp(0.5)), tm);
            sparkNet.setCollectTrainingStats(true);
            INDArray initialSparkParams = sparkNet.getNetwork().params().dup();

            for (int i = 0; i < seeds.length; i++) {
                List<DataSet> list = getOneDataSetAsIndividalExamples(miniBatchSize, seeds[i]);
                JavaRDD<DataSet> rdd = sc.parallelize(list);

                sparkNet.fit(rdd);
            }

            INDArray finalSparkParams = sparkNet.getNetwork().params().dup();

            assertEquals(initialParams, initialSparkParams);
            assertNotEquals(initialParams, finalParams);
            assertEquals(finalParams, finalSparkParams);
        } finally {
            sc.stop();
        }
    }
}
 
Example 17
Source File: PropertyGraphExampleFromEdges.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
		System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
		SparkConf conf = new SparkConf().setMaster("local").setAppName("graph");
		JavaSparkContext javaSparkContext = new JavaSparkContext(conf);
		ClassTag<String> stringTag = scala.reflect.ClassTag$.MODULE$.apply(String.class);

		List<Edge<String>> edges = new ArrayList<>();
		edges.add(new Edge<String>(1, 2, "Friend"));
		edges.add(new Edge<String>(2, 3, "Advisor"));
		edges.add(new Edge<String>(1, 3, "Friend"));
		edges.add(new Edge<String>(4, 3, "colleague"));
		edges.add(new Edge<String>(4, 5, "Relative"));
		edges.add(new Edge<String>(2, 5, "BusinessPartners"));

		JavaRDD<Edge<String>> edgeRDD = javaSparkContext.parallelize(edges);

		Graph<String, String> graph = Graph.fromEdges(edgeRDD.rdd(), "", StorageLevel.MEMORY_ONLY(), StorageLevel.MEMORY_ONLY(), stringTag, stringTag);

		graph.vertices().toJavaRDD().collect().forEach(System.out::println);

		// graph.aggregateMessages(sendMsg, mergeMsg, tripletFields, evidence$11)
	}
 
Example 18
Source File: BayesianNetworkSampler.java    From toolbox with Apache License 2.0
public DataSpark sampleToDataSpark(JavaSparkContext sc, int nSamples, int parallelism) {

            int localNSamples = nSamples/parallelism;

            JavaRDD<Integer> partitions = sc.parallelize(Arrays.asList(new Integer[parallelism]), parallelism);

            Function2 getPartitionSample = new Function2<Integer, Iterator<Integer>, Iterator<DataInstance>>(){
                @Override
                public Iterator<DataInstance> call(Integer ind, Iterator<Integer> iterator) throws Exception {
                    localSampler.setSeed(seed+ind);
                    return localSampler.sampleToDataStream(localNSamples).iterator();
                }
            };

            JavaRDD<DataInstance> sampleRDD = partitions.mapPartitionsWithIndex(getPartitionSample, false);

            // Get the attributes from a local instance
            Attributes attributes = this.localSampler.sampleToDataStream(1).getAttributes();

            return new DataSparkFromRDD(sampleRDD, attributes);

        }
 
Example 19
Source File: ReducedRedundancyLocatorExampleMain.java    From s3-inventory-usage-examples with Apache License 2.0
public static void main(String[] args) throws Exception{
    String srcBucketName;
    String scrBucketKey;
    String destBucketName;
    String destPrefix;
    ArgumentParser argumentParser = new ArgumentParser();
    AmazonS3 s3Client = new AmazonS3Client();

    try {
        BucketKey location = argumentParser.parseArguments(args);
        srcBucketName = location.getSrcBucket();
        scrBucketKey = location.getSrcKey();
        destBucketName = location.getDestBucket();
        destPrefix = location.getDestPrefix();
    } catch (ParseException e) {
        LOG.info(PARSE_ERROR_MSG);
        throw new IllegalArgumentException("Parser throw a parse Exception", e);
    }

    // Obtain the original manifest files
    InventoryManifestRetriever inventoryManifestRetriever =
            new InventoryManifestRetriever(s3Client, srcBucketName, scrBucketKey);
    InventoryManifest manifest = inventoryManifestRetriever.getInventoryManifest();

    // Check if the inventory report includes the StorageClass column
    String fileSchema = manifest.getFileSchema();
    String filterColumn = "storageClass";
    if (!StringUtils.containsIgnoreCase(fileSchema, filterColumn)) {
        throw new StorageClassNotIncludedException();
    }

    //Create Spark Context
    SparkConf sparkConf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    Broadcast<CachedS3ClientFactory> clientFactory = sc.broadcast(new CachedS3ClientFactory());

    // Get the inventory report, split it into lines, parse each line to a POJO,
    // filter, and write a new CSV file to S3
    JavaRDD<InventoryManifest.Locator> locatorRDD = sc.parallelize(manifest.getLocators());
    List<InventoryManifest.Locator> newLocatorList = locatorRDD
            .map(new InventoryReportLineRetriever(clientFactory, manifest))
            .flatMap(new InventoryReportMapper(manifest))
            .filter(new ReducedRedundancyStorageClassFilter())
            .mapPartitions(new WriteNewInventoryReportFunc(clientFactory, srcBucketName, manifest,
                    destBucketName, destPrefix))
            .collect();

    // Generate new manifest files including new locators, and send them back to S3
    new ManifestWriter(s3Client, destBucketName, destPrefix, srcBucketName, manifest)
            .writeManifest(newLocatorList);

    sc.close();
}
 
Example 20
Source File: FhirEncodersTest.java    From bunsen with Apache License 2.0
@Test
public void testFromRdd() {

  JavaSparkContext context = new JavaSparkContext(spark.sparkContext());

  JavaRDD<Condition> conditionRdd = context.parallelize(ImmutableList.of(condition));

  Dataset<Condition> ds = spark.createDataset(conditionRdd.rdd(),
      encoders.of(Condition.class));

  Condition convertedCondition = ds.head();

  Assert.assertEquals(condition.getId(),
      convertedCondition.getId());
}