Java Code Examples for org.apache.spark.api.java.JavaSparkContext#parallelize()

The following examples show how to use org.apache.spark.api.java.JavaSparkContext#parallelize(). Each example is drawn from an open-source project; the source file, project, and license are noted above each snippet.
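Before the project examples, here is a minimal, self-contained sketch of the call itself; the class name ParallelizeSketch and the local[2] master URL are illustrative and not taken from any project below. parallelize distributes an in-memory java.util.List as a JavaRDD, and an optional second argument sets the number of partitions (slices).

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ParallelizeSketch {
  public static void main(String[] args) {
    // Local mode so the sketch can run without a cluster.
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("ParallelizeSketch");
    JavaSparkContext sc = new JavaSparkContext(conf);

    List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);

    // Distribute the local collection as an RDD split into 2 partitions.
    JavaRDD<Integer> rdd = sc.parallelize(data, 2);

    // A simple action to materialize the RDD.
    System.out.println("sum = " + rdd.reduce((a, b) -> a + b));

    sc.stop();
  }
}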
Example 1
Source File: JavaAssociationRulesExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf().setAppName("JavaAssociationRulesExample");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // $example on$
    JavaRDD<FreqItemset<String>> freqItemsets = sc.parallelize(Arrays.asList(
      new FreqItemset<String>(new String[] {"a"}, 15L),
      new FreqItemset<String>(new String[] {"b"}, 35L),
      new FreqItemset<String>(new String[] {"a", "b"}, 12L)
    ));

    AssociationRules arules = new AssociationRules()
      .setMinConfidence(0.8);
    JavaRDD<AssociationRules.Rule<String>> results = arules.run(freqItemsets);

    for (AssociationRules.Rule<String> rule : results.collect()) {
      System.out.println(
        rule.javaAntecedent() + " => " + rule.javaConsequent() + ", " + rule.confidence());
    }
    // $example off$

    sc.stop();
  }
 
Example 2
Source File: InterleaveMulti.java    From ViraPipe with MIT License
private static void splitFastq(FileStatus fst, String fqPath, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
  Path fqpath = new Path(fqPath);
  String fqname = fqpath.getName();
  String[] ns = fqname.split("\\.");
  // TODO: Also handle compressed files
  List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);

  JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);

  splitRDD.foreach( split ->  {

    FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), split);
    writeFastqFile(fqreader, new Configuration(), splitDir + "/split_" + split.getStart() + "." + ns[1]);

   });
}
 
Example 3
Source File: JavaKernelDensityEstimationExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // an RDD of sample data
    JavaRDD<Double> data = jsc.parallelize(
      Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0));

    // Construct the density estimator with the sample data
    // and a standard deviation for the Gaussian kernels
    KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0);

    // Find density estimates for the given values
    double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0});

    System.out.println(Arrays.toString(densities));
    // $example off$

    jsc.stop();
  }
 
Example 4
Source File: Union.java    From SparkDemo with MIT License
static void union(JavaSparkContext sc) {
    List<String> datas1 = Arrays.asList("张三", "李四");
    List<String> datas2 = Arrays.asList("tom", "gim");

    JavaRDD<String> data1RDD = sc.parallelize(datas1);
    JavaRDD<String> data2RDD = sc.parallelize(datas2);

    /**
     * Merge the two RDDs without removing duplicates (union does not deduplicate);
     * both RDDs must contain elements of the same type.
     */
    JavaRDD<String> unionRDD = data1RDD
            .union(data2RDD);

    unionRDD.foreach(new VoidFunction<String>() {
		@Override
		public void call(String t) throws Exception {
			System.out.println(t);
		}
	});

    sc.close();
}
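With Spark 2.x and Java 8 or later, the anonymous VoidFunction above can be replaced by a lambda. A minimal equivalent of the print loop, assuming the same unionRDD variable:

    unionRDD.foreach(t -> System.out.println(t));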
 
Example 5
Source File: ReadsSparkSourceUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Loads Reads using samReaderFactory, then calls ctx.parallelize.
 * @param bam file to load
 * @return RDD of (SAMRecord-backed) GATKReads from the file.
 */
public JavaRDD<GATKRead> getSerialReads(final JavaSparkContext ctx, final String bam, final GATKPath referencePath, final ValidationStringency validationStringency) {
    final SAMFileHeader readsHeader = new ReadsSparkSource(ctx, validationStringency).getHeader(new GATKPath(bam), referencePath);

    final SamReaderFactory samReaderFactory;
    if (referencePath != null) {
        samReaderFactory = SamReaderFactory.makeDefault().validationStringency(validationStringency).referenceSequence(referencePath.toPath());
    } else {
        samReaderFactory = SamReaderFactory.makeDefault().validationStringency(validationStringency);
    }

    ReadsDataSource bam2 = new ReadsPathDataSource(IOUtils.getPath(bam), samReaderFactory);
    List<GATKRead> records = Lists.newArrayList();
    for ( GATKRead read : bam2 ) {
        records.add(read);
    }
    return ctx.parallelize(records);
}
 
Example 6
Source File: JavaPrefixSpanExample.java    From SparkDemo with MIT License
public static void main(String[] args) {

    SparkConf sparkConf = new SparkConf().setAppName("JavaPrefixSpanExample");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // $example on$
    JavaRDD<List<List<Integer>>> sequences = sc.parallelize(Arrays.asList(
      Arrays.asList(Arrays.asList(1, 2), Arrays.asList(3)),
      Arrays.asList(Arrays.asList(1), Arrays.asList(3, 2), Arrays.asList(1, 2)),
      Arrays.asList(Arrays.asList(1, 2), Arrays.asList(5)),
      Arrays.asList(Arrays.asList(6))
    ), 2);
    PrefixSpan prefixSpan = new PrefixSpan()
      .setMinSupport(0.5)
      .setMaxPatternLength(5);
    PrefixSpanModel<Integer> model = prefixSpan.run(sequences);
    for (PrefixSpan.FreqSequence<Integer> freqSeq: model.freqSequences().toJavaRDD().collect()) {
      System.out.println(freqSeq.javaSequence() + ", " + freqSeq.freq());
    }
    // $example off$

    sc.stop();
  }
 
Example 7
Source File: SparkConverter.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 * Create a distributed matrix given an Apache Commons RealMatrix.
 *
 * @param sc Never {@code null}
 * @param realMat Apache Commons RealMatrix.  Never {@code null}
 * @return A distributed Spark matrix
 */
public static RowMatrix convertRealMatrixToSparkRowMatrix(JavaSparkContext sc, RealMatrix realMat, int numSlices) {
    logger.info("Converting matrix to distributed Spark matrix...");
    final double [][] dataArray = realMat.getData();
    final LinkedList<Vector> rowsList = new LinkedList<>();
    for (final double [] i : dataArray) {
        final Vector currentRow = Vectors.dense(i);
        rowsList.add(currentRow);
    }

    // We may want to swap out this static value for something dynamic (as shown below), but this seems to slow it down.
    // final int totalSpace = realMat.getColumnDimension() * realMat.getRowDimension() * Double.BYTES;
    // // Want the partitions to be ~100KB of space
    // final int slices = totalSpace/100000;
    final JavaRDD<Vector> rows = sc.parallelize(rowsList, numSlices);

    // Create a RowMatrix from JavaRDD<Vector>.
    final RowMatrix mat = new RowMatrix(rows.rdd());
    logger.info("Done converting matrix to distributed Spark matrix...");
    return mat;
}
 
Example 8
Source File: DependencyParser.java    From vn.vitk with GNU General Public License v3.0
/**
 * Evaluates a parser on a manually parsed corpus and prints the
 * token- and sentence-level attachment scores (UAS and LAS).
 * @param jsc the Java Spark context
 * @param graphs a list of manually parsed dependency graphs
 */
public void evaluate(JavaSparkContext jsc, List<DependencyGraph> graphs) {
	List<Sentence> sentences = new LinkedList<Sentence>();
	for (DependencyGraph graph : graphs) {
		sentences.add(graph.getSentence());
	}
	JavaRDD<Sentence> jrdd = jsc.parallelize(sentences);
	List<DependencyGraph> proposedGraphs = jrdd.map(new ParsingFunction()).collect();
	Evaluator evaluator = new Evaluator();
	evaluator.evaluate(graphs, proposedGraphs);
	System.out.println("UAS(token) = " + evaluator.getUASToken());
	System.out.println("UAS(sentence) = " + evaluator.getUASSentence());
	System.out.println("LAS(token) = " + evaluator.getLASToken());
	System.out.println("LAS(sentence) = " + evaluator.getLASSentence());
}
 
Example 9
Source File: PiComputeLambdaWithRddApp.java    From net.jgp.labs.spark with Apache License 2.0
/**
 * The processing code.
 */
private void start(int slices) {
  SparkSession spark = SparkSession
      .builder()
      .appName("JavaSparkPi")
      .getOrCreate();

  JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

  int n = 100000 * slices;
  List<Integer> l = new ArrayList<>(n);
  for (int i = 0; i < n; i++) {
    l.add(i);
  }

  JavaRDD<Integer> dataSet = jsc.parallelize(l, slices);

  int count = dataSet.map(integer -> {
    double x = Math.random() * 2 - 1;
    double y = Math.random() * 2 - 1;
    return (x * x + y * y <= 1) ? 1 : 0;
  }).reduce((integer, integer2) -> integer + integer2);

  System.out.println("Pi is roughly " + 4.0 * count / n);

  spark.stop();
}
 
Example 10
Source File: TestLineRecordReaderFunction.java    From deeplearning4j with Apache License 2.0
@Test
public void testLineRecordReader() throws Exception {

    File dataFile = new ClassPathResource("iris.dat").getFile();
    List<String> lines = FileUtils.readLines(dataFile);

    JavaSparkContext sc = getContext();
    JavaRDD<String> linesRdd = sc.parallelize(lines);

    CSVRecordReader rr = new CSVRecordReader(0, ',');

    JavaRDD<List<Writable>> out = linesRdd.map(new LineRecordReaderFunction(rr));
    List<List<Writable>> outList = out.collect();


    CSVRecordReader rr2 = new CSVRecordReader(0, ',');
    rr2.initialize(new FileSplit(dataFile));
    Set<List<Writable>> expectedSet = new HashSet<>();
    int totalCount = 0;
    while (rr2.hasNext()) {
        expectedSet.add(rr2.next());
        totalCount++;
    }

    assertEquals(totalCount, outList.size());

    for (List<Writable> line : outList) {
        assertTrue(expectedSet.contains(line));
    }
}
 
Example 11
Source File: TransformationRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Element transformation performed within each partition.
 * Purpose of the demo: compute squares. (The effect of partitioning is hard to see in a unit test.)
 *
 * @since hui_project 1.0.0
 */
public void testMapPartitions() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<Integer> parallelize = sparkContext.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 3);
    JavaRDD<Tuple2<Integer, Integer>> rdd = parallelize
            .mapPartitions(x -> getSquare(x));
    checkResult(rdd.collect());

}
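The getSquare helper referenced above is not part of this excerpt. Purely to illustrate its likely shape (assuming the Spark 2.x mapPartitions signature, where the supplied function returns an Iterator, and assuming the output pairs each value with its square), it could look like the sketch below; the project's real code may differ.

    // Hypothetical helper, not the project's actual code.
    // Assumes imports of java.util.ArrayList, java.util.Iterator, java.util.List and scala.Tuple2.
    private static Iterator<Tuple2<Integer, Integer>> getSquare(Iterator<Integer> it) {
        List<Tuple2<Integer, Integer>> result = new ArrayList<>();
        while (it.hasNext()) {
            int x = it.next();
            // Pair each element with its square.
            result.add(new Tuple2<>(x, x * x));
        }
        return result.iterator();
    }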
 
Example 12
Source File: TransformationRDD.java    From hui-bigdata-spark with Apache License 2.0
/**
 * Element transformation performed within each partition.
 * Purpose of the demo: compute squares. (The first argument is the partition index.)
 *
 * @since hui_project 1.0.0
 */
public void testMapPartitionsWithIndex() {
    SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("test");
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    JavaRDD<Integer> parallelize = sparkContext.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 3);
    JavaRDD<Tuple2<Integer, Integer>> rdd = parallelize.mapPartitionsWithIndex((x, y) -> getSquareWithIndex(x, y), false);
    checkResult(rdd.collect());
}
 
Example 13
Source File: TestLineRecordReaderFunction.java    From DataVec with Apache License 2.0
@Test
public void testLineRecordReader() throws Exception {

    File dataFile = new ClassPathResource("iris.dat").getFile();
    List<String> lines = FileUtils.readLines(dataFile);

    JavaSparkContext sc = getContext();
    JavaRDD<String> linesRdd = sc.parallelize(lines);

    CSVRecordReader rr = new CSVRecordReader(0, ',');

    JavaRDD<List<Writable>> out = linesRdd.map(new LineRecordReaderFunction(rr));
    List<List<Writable>> outList = out.collect();


    CSVRecordReader rr2 = new CSVRecordReader(0, ',');
    rr2.initialize(new FileSplit(dataFile));
    Set<List<Writable>> expectedSet = new HashSet<>();
    int totalCount = 0;
    while (rr2.hasNext()) {
        expectedSet.add(rr2.next());
        totalCount++;
    }

    assertEquals(totalCount, outList.size());

    for (List<Writable> line : outList) {
        assertTrue(expectedSet.contains(line));
    }
}
 
Example 14
Source File: CoveragePoNQCUtils.java    From gatk-protected with BSD 3-Clause "New" or "Revised" License
/**
 *  Split a read count collection into a separate read count collection for each sample.
 *
 * @param readCountCollection input read count collection, assumed to have one or more samples
 * @param ctx Use {@code null} if no Spark context is available. Result serialization is the main bottleneck, so it may be worthwhile to skip Spark when few cores are available.
 * @return never {@code null}
 */
@VisibleForTesting
static JavaRDD<ReadCountCollection> createParallelIndividualReadCountCollections(final ReadCountCollection readCountCollection, final JavaSparkContext ctx) {
    final List<String> sampleNames = readCountCollection.columnNames();
    Broadcast<ReadCountCollection> broadcastedReadCountCollection = ctx.broadcast(readCountCollection);
    JavaRDD<String> parallelSampleNames = ctx.parallelize(sampleNames, Math.max(sampleNames.size()/10, 4));
    return parallelSampleNames.map(s -> broadcastedReadCountCollection.value().subsetColumns(Collections.singleton(s)));
}
 
Example 15
Source File: MarkDuplicatesSparkUtilsUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Test(expectedExceptions = UserException.BadInput.class)
public void testHeaderMissingReadGroupFilds() {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();

    SAMRecordSetBuilder samRecordSetBuilder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.queryname,
            true, SAMRecordSetBuilder.DEFAULT_CHROMOSOME_LENGTH, SAMRecordSetBuilder.DEFAULT_DUPLICATE_SCORING_STRATEGY);

    JavaRDD<GATKRead> reads = ctx.parallelize(new ArrayList<>(), 2);
    SAMFileHeader header = samRecordSetBuilder.getHeader();
    header.setReadGroups(new ArrayList<>());

    MarkDuplicatesSparkUtils.transformToDuplicateNames(header, MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES, null, reads, 2, false).collect();
}
 
Example 16
Source File: TestCompareParameterAveragingSparkVsSingleMachine.java    From deeplearning4j with Apache License 2.0
@Test
public void testOneExecutorGraph() {
    //Idea: single worker/executor on Spark should give identical results to a single machine

    int miniBatchSize = 10;
    int nWorkers = 1;

    for (boolean saveUpdater : new boolean[] {true, false}) {
        JavaSparkContext sc = getContext(nWorkers);

        try {
            //Do training locally, for 3 minibatches
            int[] seeds = {1, 2, 3};

            ComputationGraph net = new ComputationGraph(getGraphConf(12345, new RmsProp(0.5)));
            net.init();
            INDArray initialParams = net.params().dup();

            for (int i = 0; i < seeds.length; i++) {
                DataSet ds = getOneDataSet(miniBatchSize, seeds[i]);
                if (!saveUpdater)
                    net.setUpdater(null);
                net.fit(ds);
            }
            INDArray finalParams = net.params().dup();

            //Do training on Spark with one executor, for 3 separate minibatches
            TrainingMaster tm = getTrainingMaster(1, miniBatchSize, saveUpdater);
            SparkComputationGraph sparkNet =
                            new SparkComputationGraph(sc, getGraphConf(12345, new RmsProp(0.5)), tm);
            sparkNet.setCollectTrainingStats(true);
            INDArray initialSparkParams = sparkNet.getNetwork().params().dup();

            for (int i = 0; i < seeds.length; i++) {
                List<DataSet> list = getOneDataSetAsIndividalExamples(miniBatchSize, seeds[i]);
                JavaRDD<DataSet> rdd = sc.parallelize(list);

                sparkNet.fit(rdd);
            }

            INDArray finalSparkParams = sparkNet.getNetwork().params().dup();

            assertEquals(initialParams, initialSparkParams);
            assertNotEquals(initialParams, finalParams);
            assertEquals(finalParams, finalSparkParams);
        } finally {
            sc.stop();
        }
    }
}
 
Example 17
Source File: PropertyGraphExampleFromEdges.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
		System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
		SparkConf conf = new SparkConf().setMaster("local").setAppName("graph");
		JavaSparkContext javaSparkContext = new JavaSparkContext(conf);
		ClassTag<String> stringTag = scala.reflect.ClassTag$.MODULE$.apply(String.class);

		List<Edge<String>> edges = new ArrayList<>();
		edges.add(new Edge<String>(1, 2, "Friend"));
		edges.add(new Edge<String>(2, 3, "Advisor"));
		edges.add(new Edge<String>(1, 3, "Friend"));
		edges.add(new Edge<String>(4, 3, "colleague"));
		edges.add(new Edge<String>(4, 5, "Relative"));
		edges.add(new Edge<String>(2, 5, "BusinessPartners"));

		JavaRDD<Edge<String>> edgeRDD = javaSparkContext.parallelize(edges);

		Graph<String, String> graph = Graph.fromEdges(edgeRDD.rdd(), "", StorageLevel.MEMORY_ONLY(), StorageLevel.MEMORY_ONLY(), stringTag, stringTag);

		graph.vertices().toJavaRDD().collect().forEach(System.out::println);

		// graph.aggregateMessages(sendMsg, mergeMsg, tripletFields, evidence$11)
	}
 
Example 18
Source File: BayesianNetworkSampler.java    From toolbox with Apache License 2.0
public DataSpark sampleToDataSpark(JavaSparkContext sc, int nSamples, int parallelism) {

            int localNSamples = nSamples/parallelism;

            JavaRDD<Integer> partitions = sc.parallelize(Arrays.asList(new Integer[parallelism]), parallelism);

            Function2 getPartitionSample = new Function2<Integer, Iterator<Integer>, Iterator<DataInstance>>(){
                @Override
                public Iterator<DataInstance> call(Integer ind, Iterator<Integer> iterator) throws Exception {
                    localSampler.setSeed(seed+ind);
                    return localSampler.sampleToDataStream(localNSamples).iterator();
                }
            };

            JavaRDD<DataInstance> sampleRDD = partitions.mapPartitionsWithIndex(getPartitionSample, false);

            // Get the attributes from a local instance
            Attributes attributes = this.localSampler.sampleToDataStream(1).getAttributes();

            return new DataSparkFromRDD(sampleRDD, attributes);

        }
 
Example 19
Source File: ReducedRedundancyLocatorExampleMain.java    From s3-inventory-usage-examples with Apache License 2.0
public static void main(String[] args) throws Exception{
    String srcBucketName;
    String scrBucketKey;
    String destBucketName;
    String destPrefix;
    ArgumentParser argumentParser = new ArgumentParser();
    AmazonS3 s3Client = new AmazonS3Client();

    try {
        BucketKey location = argumentParser.parseArguments(args);
        srcBucketName = location.getSrcBucket();
        scrBucketKey = location.getSrcKey();
        destBucketName = location.getDestBucket();
        destPrefix = location.getDestPrefix();
    } catch (ParseException e) {
        LOG.info(PARSE_ERROR_MSG);
        throw new IllegalArgumentException("Parser throw a parse Exception", e);
    }

    // Obtain the original manifest files
    InventoryManifestRetriever inventoryManifestRetriever =
            new InventoryManifestRetriever(s3Client, srcBucketName, scrBucketKey);
    InventoryManifest manifest = inventoryManifestRetriever.getInventoryManifest();

    // Check if the inventory report includes the StorageClass column
    String fileSchema = manifest.getFileSchema();
    String filterColumn = "storageClass";
    if (!StringUtils.containsIgnoreCase(fileSchema, filterColumn)) {
        throw new StorageClassNotIncludedException();
    }

    //Create Spark Context
    SparkConf sparkConf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    Broadcast<CachedS3ClientFactory> clientFactory = sc.broadcast(new CachedS3ClientFactory());

    // Get the inventory report, split it into lines, parse each line to a POJO,
    // filter, and write a new CSV file to S3
    JavaRDD<InventoryManifest.Locator> locatorRDD = sc.parallelize(manifest.getLocators());
    List<InventoryManifest.Locator> newLocatorList = locatorRDD
            .map(new InventoryReportLineRetriever(clientFactory, manifest))
            .flatMap(new InventoryReportMapper(manifest))
            .filter(new ReducedRedundancyStorageClassFilter())
            .mapPartitions(new WriteNewInventoryReportFunc(clientFactory, srcBucketName, manifest,
                    destBucketName, destPrefix))
            .collect();

    // Generate new manifest files including new locators, and send them back to S3
    new ManifestWriter(s3Client, destBucketName, destPrefix, srcBucketName, manifest)
            .writeManifest(newLocatorList);

    sc.close();
}
 
Example 20
Source File: FhirEncodersTest.java    From bunsen with Apache License 2.0
@Test
public void testFromRdd() {

  JavaSparkContext context = new JavaSparkContext(spark.sparkContext());

  JavaRDD<Condition> conditionRdd = context.parallelize(ImmutableList.of(condition));

  Dataset<Condition> ds = spark.createDataset(conditionRdd.rdd(),
      encoders.of(Condition.class));

  Condition convertedCondition = ds.head();

  Assert.assertEquals(condition.getId(),
      convertedCondition.getId());
}