Java Code Examples for org.apache.spark.api.java.JavaSparkContext

The following examples show how to use org.apache.spark.api.java.JavaSparkContext. These examples are extracted from open source projects; you can go to the original project or source file by following the source information above each example.
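As a quick orientation before the project examples, the sketch below shows the basic JavaSparkContext lifecycle: build a SparkConf, create the context, run a simple RDD action, and stop the context when the job is done. The master URL, application name, and sample data are illustrative placeholders rather than code taken from any of the projects listed on this page.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class JavaSparkContextSketch {
    public static void main(String[] args) {
        // Configure and create the context ("local[*]" runs Spark in-process for this example)
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("JavaSparkContextSketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Parallelize a small in-memory collection and run a simple action
        JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
        long evens = numbers.filter(n -> n % 2 == 0).count();
        System.out.println("Even numbers: " + evens);

        // Stop the context so resources are released
        sc.stop();
    }
}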
Example 1
@Test
public void testPathExists() throws Exception {
    MiniClusterUtils.runOnIsolatedMiniCluster( cluster -> {
        //use the HDFS on the mini cluster
        final Path workingDirectory = MiniClusterUtils.getWorkingDir(cluster);
        final Path tempPath = new Path(workingDirectory, "testFileExists.txt");
        final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();

        Assert.assertFalse(SparkUtils.hadoopPathExists(ctx, tempPath.toUri()));
        final FileSystem fs = tempPath.getFileSystem(ctx.hadoopConfiguration());
        final FSDataOutputStream fsOutStream = fs.create(tempPath);
        fsOutStream.close();
        fs.deleteOnExit(tempPath);
        Assert.assertTrue(SparkUtils.hadoopPathExists(ctx, tempPath.toUri()));
    });
}
 
Example 2
Source Project: mmtf-spark   Source File: PdbRedoToMmtf.java    License: Apache License 2.0
public static void main(String[] args) {  
	
	if (args.length != 2) {
		System.err.println("Usage: " + PdbRedoToMmtf.class.getSimpleName() + " <pdb-redo-path> <mmtf-path");
		System.exit(1);
	}
    
	long start = System.nanoTime();
	
    // instantiate Spark
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(PdbRedoToMmtf.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
	 
    // import PDB-REDO from a local copy
    JavaPairRDD<String, StructureDataInterface> pdbredo = MmtfImporter.importPdbRedo(args[0], sc);

    // save PDB-REDO as an MMTF-Hadoop Sequence file
    MmtfWriter.writeSequenceFile(args[1], sc, pdbredo);
    
    long end = System.nanoTime();
    
    System.out.println("time: " + (end-start)/1E9 + " sec.");
    
    // close Spark
    sc.close();
}
 
Example 3
Source Project: gatk   Source File: PSFilterTest.java    License: BSD 3-Clause "New" or "Revised" License
@Test(groups = "spark")
public void testDoSetPairFlags() {

    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final SAMSequenceDictionary seq = new SAMSequenceDictionary();
    seq.addSequence(new SAMSequenceRecord("test_seq", 1000));
    final SAMFileHeader header = new SAMFileHeader(seq);

    final List<GATKRead> readList = makeReadSet(header);
    final JavaRDD<GATKRead> reads = ctx.parallelize(readList);

    final List<GATKRead> result = PSFilter.setPairFlags(reads, 100).collect();

    Assert.assertEquals(result.size(), 6);
    for (final GATKRead read : result) {
        if (read.getName().equals("paired_1") || read.getName().equals("paired_2")) {
            Assert.assertTrue(read.isPaired());
        } else {
            Assert.assertFalse(read.isPaired());
        }
    }

}
 
Example 4
Source Project: SparkDemo   Source File: JavaKernelDensityEstimationExample.java    License: MIT License
public static void main(String[] args) {

    SparkConf conf = new SparkConf().setAppName("JavaKernelDensityEstimationExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    // an RDD of sample data
    JavaRDD<Double> data = jsc.parallelize(
      Arrays.asList(1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0, 8.0, 9.0, 9.0));

    // Construct the density estimator with the sample data
    // and a standard deviation for the Gaussian kernels
    KernelDensity kd = new KernelDensity().setSample(data).setBandwidth(3.0);

    // Find density estimates for the given values
    double[] densities = kd.estimate(new double[]{-1.0, 2.0, 5.0});

    System.out.println(Arrays.toString(densities));
    // $example off$

    jsc.stop();
}
 
Example 5
public static void main(String[] args) throws Exception {
	System.out.println(System.getProperty("hadoop.home.dir"));
	String inputPath = args[0];
	String outputPath = args[1];
	FileUtils.deleteQuietly(new File(outputPath));

	JavaSparkContext sc = new JavaSparkContext("local", "sparkwordcount");

	JavaRDD<String> rdd = sc.textFile(inputPath);

	JavaPairRDD<String, Integer> counts = rdd
			.flatMap(x -> Arrays.asList(x.split(" ")).iterator())
			.mapToPair(x -> new Tuple2<>(x, 1))
			.reduceByKey((x, y) -> x + y);

	counts.saveAsTextFile(outputPath);
	sc.close();
}
 
Example 6
private void assertSingleShardedWritingWorks(String vcf, String outputPath, boolean writeTabixIndex) throws IOException {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();

    VariantsSparkSource variantsSparkSource = new VariantsSparkSource(ctx);
    JavaRDD<VariantContext> variants = variantsSparkSource.getParallelVariantContexts(vcf, null);
    if (variants.getNumPartitions() == 1) {
        variants = variants.repartition(3); // repartition to more than 1 partition
    }
    VCFHeader header = getHeader(vcf);

    VariantsSparkSink.writeVariants(ctx, outputPath, variants, header, writeTabixIndex);

    checkFileExtensionConsistentWithContents(outputPath, writeTabixIndex);

    JavaRDD<VariantContext> variants2 = variantsSparkSource.getParallelVariantContexts(outputPath, null);
    final List<VariantContext> writtenVariants = variants2.collect();

    VariantContextTestUtils.assertEqualVariants(readVariants(vcf), writtenVariants);
}
 
Example 7
public void run(String peerServiceTag) {
  long microsLower = day.toInstant().toEpochMilli() * 1000;
  long microsUpper = day.plus(Period.ofDays(1)).toInstant().toEpochMilli() * 1000 - 1;

  log.info("Running Dependencies job for {}: {} ≤ Span.timestamp {}", day, microsLower, microsUpper);
  JavaSparkContext sc = new JavaSparkContext(conf);
  try {
    JavaPairRDD<String, Iterable<Span>> traces = javaFunctions(sc)
        .cassandraTable(keyspace, "traces", mapRowTo(CassandraSpan.class))
        .where("start_time < ? AND start_time > ?", microsUpper, microsLower)
        .mapToPair(span -> new Tuple2<>(span.getTraceId(), span))
        .mapValues(span -> (Span) span)
        .groupByKey();

    List<Dependency> dependencyLinks = DependenciesSparkHelper.derive(traces,peerServiceTag);
    store(sc, dependencyLinks);
    log.info("Done, {} dependency objects created", dependencyLinks.size());
  } finally {
    sc.stop();
  }
}
 
Example 8
@Test
public void testIdentifySamplesWithSuspiciousContigsDelsWithSpark() {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final Set<String> gtBlacklistSamples = new HashSet<>();
    gtBlacklistSamples.add("sample_1");
    gtBlacklistSamples.add("sample_2");
    gtBlacklistSamples.add("sample_3");

    ReadCountCollection allCoverageProfiles = null;
    try {
        allCoverageProfiles = ReadCountCollectionUtils.parse(TEST_FILE_DEL);
    } catch (final IOException ioe) {
        Assert.fail("Could not load test file: " + TEST_FILE_DEL, ioe);
    }
    final JavaRDD<ReadCountCollection> allSampleTangentNormalizedReadCounts = CoveragePoNQCUtils.createParallelIndividualReadCountCollections(allCoverageProfiles, ctx);

    // By the time we are here, input is assumed to have been tangent normalized.
    final List<String> blacklistSamples = CoveragePoNQCUtils.identifySamplesWithSuspiciousContigs(allSampleTangentNormalizedReadCounts, ctx, CoveragePoNQCUtils.getContigToMedianCRMap(allCoverageProfiles));

    final Set<String> resultSamples = new HashSet<>(blacklistSamples);

    Assert.assertEquals(resultSamples.size(), gtBlacklistSamples.size());
    Assert.assertEquals(Sets.difference(resultSamples, gtBlacklistSamples).size(), 0);
}
 
Example 9
Source Project: Apache-Spark-2x-for-Java-Developers   Source File: WordCount.java    License: MIT License
public static void wordCountJava8( String filename )
{
    // Define a configuration to use to interact with Spark
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Work Count App");

    // Create a Java version of the Spark Context from the configuration
    JavaSparkContext sc = new JavaSparkContext(conf);

    // Load the input data, which is a text file read from the command line
    JavaRDD<String> input = sc.textFile( filename );

    // Java 8 with lambdas: split the input string into words
    JavaRDD<String> words = input.flatMap( s -> Arrays.asList( s.split( " " ) ).iterator() );

    // Java 8 with lambdas: transform the collection of words into pairs (word and 1) and then count them
    JavaPairRDD<String, Integer> counts = words.mapToPair( t -> new Tuple2<>( t, 1 ) ).reduceByKey( (x, y) -> x + y );

    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile( "output" );
}
 
Example 10
@Override
protected void runTool(final JavaSparkContext ctx) {

    // TODO: 5/9/18 getback sample name in output files
    final SAMFileHeader headerForReads = getHeaderForReads();
    final Set<VCFHeaderLine> defaultToolVCFHeaderLines = getDefaultToolVCFHeaderLines();
    final SvDiscoveryInputMetaData svDiscoveryInputMetaData =
            new SvDiscoveryInputMetaData(ctx, discoverStageArgs, nonCanonicalChromosomeNamesFile,
                    derivedSimpleVCFPrefix,
                    null, null, null, null,
                    headerForReads, getReference(), defaultToolVCFHeaderLines, localLogger);

    final JavaRDD<VariantContext> complexVariants = new VariantsSparkSource(ctx)
            .getParallelVariantContexts(complexVCF, getIntervals());
    final JavaRDD<GATKRead> assemblyRawAlignments = getReads();

    final SegmentedCpxVariantSimpleVariantExtractor.ExtractedSimpleVariants extract =
            SegmentedCpxVariantSimpleVariantExtractor.extract(complexVariants, svDiscoveryInputMetaData, assemblyRawAlignments);

    final String derivedOneSegmentSimpleVCF = derivedSimpleVCFPrefix + "_1_seg.vcf";
    final String derivedMultiSegmentSimpleVCF = derivedSimpleVCFPrefix + "_multi_seg.vcf";
    final VCFHeader vcfHeader = VariantsSparkSource.getHeader(complexVCF);
    SVVCFWriter.writeVCF(extract.getReInterpretZeroOrOneSegmentCalls(), derivedOneSegmentSimpleVCF, vcfHeader.getSequenceDictionary(), defaultToolVCFHeaderLines, logger);
    SVVCFWriter.writeVCF(extract.getReInterpretMultiSegmentsCalls(), derivedMultiSegmentSimpleVCF, vcfHeader.getSequenceDictionary(), defaultToolVCFHeaderLines, logger);
}
 
Example 11
Source Project: deeplearning4j   Source File: SparkUtils.java    License: Apache License 2.0
/**
 * List of the files in the given directory (path), as a {@code JavaRDD<String>}
 *
 * @param sc                Spark context
 * @param path              Path to list files in
 * @param recursive         Whether to walk the directory tree recursively (i.e., include subdirectories)
 * @param allowedExtensions If null: all files will be accepted. If non-null: only files with the specified extension will be allowed.
 *                          Exclude the extension separator - i.e., use "txt" not ".txt" here.
 * @param config            Hadoop configuration to use. Must not be null.
 * @return Paths in the directory
 * @throws IOException If error occurs getting directory contents
 */
public static JavaRDD<String> listPaths(@NonNull JavaSparkContext sc, String path, boolean recursive,
                                        Set<String> allowedExtensions, @NonNull Configuration config) throws IOException {
    List<String> paths = new ArrayList<>();
    FileSystem hdfs = FileSystem.get(URI.create(path), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(path), recursive);

    while (fileIter.hasNext()) {
        String filePath = fileIter.next().getPath().toString();
        if(allowedExtensions == null){
            paths.add(filePath);
        } else {
            String ext = FilenameUtils.getExtension(filePath);
            if(allowedExtensions.contains(ext)){
                paths.add(filePath);
            }
        }
    }
    return sc.parallelize(paths);
}
 
Example 12
Source Project: flink-perf   Source File: Grep.java    License: Apache License 2.0
public static void main(String[] args) {
	String master = args[0];
	String inFile = args[1];
	String outFile = args[2];

	String patterns[] = new String[args.length-3];
	System.arraycopy(args,3,patterns,0,args.length-3);
	System.err.println("Starting spark with master="+master+" in="+inFile);
	System.err.println("Using patterns: "+ Arrays.toString(patterns));

	SparkConf conf = new SparkConf().setAppName("Grep job").setMaster(master).set("spark.hadoop.validateOutputSpecs", "false");
	JavaSparkContext sc = new JavaSparkContext(conf);

	JavaRDD<String> file = sc.textFile(inFile);
	for(int p = 0; p < patterns.length; p++) {
		final String pattern = patterns[p];
		JavaRDD<String> res = file.filter(new Function<String, Boolean>() {
			private static final long serialVersionUID = 1L;
			Pattern p = Pattern.compile(pattern);

			@Override
			public Boolean call(String value) throws Exception {
				if (value == null || value.length() == 0) {
					return false;
				}
				final Matcher m = p.matcher(value);
				if (m.find()) {
					return true;
				}
				return false;
			}
		});
		res.saveAsTextFile(outFile+"_"+pattern);
	}
}
 
Example 13
Source Project: tinkerpop   Source File: SparkMemory.java    License: Apache License 2.0
protected void broadcastMemory(final JavaSparkContext sparkContext) {
    this.broadcast.destroy(true); // do we need to block?
    final Map<String, Object> toBroadcast = new HashMap<>();
    this.sparkMemory.forEach((key, object) -> {
        if (!object.value().isEmpty() && this.memoryComputeKeys.get(key).isBroadcast())
            toBroadcast.put(key, object.value());
    });
    this.broadcast = sparkContext.broadcast(toBroadcast);
}
 
Example 14
public static void main(String[] args) throws ParseException {

    final Validator validator = new Validator(args);
    ValidatorParameters params = validator.getParameters();
    validator.setDoPrintInProcessRecord(false);

    logger.info("Input file is " + params.getDetailsFileName());
    SparkConf conf = new SparkConf().setAppName("MarcCompletenessCount");
    JavaSparkContext context = new JavaSparkContext(conf);

    System.err.println(validator.getParameters().formatParameters());

    JavaRDD<String> inputFile = context.textFile(validator.getParameters().getArgs()[0]);

    JavaRDD<String> baseCountsRDD = inputFile
      .flatMap(content -> {
        MarcReader reader = ReadMarc.getMarcStringReader(content);
        Record marc4jRecord = reader.next();
        MarcRecord marcRecord = MarcFactory.createFromMarc4j(
          marc4jRecord, params.getDefaultRecordType(), params.getMarcVersion(), params.fixAlephseq());
        validator.processRecord(marcRecord, 1);
        return ValidationErrorFormatter
          .formatForSummary(marcRecord.getValidationErrors(), params.getFormat())
          .iterator();
      }
    );
    baseCountsRDD.saveAsTextFile(validator.getParameters().getDetailsFileName());
}
 
Example 15
/**
 * Closes all instances in all the VMs involved in the spark context provided.
 * @param ctx the spark context.
 */
public static void closeAllDistributedInstances( final JavaSparkContext ctx ) {
    Utils.nonNull(ctx, "the context provided cannot be null");
    int nJobs = ctx.defaultParallelism();
    final List<Integer> jobList = new ArrayList<>(nJobs);
    for ( int idx = 0; idx != nJobs; ++idx ) jobList.add(idx);
    ctx.parallelize(jobList, nJobs).foreach(idx -> closeInstances());
}
 
Example 16
/**
 * Kmerize reads having template names in a given set,
 * filter out low complexity kmers and kmers that appear too often in the genome to be helpful in localizing reads,
 * kill intervals that have too few surviving kmers.
 * The return is a Tuple2 in which
 * _1 describes the intervals that have been killed for having too few kmers (as a map from intervalId onto an explanatory string),
 * and _2 describes the good kmers that we want to use in local assemblies (as a multimap from kmer onto intervalId).
 */
private static Tuple2<List<AlignedAssemblyOrExcuse>, HopscotchUniqueMultiMap<SVKmer, Integer, KmerAndInterval>> getKmerAndIntervalsSet(
        final FindBreakpointEvidenceSparkArgumentCollection params,
        final ReadMetadata readMetadata,
        final JavaSparkContext ctx,
        final HopscotchUniqueMultiMap<String, Integer, QNameAndInterval> qNamesMultiMap,
        final int nIntervals,
        final JavaRDD<GATKRead> unfilteredReads,
        final SVReadFilter filter,
        final Logger logger)
{
    final Set<SVKmer> kmerKillSet =
            SVFileUtils.readKmersFile(params.kmersToIgnoreFile, params.kSize);
    if ( params.adapterSequence != null ) {
        SVKmerizer.stream(params.adapterSequence, params.kSize, 0, new SVKmerLong())
                .forEach(kmer -> kmerKillSet.add(kmer.canonical(params.kSize)));
    }
    log("Ignoring " + kmerKillSet.size() + " genomically common kmers.", logger);

    final Tuple2<List<AlignedAssemblyOrExcuse>, List<KmerAndInterval>> kmerIntervalsAndDispositions =
            getKmerIntervals(params, readMetadata, ctx, qNamesMultiMap, nIntervals, kmerKillSet,
                                unfilteredReads, filter, logger);
    final HopscotchUniqueMultiMap<SVKmer, Integer, KmerAndInterval> kmerMultiMap =
            new HopscotchUniqueMultiMap<>(kmerIntervalsAndDispositions._2());
    log("Discovered " + kmerMultiMap.size() + " kmers.", logger);

    return new Tuple2<>(kmerIntervalsAndDispositions._1(), kmerMultiMap);
}
 
Example 17
Source Project: zeppelin   Source File: PySparkInterpreter.java    License: Apache License 2.0
public JavaSparkContext getJavaSparkContext() {
  if (sparkInterpreter == null) {
    return null;
  } else {
    return new JavaSparkContext(sparkInterpreter.getSparkContext());
  }
}
 
Example 18
Source Project: systemds   Source File: SparkExecutionContext.java    License: Apache License 2.0
public static JavaPairRDD<MatrixIndexes,MatrixBlock> toMatrixJavaPairRDD(JavaSparkContext sc, MatrixBlock src,
		int blen, int numParts, boolean inclEmpty) {
	long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
	List<Tuple2<MatrixIndexes,MatrixBlock>> list = null;

	if( src.getNumRows() <= blen && src.getNumColumns() <= blen ) {
		list = Arrays.asList(new Tuple2<>(new MatrixIndexes(1,1), src));
	}
	else {
		MatrixCharacteristics mc = new MatrixCharacteristics(
			src.getNumRows(), src.getNumColumns(), blen, src.getNonZeros());
		list = LongStream.range(0, mc.getNumBlocks()).parallel()
			.mapToObj(i -> createIndexedMatrixBlock(src, mc, i))
			.filter(kv -> inclEmpty || !kv._2.isEmptyBlock(false))
			.collect(Collectors.toList());
	}

	JavaPairRDD<MatrixIndexes,MatrixBlock> result = (numParts > 1) ?
		sc.parallelizePairs(list, numParts) : sc.parallelizePairs(list);
	
	if (DMLScript.STATISTICS) {
		Statistics.accSparkParallelizeTime(System.nanoTime() - t0);
		Statistics.incSparkParallelizeCount(1);
	}

	return result;
}
 
Example 19
public BulkInsertPreppedDeltaCommitActionExecutor(JavaSparkContext jsc,
    HoodieWriteConfig config, HoodieTable table,
    String instantTime, JavaRDD<HoodieRecord<T>> preppedInputRecordRdd,
    Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
  super(jsc, config, table, instantTime, WriteOperationType.BULK_INSERT);
  this.preppedInputRecordRdd = preppedInputRecordRdd;
  this.bulkInsertPartitioner = bulkInsertPartitioner;
}
 
Example 20
Source Project: examples   Source File: CountLines.java    License: Apache License 2.0
@SuppressWarnings("serial")
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkGetExample ").setMaster("local[2]");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  JavaRDD<String> textFile = jsc.textFile("hdfs://localhost/user/cloudera/data.txt");
  JavaPairRDD<String, Integer> pairs = textFile.mapToPair(new PairFunction<String, String, Integer>() {
    public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s.substring(0, s.indexOf("|")), 1); }
  });
  JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
    public Integer call(Integer a, Integer b) { return a + b; }
  });
  System.out.println ("We have generaged " + counts.count() + " users");
  jsc.close();
}
 
Example 21
Source Project: deeplearning4j   Source File: SparkStorageUtils.java    License: Apache License 2.0
/**
 * Restore a {@code JavaPairRDD<Long,List<Writable>>} previously saved with {@link #saveMapFile(String, JavaRDD)}<br>
 * Note that if the keys are not required, simply use {@code restoreMapFile(...).values()}
 *
 * @param path Path of the MapFile
 * @param sc   Spark context
 * @return The restored RDD, with their unique indices as the key
 */
public static JavaPairRDD<Long, List<Writable>> restoreMapFile(String path, JavaSparkContext sc) {
    Configuration c = new Configuration();
    c.set(FileInputFormat.INPUT_DIR, FilenameUtils.normalize(path, true));
    JavaPairRDD<LongWritable, RecordWritable> pairRDD =
                    sc.newAPIHadoopRDD(c, SequenceFileInputFormat.class, LongWritable.class, RecordWritable.class);

    return pairRDD.mapToPair(new RecordLoadPairFunction());
}
 
Example 22
Source Project: SparkDemo   Source File: JavaTC.java    License: MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaTC")
    .getOrCreate();

  JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

  Integer slices = (args.length > 0) ? Integer.parseInt(args[0]): 2;
  JavaPairRDD<Integer, Integer> tc = jsc.parallelizePairs(generateGraph(), slices).cache();

  // Linear transitive closure: each round grows paths by one edge,
  // by joining the graph's edges with the already-discovered paths.
  // e.g. join the path (y, z) from the TC with the edge (x, y) from
  // the graph to obtain the path (x, z).

  // Because join() joins on keys, the edges are stored in reversed order.
  JavaPairRDD<Integer, Integer> edges = tc.mapToPair(
    new PairFunction<Tuple2<Integer, Integer>, Integer, Integer>() {
      @Override
      public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> e) {
        return new Tuple2<>(e._2(), e._1());
      }
  });

  long oldCount;
  long nextCount = tc.count();
  do {
    oldCount = nextCount;
    // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
    // then project the result to obtain the new (x, z) paths.
    tc = tc.union(tc.join(edges).mapToPair(ProjectFn.INSTANCE)).distinct().cache();
    nextCount = tc.count();
  } while (nextCount != oldCount);

  System.out.println("TC has " + tc.count() + " edges.");
  spark.stop();
}
 
Example 23
Source Project: beam   Source File: CacheTest.java    License: Apache License 2.0
/**
 * Test checks how the cache candidates map is populated by the runner when evaluating the
 * pipeline.
 */
@Test
public void cacheCandidatesUpdaterTest() {
  SparkPipelineOptions options = createOptions();
  Pipeline pipeline = Pipeline.create(options);
  PCollection<String> pCollection = pipeline.apply(Create.of("foo", "bar"));

  // First use of pCollection.
  pCollection.apply(Count.globally());
  // Second use of pCollection.
  PCollectionView<List<String>> view = pCollection.apply(View.asList());

  // Internally View.asList() creates a PCollection that underlies the PCollectionView, that
  // PCollection should not be cached as the SparkRunner does not access that PCollection to
  // access the PCollectionView.
  pipeline
      .apply(Create.of("foo", "baz"))
      .apply(
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext processContext) {
                      if (processContext.sideInput(view).contains(processContext.element())) {
                        processContext.output(processContext.element());
                      }
                    }
                  })
              .withSideInputs(view));

  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
  SparkRunner.CacheVisitor cacheVisitor =
      new SparkRunner.CacheVisitor(new TransformTranslator.Translator(), ctxt);
  pipeline.traverseTopologically(cacheVisitor);
  assertEquals(2L, (long) ctxt.getCacheCandidates().get(pCollection));
  assertEquals(1L, ctxt.getCacheCandidates().values().stream().filter(l -> l > 1).count());
}
 
Example 24
Source Project: incubator-pinot   Source File: SparkSegmentTarPushJob.java    License: Apache License 2.0
@Override
public void run()
    throws Exception {
  if (!_enableParallelPush) {
    super.run();
  } else {
    List<Path> segmentPathsToPush = getDataFilePaths(_segmentPattern);
    retainRecentFiles(segmentPathsToPush, _lookBackPeriod);
    List<String> segmentsToPush = new ArrayList<>();
    segmentPathsToPush.forEach(path -> {
      segmentsToPush.add(path.toString());
    });
    JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(SparkContext.getOrCreate());
    if (_pushJobParallelism == -1) {
      _pushJobParallelism = segmentsToPush.size();
    }
    JavaRDD<String> pathRDD = sparkContext.parallelize(segmentsToPush, _pushJobParallelism);
    pathRDD.foreach(segmentTarPath -> {
      try (ControllerRestApi controllerRestApi = getControllerRestApi()) {
        FileSystem fileSystem = FileSystem.get(new Path(segmentTarPath).toUri(), new Configuration());
        // TODO: Deal with invalid prefixes in the future
        List<String> currentSegments = controllerRestApi.getAllSegments("OFFLINE");
        controllerRestApi.pushSegments(fileSystem, Arrays.asList(new Path(segmentTarPath)));
        if (_deleteExtraSegments) {
          controllerRestApi
              .deleteSegmentUris(getSegmentsToDelete(currentSegments, Arrays.asList(new Path(segmentTarPath))));
        }
      }
    });
  }
}
 
Example 25
Source Project: ignite   Source File: JavaEmbeddedIgniteRDDSelfTest.java    License: Apache License 2.0
/**
 * @throws Exception If failed.
 */
@Test
public void testStoreDataToIgnite() throws Exception {
    JavaSparkContext sc = createContext();

    JavaIgniteContext<String, String> ic = null;

    try {
        ic = new JavaIgniteContext<>(sc, new IgniteConfigProvider(), false);

        ic.fromCache(PARTITIONED_CACHE_NAME)
            .savePairs(sc.parallelize(F.range(0, KEYS_CNT), GRID_CNT).mapToPair(TO_PAIR_F), true, false);

        Ignite ignite = ic.ignite();

        IgniteCache<String, String> cache = ignite.cache(PARTITIONED_CACHE_NAME);

        for (int i = 0; i < KEYS_CNT; i++) {
            String val = cache.get(String.valueOf(i));

            assertNotNull("Value was not put to cache for key: " + i, val);
            assertEquals("Invalid value stored for key: " + i, "val" + i, val);
        }
    }
    finally {
        if (ic != null)
            ic.close(true);

        sc.stop();
    }
}
 
Example 26
Source Project: SparkDemo   Source File: JavaSQLDataSourceExample.java    License: MIT License
private static void runJsonDatasetExample(SparkSession spark) {
  // $example on:json_dataset$
  // A JSON dataset is pointed to by path.
  // The path can be either a single text file or a directory storing text files
  Dataset<Row> people = spark.read().json(Constant.LOCAL_FILE_PREX +"/data/resources/people.json");

  // The inferred schema can be visualized using the printSchema() method
  people.printSchema();
  // root
  //  |-- age: long (nullable = true)
  //  |-- name: string (nullable = true)

  // Creates a temporary view using the DataFrame
  people.createOrReplaceTempView("people");

  // SQL statements can be run by using the sql methods provided by spark
  Dataset<Row> namesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19");
  namesDF.show();
  // +------+
  // |  name|
  // +------+
  // |Justin|
  // +------+

  // Alternatively, a DataFrame can be created for a JSON dataset represented by
  // an RDD[String] storing one JSON object per string.
  List<String> jsonData = Arrays.asList(
          "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
  JavaRDD<String> anotherPeopleRDD =
          new JavaSparkContext(spark.sparkContext()).parallelize(jsonData);
  Dataset<Row> anotherPeople = spark.read().json(anotherPeopleRDD);
  anotherPeople.show();
  // +---------------+----+
  // |        address|name|
  // +---------------+----+
  // |[Columbus,Ohio]| Yin|
  // +---------------+----+
  // $example off:json_dataset$
}
 
Example 27
Source Project: gatk   Source File: PSScorerTest.java    License: BSD 3-Clause "New" or "Revised" License
@Test(dataProvider = "mapPairs", groups = "spark")
public void testMapGroupedReadsToTax(final int readLength, final List<Integer> NM1, final List<Integer> NM2,
                                     final List<Integer> clip1, final List<Integer> clip2,
                                     final List<Integer> insert1, final List<Integer> insert2,
                                     final List<Integer> delete1, final List<Integer> delete2,
                                     final List<String> contig1, final List<String> contig2,
                                     final List<Integer> truthTax) {

    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final Broadcast<PSTaxonomyDatabase> taxonomyDatabaseBroadcast = ctx.broadcast(taxonomyDatabase);

    //Test with alternate alignments assigned to the XA tag
    final List<Iterable<GATKRead>> readListXA = new ArrayList<>();
    readListXA.add(generateReadPair(readLength, NM1, NM2, clip1, clip2, insert1, insert2, delete1, delete2, contig1, contig2, "XA"));
    final JavaRDD<Iterable<GATKRead>> pairsXA = ctx.parallelize(readListXA);
    final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultXA = PSScorer.mapGroupedReadsToTax(pairsXA,
            MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast);
    final PSPathogenAlignmentHit infoXA = resultXA.first()._2;

    Assert.assertNotNull(infoXA);
    Assert.assertEquals(infoXA.taxIDs.size(), truthTax.size());
    Assert.assertTrue(infoXA.taxIDs.containsAll(truthTax));
    Assert.assertEquals(infoXA.numMates, 2);

    //Test SA tag
    final List<Iterable<GATKRead>> readListSA = new ArrayList<>();
    readListSA.add(generateReadPair(readLength, NM1, NM2, clip1, clip2, insert1, insert2, delete1, delete2, contig1, contig2, "SA"));
    final JavaRDD<Iterable<GATKRead>> pairsSA = ctx.parallelize(readListSA);
    final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultSA = PSScorer.mapGroupedReadsToTax(pairsSA,
            MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast);
    final PSPathogenAlignmentHit infoSA = resultSA.first()._2;

    Assert.assertNotNull(infoSA);
    Assert.assertEquals(infoSA.taxIDs.size(), truthTax.size());
    Assert.assertTrue(infoSA.taxIDs.containsAll(truthTax));
    Assert.assertEquals(infoSA.numMates, 2);
}
 
Example 28
Source Project: SparkDemo   Source File: JDBCDataSource.java    License: MIT License
public static void main(String[] args) {
//		SparkConf conf = new SparkConf().setAppName("JDBCDataSource").setMaster("local");
		JavaSparkContext sc = SparkUtils.getRemoteSparkContext(JDBCDataSource.class);
		SQLContext sqlContext = new SQLContext(sc);

		Map<String, String> options = new HashMap<String, String>();
		options.put("url", "jdbc:mysql://192.168.2.129:3306/hive");
		options.put("dbtable", "t_user");
		options.put("user", "root");
		options.put("password", "666666");

		// Load the JDBC data source options; this does not connect to the database yet
		Dataset<Row> dataset1 = sqlContext.read().format("jdbc").options(options).load();

		//		options.put("dbtable", "tb_item");
		//		DataFrame dataFrame2 = sqlContext.read().format("jdbc").options(options).load();

		// Read the JDBC table data
		dataset1.javaRDD().foreach(new VoidFunction<Row>() {
			@Override
			public void call(Row row) throws Exception {
				System.out.println(row);
			}
		});


		// Save the RDD data to MySQL
		saveToMysql( sqlContext, options);

		sc.close();
	}
 
Example 29
Source Project: oryx   Source File: KMeansUpdate.java    License: Apache License 2.0
/**
 * @param sparkContext    active Spark Context
 * @param trainData       training data on which to build a model
 * @param hyperParameters ordered list of hyper parameter values to use in building model
 * @param candidatePath   directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  int numClusters = (Integer) hyperParameters.get(0);
  Preconditions.checkArgument(numClusters > 1);
  log.info("Building KMeans Model with {} clusters", numClusters);

  JavaRDD<Vector> trainingData = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
  KMeansModel kMeansModel = KMeans.train(trainingData.rdd(), numClusters, maxIterations, initializationStrategy);

  return kMeansModelToPMML(kMeansModel, fetchClusterCountsFromModel(trainingData, kMeansModel));
}
 
Example 30
Source Project: envelope   Source File: TestStreamingStep.java    License: Apache License 2.0
public JavaRDD<String> generateRDD() {
  Random values = new Random();
  values.setSeed(System.currentTimeMillis());
  List<String> list = Lists.newLinkedList();
  for (int i = 0; i < batchSize; i++) {
    list.add(String.valueOf(values.nextLong()));
  }
  SparkContext sc = Contexts.getSparkSession().sparkContext();
  JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
  return jsc.parallelize(list,this.partitions);
}