Java Code Examples for org.apache.spark.api.java.JavaSparkContext#broadcast()

The following examples show how to use org.apache.spark.api.java.JavaSparkContext#broadcast(). Each example is taken from an open-source project; the source file, project, and license are noted above the code.
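Before the project-specific examples, a minimal, self-contained sketch of the usual broadcast pattern may help: build a read-only value on the driver, broadcast it once, read it inside tasks via value(), and release it with unpersist() when it is no longer needed. The lookup map and RDD contents below are illustrative placeholders, not taken from any of the projects that follow.

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;

public class BroadcastSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("broadcast-sketch").setMaster("local[*]"));

        // Driver-side lookup table shared read-only with every executor.
        Map<String, Integer> countryCodes = new HashMap<>();
        countryCodes.put("DE", 49);
        countryCodes.put("US", 1);
        Broadcast<Map<String, Integer>> codesBroadcast = sc.broadcast(countryCodes);

        // Tasks read the broadcast value instead of capturing the map in each task closure.
        JavaRDD<String> countries = sc.parallelize(Arrays.asList("DE", "US", "DE"));
        JavaRDD<Integer> codes = countries.map(c -> codesBroadcast.value().get(c));
        System.out.println(codes.collect());

        // Release executor copies once the value is no longer needed.
        codesBroadcast.unpersist();
        sc.stop();
    }
}

Broadcasting the map once avoids re-serializing it with every task closure, which is the main reason the examples below prefer sc.broadcast() over plainly captured fields.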
Example 1
Source File: TextPipelineTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testWordFreqAccNotIdentifyingStopWords() throws Exception {

    JavaSparkContext sc = getContext();
    //  word2vec.setRemoveStop(false);
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vecNoStop.getTokenizerVarMap());

    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    JavaRDD<List<String>> tokenizedRDD = pipeline.tokenize();
    pipeline.updateAndReturnAccumulatorVal(tokenizedRDD);

    Counter<String> wordFreqCounter = pipeline.getWordFreqAcc().value();
    assertEquals(wordFreqCounter.getCount("is"), 1, 0);
    assertEquals(wordFreqCounter.getCount("this"), 1, 0);
    assertEquals(wordFreqCounter.getCount("are"), 1, 0);
    assertEquals(wordFreqCounter.getCount("a"), 1, 0);
    assertEquals(wordFreqCounter.getCount("strange"), 2, 0);
    assertEquals(wordFreqCounter.getCount("flowers"), 1, 0);
    assertEquals(wordFreqCounter.getCount("world"), 1, 0);
    assertEquals(wordFreqCounter.getCount("red"), 1, 0);

    sc.stop();
}
 
Example 2
Source File: GraknSparkMemory.java    From grakn with GNU Affero General Public License v3.0
public GraknSparkMemory(final VertexProgram<?> vertexProgram,
                        final Set<MapReduce> mapReducers,
                        final JavaSparkContext sparkContext) {
    if (null != vertexProgram) {
        for (final MemoryComputeKey key : vertexProgram.getMemoryComputeKeys()) {
            this.memoryComputeKeys.put(key.getKey(), key);
        }
    }
    for (final MapReduce mapReduce : mapReducers) {
        this.memoryComputeKeys.put(
                mapReduce.getMemoryKey(),
                MemoryComputeKey.of(mapReduce.getMemoryKey(), Operator.assign, false, false));
    }
    for (final MemoryComputeKey memoryComputeKey : this.memoryComputeKeys.values()) {
        this.sparkMemory.put(
                memoryComputeKey.getKey(),
                sparkContext.accumulator(ObjectWritable.empty(), memoryComputeKey.getKey(),
                        new MemoryAccumulator<>(memoryComputeKey)));
    }
    this.broadcast = sparkContext.broadcast(Collections.emptyMap());
}
 
Example 3
Source File: BQSRPipelineSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
protected void runTool(final JavaSparkContext ctx) {
    String referenceFileName = addReferenceFilesForSpark(ctx, referenceArguments.getReferencePath());
    List<String> localKnownSitesFilePaths = addVCFsForSpark(ctx, knownVariants);

    //Should this get the getUnfilteredReads? getReads will merge default and command line filters.
    //but the code below uses other filters for other parts of the pipeline that do not honor
    //the commandline.
    final JavaRDD<GATKRead> initialReads = getReads();

    // The initial reads have already had the WellformedReadFilter applied to them, which
    // is all the filtering that ApplyBQSR wants. BQSR itself wants additional filtering
    // performed, so we do that here.
    //NOTE: this filter doesn't honor enabled/disabled commandline filters
    final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), getHeaderForReads());
    final JavaRDD<GATKRead> filteredReadsForBQSR = initialReads.filter(read -> bqsrReadFilter.test(read));

    JavaPairRDD<GATKRead, Iterable<GATKVariant>> readsWithVariants = JoinReadsWithVariants.join(filteredReadsForBQSR, localKnownSitesFilePaths);
    //note: we use the reference dictionary from the reads themselves.
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(readsWithVariants, getHeaderForReads(), referenceFileName, bqsrArgs);

    final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(bqsrReport);
    final JavaRDD<GATKRead> finalReads = ApplyBQSRSparkFn.apply(initialReads, reportBroadcast, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs));

    writeReads(ctx, output, finalReads);
}
 
Example 4
Source File: SvDiscoveryInputMetaData.java    From gatk with BSD 3-Clause "New" or "Revised" License
public SvDiscoveryInputMetaData(final JavaSparkContext ctx,
                                final DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs,
                                final String nonCanonicalChromosomeNamesFile,
                                final String outputPath,
                                final ReadMetadata readMetadata,
                                final List<SVInterval> assembledIntervals,
                                final PairedStrandedIntervalTree<EvidenceTargetLink> evidenceTargetLinks,
                                final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast,
                                final SAMFileHeader headerForReads,
                                final ReferenceMultiSparkSource reference,
                                final Set<VCFHeaderLine> defaultToolVCFHeaderLines,
                                final Logger toolLogger) {

    final SAMSequenceDictionary sequenceDictionary = headerForReads.getSequenceDictionary();
    final Broadcast<Set<String>> canonicalChromosomesBroadcast =
            ctx.broadcast(SVUtils.getCanonicalChromosomes(nonCanonicalChromosomeNamesFile, sequenceDictionary));
    final String sampleId = SVUtils.getSampleId(headerForReads);

    this.referenceData = new ReferenceData(canonicalChromosomesBroadcast, ctx.broadcast(reference), ctx.broadcast(sequenceDictionary));
    this.sampleSpecificData = new SampleSpecificData(sampleId, cnvCallsBroadcast, assembledIntervals, evidenceTargetLinks, readMetadata, ctx.broadcast(headerForReads));
    this.discoverStageArgs = discoverStageArgs;
    this.outputPath = outputPath;
    this.defaultToolVCFHeaderLines = defaultToolVCFHeaderLines;
    this.toolLogger = toolLogger;
}
 
Example 5
Source File: ApplyBQSRSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
protected void runTool(JavaSparkContext ctx) {
    JavaRDD<GATKRead> initialReads = getReads();
    Broadcast<RecalibrationReport> recalibrationReportBroadCast = ctx.broadcast(new RecalibrationReport(BucketUtils.openFile(bqsrRecalFile)));
    final JavaRDD<GATKRead> recalibratedReads = ApplyBQSRSparkFn.apply(initialReads, recalibrationReportBroadCast, getHeaderForReads(), applyBQSRArgs);
    writeReads(ctx, output, recalibratedReads);
}
 
Example 6
Source File: SparkMemory.java    From tinkerpop with Apache License 2.0
protected void broadcastMemory(final JavaSparkContext sparkContext) {
    this.broadcast.destroy(true); // do we need to block?
    final Map<String, Object> toBroadcast = new HashMap<>();
    this.sparkMemory.forEach((key, object) -> {
        if (!object.value().isEmpty() && this.memoryComputeKeys.get(key).isBroadcast())
            toBroadcast.put(key, object.value());
    });
    this.broadcast = sparkContext.broadcast(toBroadcast);
}
 
Example 7
Source File: PSScorerTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Test(dataProvider = "mapPairs", groups = "spark")
public void testMapGroupedReadsToTax(final int readLength, final List<Integer> NM1, final List<Integer> NM2,
                                     final List<Integer> clip1, final List<Integer> clip2,
                                     final List<Integer> insert1, final List<Integer> insert2,
                                     final List<Integer> delete1, final List<Integer> delete2,
                                     final List<String> contig1, final List<String> contig2,
                                     final List<Integer> truthTax) {

    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final Broadcast<PSTaxonomyDatabase> taxonomyDatabaseBroadcast = ctx.broadcast(taxonomyDatabase);

    //Test with alternate alignments assigned to the XA tag
    final List<Iterable<GATKRead>> readListXA = new ArrayList<>();
    readListXA.add(generateReadPair(readLength, NM1, NM2, clip1, clip2, insert1, insert2, delete1, delete2, contig1, contig2, "XA"));
    final JavaRDD<Iterable<GATKRead>> pairsXA = ctx.parallelize(readListXA);
    final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultXA = PSScorer.mapGroupedReadsToTax(pairsXA,
            MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast);
    final PSPathogenAlignmentHit infoXA = resultXA.first()._2;

    Assert.assertNotNull(infoXA);
    Assert.assertEquals(infoXA.taxIDs.size(), truthTax.size());
    Assert.assertTrue(infoXA.taxIDs.containsAll(truthTax));
    Assert.assertEquals(infoXA.numMates, 2);

    //Test SA tag
    final List<Iterable<GATKRead>> readListSA = new ArrayList<>();
    readListSA.add(generateReadPair(readLength, NM1, NM2, clip1, clip2, insert1, insert2, delete1, delete2, contig1, contig2, "SA"));
    final JavaRDD<Iterable<GATKRead>> pairsSA = ctx.parallelize(readListSA);
    final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultSA = PSScorer.mapGroupedReadsToTax(pairsSA,
            MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast);
    final PSPathogenAlignmentHit infoSA = resultSA.first()._2;

    Assert.assertNotNull(infoSA);
    Assert.assertEquals(infoSA.taxIDs.size(), truthTax.size());
    Assert.assertTrue(infoSA.taxIDs.containsAll(truthTax));
    Assert.assertEquals(infoSA.numMates, 2);
}
 
Example 8
Source File: TextPipelineTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testFilterMinWordAddVocab() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());

    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    JavaRDD<List<String>> tokenizedRDD = pipeline.tokenize();
    pipeline.updateAndReturnAccumulatorVal(tokenizedRDD);
    Counter<String> wordFreqCounter = pipeline.getWordFreqAcc().value();

    pipeline.filterMinWordAddVocab(wordFreqCounter);
    VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();

    assertTrue(vocabCache != null);

    VocabWord redVocab = vocabCache.tokenFor("red");
    VocabWord flowerVocab = vocabCache.tokenFor("flowers");
    VocabWord worldVocab = vocabCache.tokenFor("world");
    VocabWord strangeVocab = vocabCache.tokenFor("strange");


    assertEquals(redVocab.getWord(), "red");
    assertEquals(redVocab.getElementFrequency(), 1, 0);

    assertEquals(flowerVocab.getWord(), "flowers");
    assertEquals(flowerVocab.getElementFrequency(), 1, 0);

    assertEquals(worldVocab.getWord(), "world");
    assertEquals(worldVocab.getElementFrequency(), 1, 0);

    assertEquals(strangeVocab.getWord(), "strange");
    assertEquals(strangeVocab.getElementFrequency(), 2, 0);

    sc.stop();
}
 
Example 9
Source File: BoxClient.java    From render with GNU General Public License v2.0
public void run(final SparkConf sparkConf)
        throws IOException {

    final JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);

    LogUtilities.logSparkClusterInfo(sparkContext);

    setupForRun();

    boolean foundBoxesRenderedForPriorRun = false;
    if (parameters.cleanUpPriorRun) {
        foundBoxesRenderedForPriorRun = cleanUpPriorRun(sparkContext);
    }

    final JavaRDD<BoxData> distributedBoxDataRdd = partitionBoxes(sparkContext,
                                                                  foundBoxesRenderedForPriorRun);

    final Broadcast<BoxGenerator> broadcastBoxGenerator = sparkContext.broadcast(boxGenerator);

    if (parameters.validateLabelsOnly) {
        validateLabelBoxes(sparkContext, distributedBoxDataRdd);
    } else {
        for (int level = 0; level <= parameters.box.maxLevel; level++) {
            renderBoxesForLevel(level, distributedBoxDataRdd, broadcastBoxGenerator);
        }
    }

    if (parameters.box.isOverviewNeeded() && (! parameters.explainPlan) && (! parameters.validateLabelsOnly)) {
        renderOverviewImages(sparkContext,
                             broadcastBoxGenerator);
    }

    LogUtilities.logSparkClusterInfo(sparkContext); // log cluster info again here to add run stats to driver log

    sparkContext.stop();
}
 
Example 10
Source File: ReadsSparkSink.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static void writeReads(
        final JavaSparkContext ctx, final String outputFile, final GATKPath referencePathSpecifier, final JavaRDD<SAMRecord> reads,
        final SAMFileHeader header, final long sbiIndexGranularity, final WriteOption... writeOptions) throws IOException {

    Broadcast<SAMFileHeader> headerBroadcast = ctx.broadcast(header);
    final JavaRDD<SAMRecord> sortedReadsWithHeader = reads.map(read -> {
        read.setHeaderStrict(headerBroadcast.getValue());
        return read;
    });
    HtsjdkReadsRdd htsjdkReadsRdd = new HtsjdkReadsRdd(header, sortedReadsWithHeader);
    HtsjdkReadsRddStorage.makeDefault(ctx)
            .referenceSourcePath(referencePathSpecifier == null ? null : referencePathSpecifier.getRawInputString())
            .sbiIndexGranularity(sbiIndexGranularity)
            .write(htsjdkReadsRdd, outputFile, writeOptions);
}
 
Example 11
Source File: FindBreakpointEvidenceSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Transform all the reads for a supplied set of template names in each interval into FASTQ records
 * for each interval, and do something with the list of FASTQ records for each interval (like write it to a file).
 */
@VisibleForTesting static List<AlignedAssemblyOrExcuse> handleAssemblies(
        final JavaSparkContext ctx,
        final HopscotchUniqueMultiMap<String, Integer, QNameAndInterval> qNamesMultiMap,
        final JavaRDD<GATKRead> unfilteredReads,
        final SVReadFilter filter,
        final int nIntervals,
        final boolean includeMappingLocation,
        final LocalAssemblyHandler localAssemblyHandler ) {

    final int[] counts = new int[nIntervals];
    for ( final QNameAndInterval qNameAndInterval : qNamesMultiMap ) {
        counts[qNameAndInterval.getIntervalId()] += 1;
    }
    final ComplexityPartitioner partitioner = new ComplexityPartitioner(counts);

    final Broadcast<HopscotchUniqueMultiMap<String, Integer, QNameAndInterval>> broadcastQNamesMultiMap =
            ctx.broadcast(qNamesMultiMap);
    final List<AlignedAssemblyOrExcuse> intervalDispositions =
        unfilteredReads
            .mapPartitionsToPair(readItr ->
                    new ReadsForQNamesFinder(broadcastQNamesMultiMap.value(), nIntervals,
                            includeMappingLocation, readItr, filter).iterator(), false)
            .combineByKey(x -> x,
                            SVUtils::concatenateLists,
                            SVUtils::concatenateLists,
                            partitioner, false, null)
            .map(localAssemblyHandler::apply)
            .collect();

    SparkUtils.destroyBroadcast(broadcastQNamesMultiMap, "QNames multi map");
    BwaMemIndexCache.closeAllDistributedInstances(ctx);

    return intervalDispositions;
}
 
Example 12
Source File: PSScorerTest.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Test(dataProvider = "mapUnpaired", groups = "spark")
public void testMapGroupedReadsToTaxUnpaired(final int readLength, final List<Integer> NM, final List<Integer> clip,
                                             final List<Integer> insert, final List<Integer> delete,
                                             final List<String> contig, final List<Integer> truthTax) {

    if (!(NM.size() == clip.size() && NM.size() == insert.size() && NM.size() == delete.size() && NM.size() == contig.size())) {
        throw new TestException("Input lists for read must be of uniform length");
    }

    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final Broadcast<PSTaxonomyDatabase> taxonomyDatabaseBroadcast = ctx.broadcast(taxonomyDatabase);

    //Test with alternate alignments assigned to the XA tag
    final List<Iterable<GATKRead>> readListXA = new ArrayList<>();
    readListXA.add(generateUnpairedRead(readLength, NM, clip, insert, delete, contig, "XA"));
    final JavaRDD<Iterable<GATKRead>> pairsXA = ctx.parallelize(readListXA);
    final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultXA = PSScorer.mapGroupedReadsToTax(pairsXA,
            MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast);
    final PSPathogenAlignmentHit infoXA = resultXA.first()._2;

    Assert.assertNotNull(infoXA);
    Assert.assertEquals(infoXA.taxIDs.size(), truthTax.size());
    Assert.assertTrue(infoXA.taxIDs.containsAll(truthTax));
    Assert.assertEquals(infoXA.numMates, 1);

    //Test SA tag
    final List<Iterable<GATKRead>> readListSA = new ArrayList<>();
    readListSA.add(generateUnpairedRead(readLength, NM, clip, insert, delete, contig, "SA"));
    final JavaRDD<Iterable<GATKRead>> pairsSA = ctx.parallelize(readListSA);
    final JavaRDD<Tuple2<Iterable<GATKRead>, PSPathogenAlignmentHit>> resultSA = PSScorer.mapGroupedReadsToTax(pairsSA,
            MIN_IDENT, IDENT_MARGIN, taxonomyDatabaseBroadcast);
    final PSPathogenAlignmentHit infoSA = resultSA.first()._2;

    Assert.assertNotNull(infoSA);
    Assert.assertEquals(infoSA.taxIDs.size(), truthTax.size());
    Assert.assertTrue(infoSA.taxIDs.containsAll(truthTax));
    Assert.assertEquals(infoSA.numMates, 1);
}
 
Example 13
Source File: SparkUtils.java    From deeplearning4j with Apache License 2.0
public static Broadcast<byte[]> asByteArrayBroadcast(JavaSparkContext sc, INDArray array){
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    try {
        Nd4j.write(array, new DataOutputStream(baos));
    } catch (IOException e){
        throw new RuntimeException(e);  //Should never happen
    }
    byte[] paramBytes = baos.toByteArray();       //See docs in EvaluationRunner for why we use byte[] instead of INDArray (thread locality etc)
    return sc.broadcast(paramBytes);
}
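On the executor side, the broadcast bytes would typically be turned back into an INDArray with the matching Nd4j.read call. A minimal sketch of that counterpart follows; the method name fromByteArrayBroadcast is hypothetical, not part of deeplearning4j:

// Hypothetical counterpart: rebuild the INDArray from the broadcast bytes inside a task.
public static INDArray fromByteArrayBroadcast(Broadcast<byte[]> broadcastBytes) {
    try (DataInputStream dis = new DataInputStream(new ByteArrayInputStream(broadcastBytes.value()))) {
        return Nd4j.read(dis);
    } catch (IOException e) {
        throw new RuntimeException(e);  // Mirrors the write-side handling above
    }
}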
 
Example 14
Source File: BroadcastHadoopConfigHolder.java    From deeplearning4j with Apache License 2.0
public static Broadcast<SerializableHadoopConfig> get(JavaSparkContext sc){
    if(config != null && (!config.isValid() || sc.startTime() != sparkContextStartTime) ){
        config = null;
    }
    if(config != null){
        return config;
    }
    synchronized (BroadcastHadoopConfigHolder.class){
        if(config == null){
            config = sc.broadcast(new SerializableHadoopConfig(sc.hadoopConfiguration()));
            sparkContextStartTime = sc.startTime();
        }
    }
    return config;
}
 
Example 15
Source File: PathSeqPipelineSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
protected void runTool(final JavaSparkContext ctx) {

    filterArgs.doReadFilterArgumentWarnings(getCommandLineParser().getPluginDescriptor(GATKReadFilterPluginDescriptor.class), logger);
    SAMFileHeader header = PSUtils.checkAndClearHeaderSequences(getHeaderForReads(), filterArgs, logger);

    //Do not allow use of numReducers
    if (numReducers > 0) {
        throw new UserException.BadInput("Use --readsPerPartitionOutput instead of --num-reducers.");
    }

    //Filter
    final Tuple2<JavaRDD<GATKRead>, JavaRDD<GATKRead>> filterResult;
    final PSFilter filter = new PSFilter(ctx, filterArgs, header);
    try (final PSFilterLogger filterLogger = filterArgs.filterMetricsFileUri != null ? new PSFilterFileLogger(getMetricsFile(), filterArgs.filterMetricsFileUri) : new PSFilterEmptyLogger()) {
        final JavaRDD<GATKRead> inputReads = getReads();
        filterResult = filter.doFilter(inputReads, filterLogger);
    }
    JavaRDD<GATKRead> pairedReads = filterResult._1;
    JavaRDD<GATKRead> unpairedReads = filterResult._2;

    //Counting forces an action on the RDDs to guarantee we're done with the Bwa image and kmer filter
    final long numPairedReads = pairedReads.count();
    final long numUnpairedReads = unpairedReads.count();
    final long numTotalReads = numPairedReads + numUnpairedReads;

    //Closes Bwa image, kmer filter, and metrics file if used
    //Note the host Bwa image must be unloaded before trying to load the pathogen image
    filter.close();

    //Rebalance partitions using the counts
    final int numPairedPartitions = 1 + (int) (numPairedReads / readsPerPartition);
    final int numUnpairedPartitions = 1 + (int) (numUnpairedReads / readsPerPartition);
    pairedReads = repartitionPairedReads(pairedReads, numPairedPartitions, numPairedReads);
    unpairedReads = unpairedReads.repartition(numUnpairedPartitions);

    //Bwa pathogen alignment
    final PSBwaAlignerSpark aligner = new PSBwaAlignerSpark(ctx, bwaArgs);
    PSBwaUtils.addReferenceSequencesToHeader(header, bwaArgs.microbeDictionary);
    final Broadcast<SAMFileHeader> headerBroadcast = ctx.broadcast(header);
    JavaRDD<GATKRead> alignedPairedReads = aligner.doBwaAlignment(pairedReads, true, headerBroadcast);
    JavaRDD<GATKRead> alignedUnpairedReads = aligner.doBwaAlignment(unpairedReads, false, headerBroadcast);

    //Cache this expensive result. Note serialization significantly reduces memory consumption.
    alignedPairedReads.persist(StorageLevel.MEMORY_AND_DISK_SER());
    alignedUnpairedReads.persist(StorageLevel.MEMORY_AND_DISK_SER());

    //Score pathogens
    final PSScorer scorer = new PSScorer(scoreArgs);
    final JavaRDD<GATKRead> readsFinal = scorer.scoreReads(ctx, alignedPairedReads, alignedUnpairedReads, header);

    //Clean up header
    header = PSBwaUtils.removeUnmappedHeaderSequences(header, readsFinal, logger);

    //Log read counts
    if (scoreArgs.scoreMetricsFileUri != null) {
        try (final PSScoreLogger scoreLogger = new PSScoreFileLogger(getMetricsFile(), scoreArgs.scoreMetricsFileUri)) {
            scoreLogger.logReadCounts(readsFinal);
        }
    }

    //Write reads to BAM, if specified
    if (outputPath != null) {
        try {
            //Reduce number of partitions since we previously went to ~5K reads per partition, which
            // is far too small for sharded output.
            final int numPartitions = Math.max(1, (int) (numTotalReads / readsPerPartitionOutput));
            final JavaRDD<GATKRead> readsFinalRepartitioned = readsFinal.coalesce(numPartitions, false);
            ReadsSparkSink.writeReads(ctx, outputPath, null, readsFinalRepartitioned, header,
                    shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE, numPartitions, shardedPartsDir, true, splittingIndexGranularity);
        } catch (final IOException e) {
            throw new UserException.CouldNotCreateOutputFile(outputPath, "writing failed", e);
        }
    }
    aligner.close();
}
 
Example 16
Source File: ConceptMaps.java    From bunsen with Apache License 2.0
@Override
public Broadcast<BroadcastableMappings> broadcast(Map<String,String> conceptMapUriToVersion) {

  List<ConceptMap> mapsList = getMaps().collectAsList();

  Map<String,ConceptMap> mapsToLoad = mapsList
      .stream()
      .filter(conceptMap ->
          conceptMap.getVersion().equals(conceptMapUriToVersion.get(conceptMap.getUrl())))
      .collect(Collectors.toMap(ConceptMap::getUrl, Function.identity()));

  // Expand the concept maps to load and sort them so dependencies are before
  // their dependents in the list.
  List<String> sortedMapsToLoad = sortMapsToLoad(conceptMapUriToVersion.keySet(), mapsToLoad);

  // Since this is used to map from one system to another, we use only targets
  // that don't introduce inaccurate meanings. (For instance, we can't map
  // general condition code to a more specific type, since that is not
  // representative of the source data.)
  Dataset<Mapping> mappings = getMappings(conceptMapUriToVersion)
      .filter("equivalence in ('equivalent', 'equals', 'wider', 'subsumes')");

  // Group mappings by their concept map URI
  Map<String, List<Mapping>> groupedMappings =  mappings
      .collectAsList()
      .stream()
      .collect(Collectors.groupingBy(Mapping::getConceptMapUri));

  Map<String, BroadcastableConceptMap> broadcastableMaps = new HashMap<>();

  for (String conceptMapUri: sortedMapsToLoad) {

    ConceptMap map = mapsToLoad.get(conceptMapUri);

    Set<String> children = getMapChildren(map);

    List<BroadcastableConceptMap> childMaps = children.stream()
        .map(child -> broadcastableMaps.get(child))
        .collect(Collectors.toList());

    BroadcastableConceptMap broadcastableConceptMap = new BroadcastableConceptMap(conceptMapUri,
        groupedMappings.getOrDefault(conceptMapUri, Collections.emptyList()),
        childMaps);

    broadcastableMaps.put(conceptMapUri, broadcastableConceptMap);
  }

  JavaSparkContext ctx = new JavaSparkContext(getMaps()
      .sparkSession()
      .sparkContext());

  return ctx.broadcast(new BroadcastableMappings(broadcastableMaps));
}
 
Example 17
Source File: TextPipelineTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testFirstIteration() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    // word2vec.setRemoveStop(false);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());

    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();
    VocabCache<VocabWord> vocabCache = pipeline.getVocabCache();
    /*        Huffman huffman = new Huffman(vocabCache.vocabWords());
    huffman.build();
    huffman.applyIndexes(vocabCache);
    */
    VocabWord token = vocabCache.tokenFor("strange");
    VocabWord word = vocabCache.wordFor("strange");
    log.info("Strange token: " + token);
    log.info("Strange word: " + word);

    // Get total word count and put into word2vec variable map
    Map<String, Object> word2vecVarMap = word2vec.getWord2vecVarMap();
    word2vecVarMap.put("totalWordCount", pipeline.getTotalWordCount());
    double[] expTable = word2vec.getExpTable();

    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
    JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();

    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();

    JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD =
                    vocabWordListRDD.zip(sentenceCountCumSumRDD);

    Broadcast<Map<String, Object>> word2vecVarMapBroadcast = sc.broadcast(word2vecVarMap);
    Broadcast<double[]> expTableBroadcast = sc.broadcast(expTable);

    Iterator<Tuple2<List<VocabWord>, Long>> iterator = vocabWordListSentenceCumSumRDD.collect().iterator();

    FirstIterationFunction firstIterationFunction = new FirstIterationFunction(
                    word2vecVarMapBroadcast, expTableBroadcast, pipeline.getBroadCastVocabCache());

    Iterator<Map.Entry<VocabWord, INDArray>> ret = firstIterationFunction.call(iterator);
    assertTrue(ret.hasNext());
}
 
Example 18
Source File: ValueSetUdfs.java    From bunsen with Apache License 2.0
/**
 * Pushes an "in_valueset" UDF that uses the given {@link BroadcastableValueSets} for its content.
 *
 * @param spark the spark session
 * @param valueSets the valuesets to use in the UDF
 */
public static synchronized void pushUdf(SparkSession spark, BroadcastableValueSets valueSets) {

  JavaSparkContext ctx = new JavaSparkContext(spark.sparkContext());

  Broadcast<BroadcastableValueSets> broadcast = ctx.broadcast(valueSets);

  pushUdf(spark, broadcast);
}
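Once pushUdf has run, the broadcast value sets are reachable from Spark SQL through the registered in_valueset UDF. A hedged usage sketch follows; the table name, code column, and value set reference are hypothetical, and the exact argument shape should be checked against the Bunsen documentation:

// Hypothetical query: 'observations', its 'code' column, and the 'hypertension'
// reference name are placeholders, not taken from the example above.
spark.sql("SELECT * FROM observations WHERE in_valueset(code, 'hypertension')").show();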
 
Example 19
Source File: ExtractOriginalAlignmentRecordsByNameSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
protected void runTool( final JavaSparkContext ctx ) {

    final Broadcast<Set<String>> namesToLookForBroadcast = ctx.broadcast(parseReadNames());

    final Function<GATKRead, Boolean> predicate = getGatkReadBooleanFunction(namesToLookForBroadcast, invertFilter);

    final JavaRDD<GATKRead> reads = getUnfilteredReads().filter(predicate).cache();
    writeReads(ctx, outputSAM, reads, getHeaderForReads(), false);

    logger.info("Found " + reads.count() + " alignment records for " +
                namesToLookForBroadcast.getValue().size() + " unique read names.");
}
 
Example 20
Source File: AbstractValueSets.java    From bunsen with Apache License 2.0
/**
 * Returns a dataset with the values for each element in the map of uri to version.
 *
 * @param uriToVersion a map of value set URI to the version to load
 * @return a dataset of values for the given URIs and versions.
 */
public Dataset<Value> getValues(Map<String,String> uriToVersion) {

  JavaSparkContext context = new JavaSparkContext(this.spark.sparkContext());

  Broadcast<Map<String,String>> broadcastUrisToVersion = context.broadcast(uriToVersion);

  return this.values.filter((FilterFunction<Value>) value -> {

    String latestVersion = broadcastUrisToVersion.getValue().get(value.getValueSetUri());

    return latestVersion != null && latestVersion.equals(value.getValueSetVersion());
  });
}