org.apache.spark.broadcast.Broadcast Java Examples

The following examples show how to use org.apache.spark.broadcast.Broadcast. They are drawn from open-source projects; the source file, originating project, and license are noted above each example.
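Before the project examples, here is a minimal, self-contained sketch of the basic broadcast lifecycle: create the variable on the driver with JavaSparkContext.broadcast, read it on the executors with value(), and release it with unpersist()/destroy() once it is no longer needed. The class and variable names below are illustrative and not taken from any of the projects that follow.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;

public class BroadcastLifecycleSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("BroadcastLifecycleSketch").setMaster("local[2]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Driver side: ship a small, read-only lookup list to every executor once,
            // instead of re-serializing it into each task closure.
            Broadcast<List<String>> allowed = sc.broadcast(Arrays.asList("alpha", "beta", "gamma"));

            JavaRDD<String> words = sc.parallelize(Arrays.asList("alpha", "delta", "beta"));

            // Executor side: read the broadcast content via value().
            long matches = words.filter(w -> allowed.value().contains(w)).count();
            System.out.println("matches = " + matches);

            // Drop the cached copies on the executors; destroy() also removes the
            // driver-side copy and makes the broadcast unusable afterwards.
            allowed.unpersist();
            allowed.destroy();
        }
    }
}

Most of the examples below follow this same pattern and differ mainly in what they broadcast (metadata, configuration, lookup structures) and in how the Broadcast handle is passed to the functions that run on the executors.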
Example #1
Source File: FindBreakpointEvidenceSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
static SVIntervalTree<SVInterval> findGenomewideHighCoverageIntervalsToIgnore(final FindBreakpointEvidenceSparkArgumentCollection params,
                                                                              final ReadMetadata readMetadata,
                                                                              final JavaSparkContext ctx,
                                                                              final SAMFileHeader header,
                                                                              final JavaRDD<GATKRead> unfilteredReads,
                                                                              final SVReadFilter filter,
                                                                              final Logger logger,
                                                                              final Broadcast<ReadMetadata> broadcastMetadata) {
    final int capacity = header.getSequenceDictionary().getSequences().stream()
            .mapToInt(seqRec -> (seqRec.getSequenceLength() + DEPTH_WINDOW_SIZE - 1)/DEPTH_WINDOW_SIZE).sum();
    final List<SVInterval> depthIntervals = new ArrayList<>(capacity);
    for (final SAMSequenceRecord sequenceRecord : header.getSequenceDictionary().getSequences()) {
        final int contigID = readMetadata.getContigID(sequenceRecord.getSequenceName());
        final int contigLength = sequenceRecord.getSequenceLength();
        for (int i = 1; i < contigLength; i = i + DEPTH_WINDOW_SIZE) {
            depthIntervals.add(new SVInterval(contigID, i, Math.min(contigLength, i + DEPTH_WINDOW_SIZE)));
        }
    }

    final List<SVInterval> highCoverageSubintervals = findHighCoverageSubintervalsAndLog(
            params, ctx, broadcastMetadata, depthIntervals, unfilteredReads, filter, logger);
    final SVIntervalTree<SVInterval> highCoverageSubintervalTree = new SVIntervalTree<>();
    highCoverageSubintervals.forEach(i -> highCoverageSubintervalTree.put(i, i));

    return highCoverageSubintervalTree;
}
 
Example #2
Source File: ExecuteWorkerPathFlatMap.java    From deeplearning4j with Apache License 2.0
public ExecuteWorkerPathFlatMap(TrainingWorker<R> worker, DataSetLoader dataSetLoader, Broadcast<SerializableHadoopConfig> hadoopConfig) {
    this.workerFlatMap = new ExecuteWorkerFlatMap<>(worker);
    this.dataSetLoader = dataSetLoader;
    this.hadoopConfig = hadoopConfig;

    //How many dataset objects of size 'dataSetObjectNumExamples' should we load?
    //Only pass on the required number, not all of them (to avoid async preloading data that won't be used)
    //Most of the time we'll get exactly the number we want, but this isn't guaranteed all the time for all
    // splitting strategies
    WorkerConfiguration conf = worker.getDataConfiguration();
    int dataSetObjectNumExamples = conf.getDataSetObjectSizeExamples();
    int workerMinibatchSize = conf.getBatchSizePerWorker();
    int maxMinibatches = (conf.getMaxBatchesPerWorker() > 0 ? conf.getMaxBatchesPerWorker() : Integer.MAX_VALUE);

    if (maxMinibatches == Integer.MAX_VALUE) {
        maxDataSetObjects = Integer.MAX_VALUE;
    } else {
        //Required: total number of examples / examples per dataset object
        maxDataSetObjects =
                        (int) Math.ceil(maxMinibatches * workerMinibatchSize / ((double) dataSetObjectNumExamples));
    }
}
 
Example #3
Source File: TextPipelineTest.java    From deeplearning4j with Apache License 2.0
@Test @Ignore   //AB 2020/04/20 https://github.com/eclipse/deeplearning4j/issues/8849
public void testCountCumSum() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());

    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();
    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();

    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();
    List<Long> sentenceCountCumSumList = sentenceCountCumSumRDD.collect();
    assertTrue(sentenceCountCumSumList.get(0) == 6L);
    assertTrue(sentenceCountCumSumList.get(1) == 9L);

    sc.stop();
}
 
Example #4
Source File: BQSRPipelineSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
protected void runTool(final JavaSparkContext ctx) {
    String referenceFileName = addReferenceFilesForSpark(ctx, referenceArguments.getReferencePath());
    List<String> localKnownSitesFilePaths = addVCFsForSpark(ctx, knownVariants);

    //Should this get the getUnfilteredReads? getReads will merge default and command line filters.
    //but the code below uses other filters for other parts of the pipeline that do not honor
    //the commandline.
    final JavaRDD<GATKRead> initialReads = getReads();

    // The initial reads have already had the WellformedReadFilter applied to them, which
    // is all the filtering that ApplyBQSR wants. BQSR itself wants additional filtering
    // performed, so we do that here.
    //NOTE: this filter doesn't honor enabled/disabled commandline filters
    final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), getHeaderForReads());
    final JavaRDD<GATKRead> filteredReadsForBQSR = initialReads.filter(read -> bqsrReadFilter.test(read));

    JavaPairRDD<GATKRead, Iterable<GATKVariant>> readsWithVariants = JoinReadsWithVariants.join(filteredReadsForBQSR, localKnownSitesFilePaths);
    //note: we use the reference dictionary from the reads themselves.
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(readsWithVariants, getHeaderForReads(), referenceFileName, bqsrArgs);

    final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(bqsrReport);
    final JavaRDD<GATKRead> finalReads = ApplyBQSRSparkFn.apply(initialReads, reportBroadcast, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs));

    writeReads(ctx, output, finalReads);
}
 
Example #5
Source File: AnnotatedVariantProducer.java    From gatk with BSD 3-Clause "New" or "Revised" License
@VisibleForTesting
static VariantContextBuilder annotateWithExternalCNVCalls(final String recordContig, final int pos, final int end,
                                                          final VariantContextBuilder inputBuilder,
                                                          final Broadcast<SAMSequenceDictionary> broadcastSequenceDictionary,
                                                          final Broadcast<SVIntervalTree<VariantContext>> broadcastCNVCalls,
                                                          final String sampleId) {
    if (broadcastCNVCalls == null)
        return inputBuilder;
    final SVInterval variantInterval = new SVInterval(broadcastSequenceDictionary.getValue().getSequenceIndex(recordContig), pos, end);
    final SVIntervalTree<VariantContext> cnvCallTree = broadcastCNVCalls.getValue();
    final String cnvCallAnnotation =
            Utils.stream(cnvCallTree.overlappers(variantInterval))
                    .map(overlapper -> formatExternalCNVCallAnnotation(overlapper.getValue(), sampleId))
                    .collect(Collectors.joining(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR));
    if (!cnvCallAnnotation.isEmpty()) {
        return inputBuilder.attribute(GATKSVVCFConstants.EXTERNAL_CNV_CALLS, cnvCallAnnotation);
    } else
        return inputBuilder;
}
 
Example #6
Source File: GeoWaveRDDLoader.java    From geowave with Apache License 2.0
public static GeoWaveIndexedRDD loadIndexedRDD(
    final SparkContext sc,
    final DataStorePluginOptions storeOptions,
    final RDDOptions rddOpts,
    final NumericIndexStrategy indexStrategy) throws IOException {
  final GeoWaveRDD wrappedRDD = GeoWaveRDDLoader.loadRDD(sc, storeOptions, rddOpts);
  // Index strategy can be expensive so we will broadcast it and store it
  Broadcast<NumericIndexStrategy> broadcastStrategy = null;
  if (indexStrategy != null) {
    broadcastStrategy =
        (Broadcast<NumericIndexStrategy>) RDDUtils.broadcastIndexStrategy(sc, indexStrategy);
  }

  final GeoWaveIndexedRDD returnRDD = new GeoWaveIndexedRDD(wrappedRDD, broadcastStrategy);
  return returnRDD;
}
 
Example #7
Source File: HaplotypeCallerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Iterator<AssemblyRegionWalkerContext>, VariantContext> assemblyFunction(final SAMFileHeader header,
                                                                                                       final String referenceFileName,
                                                                                                       final Broadcast<HaplotypeCallerArgumentCollection> hcArgsBroadcast,
                                                                                                       final Broadcast<AssemblyRegionArgumentCollection> assemblyRegionArgsBroadcast,
                                                                                                       final Broadcast<VariantAnnotatorEngine> annotatorEngineBroadcast) {
    return (FlatMapFunction<Iterator<AssemblyRegionWalkerContext>, VariantContext>) contexts -> {
        // HaplotypeCallerEngine isn't serializable but is expensive to instantiate, so construct and reuse one for every partition
        final ReferenceSequenceFile taskReferenceSequenceFile = taskReferenceSequenceFile(referenceFileName);
        final HaplotypeCallerEngine hcEngine = new HaplotypeCallerEngine(hcArgsBroadcast.value(), assemblyRegionArgsBroadcast.value(), false, false, header, taskReferenceSequenceFile, annotatorEngineBroadcast.getValue());
        Iterator<Iterator<VariantContext>> iterators = Utils.stream(contexts).map(context -> {
            AssemblyRegion region = context.getAssemblyRegion();
            FeatureContext featureContext = context.getFeatureContext();
            return hcEngine.callRegion(region, featureContext, context.getReferenceContext()).iterator();
        }).iterator();

        return Iterators.concat(iterators);
    };
}
 
Example #8
Source File: ALSUpdate.java    From oryx with Apache License 2.0
private static RDD<Tuple2<Object,double[]>> readAndConvertFeatureRDD(
    JavaPairRDD<String,float[]> javaRDD,
    Broadcast<? extends Map<String,Integer>> bIdToIndex) {

  RDD<Tuple2<Integer,double[]>> scalaRDD = javaRDD.mapToPair(t ->
      new Tuple2<>(bIdToIndex.value().get(t._1()), t._2())
  ).mapValues(f -> {
      double[] d = new double[f.length];
      for (int i = 0; i < d.length; i++) {
        d[i] = f[i];
      }
      return d;
    }
  ).rdd();

  // This mimics the persistence level established by ALS training methods
  scalaRDD.persist(StorageLevel.MEMORY_AND_DISK());

  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,double[]>> objKeyRDD = (RDD<Tuple2<Object,double[]>>) (RDD<?>) scalaRDD;
  return objKeyRDD;
}
 
Example #9
Source File: BroadCastParam.java    From sparkResearch with Apache License 2.0
/**
 * Broadcast variable test.
 * @param args
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // Initialize the JavaSparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // Define the list to be broadcast
    // (as noted earlier, broadcast variables are read-only)
    final List<String> broadcastList = Arrays.asList("190099HJLL","98392QUEYY","561788LLKK");
    // Create the broadcast variable and ship it to the executors
    final Broadcast<List<String>> broadcast = javaSparkContext.broadcast(broadcastList);
    // Define the data
    JavaPairRDD<String,String> pairRDD = javaSparkContext.parallelizePairs(Arrays.asList(new Tuple2<>("000", "000")));
    JavaPairRDD<String,String> resultPairRDD = pairRDD.filter((Function<Tuple2<String, String>, Boolean>) v1 -> broadcast.value().contains(v1._2));
    resultPairRDD.foreach((VoidFunction<Tuple2<String, String>>) System.out::println);
}
 
Example #10
Source File: TextPipelineTest.java    From deeplearning4j with Apache License 2.0
@Test
public void testWordFreqAccNotIdentifyingStopWords() throws Exception {

    JavaSparkContext sc = getContext();
    //  word2vec.setRemoveStop(false);
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vecNoStop.getTokenizerVarMap());

    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    JavaRDD<List<String>> tokenizedRDD = pipeline.tokenize();
    pipeline.updateAndReturnAccumulatorVal(tokenizedRDD);

    Counter<String> wordFreqCounter = pipeline.getWordFreqAcc().value();
    assertEquals(wordFreqCounter.getCount("is"), 1, 0);
    assertEquals(wordFreqCounter.getCount("this"), 1, 0);
    assertEquals(wordFreqCounter.getCount("are"), 1, 0);
    assertEquals(wordFreqCounter.getCount("a"), 1, 0);
    assertEquals(wordFreqCounter.getCount("strange"), 2, 0);
    assertEquals(wordFreqCounter.getCount("flowers"), 1, 0);
    assertEquals(wordFreqCounter.getCount("world"), 1, 0);
    assertEquals(wordFreqCounter.getCount("red"), 1, 0);

    sc.stop();
}
 
Example #11
Source File: RemoteParForSpark.java    From systemds with Apache License 2.0
@SuppressWarnings("unchecked")
private static Map<String, Broadcast<CacheBlock>> broadcastInputs(SparkExecutionContext sec, ArrayList<ParForStatementBlock.ResultVar> resultVars) {
	LocalVariableMap inputs = sec.getVariables();
	// exclude the result variables
	// TODO use optimizer-picked list of amenable objects (e.g., size constraints)
	Set<String> retVars = resultVars.stream()
		.map(v -> v._name).collect(Collectors.toSet());
	Set<String> brVars = inputs.keySet().stream()
		.filter(v -> !retVars.contains(v)).collect(Collectors.toSet());
	
	// construct broadcast objects
	Map<String, Broadcast<CacheBlock>> result = new HashMap<>();
	for (String key : brVars) {
		Data var = sec.getVariable(key);
		if ((var instanceof ScalarObject) || (var instanceof MatrixObject && ((MatrixObject) var).isPartitioned()))
			continue;
		result.put(key, sec.broadcastVariable((CacheableData<CacheBlock>) var));
	}
	return result;
}
 
Example #12
Source File: SparkWriteBuilder.java    From iceberg with Apache License 2.0
@Override
public StreamingWrite buildForStreaming() {
  // Validate
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsSchema);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema,
      checkNullability(spark, options), checkOrdering(spark, options));
  SparkUtil.validatePartitionTransforms(table.spec());

  // Change to streaming write if it is just append
  Preconditions.checkState(!overwriteDynamic,
      "Unsupported streaming operation: dynamic partition overwrite");
  Preconditions.checkState(!overwriteByFilter || overwriteExpr == Expressions.alwaysTrue(),
      "Unsupported streaming operation: overwrite by filter: %s", overwriteExpr);

  // Get application id
  String appId = spark.sparkContext().applicationId();

  // Get write-audit-publish id
  String wapId = spark.conf().get("spark.wap.id", null);

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return new SparkStreamingWrite(
      table, io, encryptionManager, options, overwriteByFilter, writeQueryId, appId, wapId, writeSchema, dsSchema);
}
 
Example #13
Source File: SvDiscoverFromLocalAssemblyContigAlignmentsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
protected void runTool(final JavaSparkContext ctx) {

    validateParams();

    final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast =
            StructuralVariationDiscoveryPipelineSpark.broadcastCNVCalls(ctx, getHeaderForReads(),
                    discoverStageArgs.cnvCallsFile);
    final String outputPrefixWithSampleName = getOutputPrefix();
    final SvDiscoveryInputMetaData svDiscoveryInputMetaData =
            new SvDiscoveryInputMetaData(ctx, discoverStageArgs, nonCanonicalChromosomeNamesFile, outputPrefixWithSampleName,
                    null, null, null,
                    cnvCallsBroadcast,
                    getHeaderForReads(), getReference(), getDefaultToolVCFHeaderLines(), localLogger);
    final JavaRDD<GATKRead> assemblyRawAlignments = getReads();

    final AssemblyContigsClassifiedByAlignmentSignatures contigsByPossibleRawTypes =
            preprocess(svDiscoveryInputMetaData, assemblyRawAlignments);

    final List<VariantContext> variants =
            dispatchJobs(ctx, contigsByPossibleRawTypes, svDiscoveryInputMetaData, assemblyRawAlignments, writeSAMFiles);
    contigsByPossibleRawTypes.unpersist();

    filterAndWriteMergedVCF(outputPrefixWithSampleName, variants, svDiscoveryInputMetaData);
}
 
Example #14
Source File: SparkBatchWrite.java    From iceberg with Apache License 2.0
SparkBatchWrite(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
                CaseInsensitiveStringMap options, boolean overwriteDynamic, boolean overwriteByFilter,
                Expression overwriteExpr, String applicationId, String wapId, Schema writeSchema,
                StructType dsSchema) {
  this.table = table;
  this.format = getFileFormat(table.properties(), options);
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.overwriteDynamic = overwriteDynamic;
  this.overwriteByFilter = overwriteByFilter;
  this.overwriteExpr = overwriteExpr;
  this.applicationId = applicationId;
  this.wapId = wapId;
  this.genieId = options.get("genie-id");
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;

  long tableTargetFileSize = PropertyUtil.propertyAsLong(
      table.properties(), WRITE_TARGET_FILE_SIZE_BYTES, WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  this.targetFileSize = options.getLong("target-file-size-bytes", tableTargetFileSize);
}
 
Example #15
Source File: SvDiscoveryInputMetaData.java    From gatk with BSD 3-Clause "New" or "Revised" License
public SvDiscoveryInputMetaData(final JavaSparkContext ctx,
                                final DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs,
                                final String nonCanonicalChromosomeNamesFile,
                                final String outputPath,
                                final ReadMetadata readMetadata,
                                final List<SVInterval> assembledIntervals,
                                final PairedStrandedIntervalTree<EvidenceTargetLink> evidenceTargetLinks,
                                final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast,
                                final SAMFileHeader headerForReads,
                                final ReferenceMultiSparkSource reference,
                                final Set<VCFHeaderLine> defaultToolVCFHeaderLines,
                                final Logger toolLogger) {

    final SAMSequenceDictionary sequenceDictionary = headerForReads.getSequenceDictionary();
    final Broadcast<Set<String>> canonicalChromosomesBroadcast =
            ctx.broadcast(SVUtils.getCanonicalChromosomes(nonCanonicalChromosomeNamesFile, sequenceDictionary));
    final String sampleId = SVUtils.getSampleId(headerForReads);

    this.referenceData = new ReferenceData(canonicalChromosomesBroadcast, ctx.broadcast(reference), ctx.broadcast(sequenceDictionary));
    this.sampleSpecificData = new SampleSpecificData(sampleId, cnvCallsBroadcast, assembledIntervals, evidenceTargetLinks, readMetadata, ctx.broadcast(headerForReads));
    this.discoverStageArgs = discoverStageArgs;
    this.outputPath = outputPath;
    this.defaultToolVCFHeaderLines = defaultToolVCFHeaderLines;
    this.toolLogger = toolLogger;
}
 
Example #16
Source File: RemoveOrphanFilesAction.java    From iceberg with Apache License 2.0
private Dataset<Row> buildActualFileDF() {
  List<String> subDirs = Lists.newArrayList();
  List<String> matchingFiles = Lists.newArrayList();

  Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp;

  // list at most 3 levels and only dirs that have less than 10 direct sub dirs on the driver
  listDirRecursively(location, predicate, hadoopConf.value(), 3, 10, subDirs, matchingFiles);

  JavaRDD<String> matchingFileRDD = sparkContext.parallelize(matchingFiles, 1);

  if (subDirs.isEmpty()) {
    return spark.createDataset(matchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
  }

  int parallelism = Math.min(subDirs.size(), partitionDiscoveryParallelism);
  JavaRDD<String> subDirRDD = sparkContext.parallelize(subDirs, parallelism);

  Broadcast<SerializableConfiguration> conf = sparkContext.broadcast(hadoopConf);
  JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirsRecursively(conf, olderThanTimestamp));

  JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD);
  return spark.createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()).toDF("file_path");
}
 
Example #17
Source File: VariantWalkerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Shard<VariantContext>, VariantWalkerContext> getVariantsFunction(
        final String referenceFileName,
        final Broadcast<FeatureManager> bFeatureManager) {
    return (FlatMapFunction<Shard<VariantContext>, VariantWalkerContext>) shard -> {
        ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();

        return StreamSupport.stream(shard.spliterator(), false)
                .filter(v -> v.getStart() >= shard.getStart() && v.getStart() <= shard.getEnd()) // only include variants that start in the shard
                .map(v -> {
                    final SimpleInterval variantInterval = new SimpleInterval(v);
                    return new VariantWalkerContext(v,
                            new ReadsContext(), // empty
                            new ReferenceContext(reference, variantInterval),
                            new FeatureContext(features, variantInterval));
                }).iterator();
    };
}
 
Example #18
Source File: SimpleNovelAdjacencyInterpreter.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Filters input assembly contigs that are not strong enough to support an event,
 * then delegates to {@link BreakpointsInference} to infer the reference locations
 * that bound the bi-path bubble in the graph caused by the event,
 * as well as the alternative path encoded in the contig sequence.
 */
private static JavaPairRDD<SimpleNovelAdjacencyAndChimericAlignmentEvidence, List<SvType>>
inferTypeFromSingleContigSimpleChimera(final JavaRDD<AssemblyContigWithFineTunedAlignments> assemblyContigs,
                                       final SvDiscoveryInputMetaData svDiscoveryInputMetaData) {

    final Broadcast<SAMSequenceDictionary> referenceSequenceDictionaryBroadcast = svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast();
    final Broadcast<ReferenceMultiSparkSource> referenceBroadcast = svDiscoveryInputMetaData.getReferenceData().getReferenceBroadcast();

    return
            assemblyContigs
                    .filter(tig -> SimpleChimera
                            .splitPairStrongEnoughEvidenceForCA(tig.getHeadAlignment(), tig.getTailAlignment(),
                                    MORE_RELAXED_ALIGNMENT_MIN_MQ, MORE_RELAXED_ALIGNMENT_MIN_LENGTH))

                    .mapToPair(tig -> getNovelAdjacencyAndEvidence(tig, referenceSequenceDictionaryBroadcast.getValue()))

                    .groupByKey()       // group the same novel adjacency produced by different contigs together

                    .mapToPair(noveltyAndEvidence -> inferType(noveltyAndEvidence, referenceSequenceDictionaryBroadcast, referenceBroadcast));
}
 
Example #19
Source File: SparkBatchPortablePipelineTranslator.java    From beam with Apache License 2.0
/**
 * Broadcast the side inputs of an executable stage. *This can be expensive.*
 *
 * @return Map from PCollection ID to Spark broadcast variable and coder to decode its contents.
 */
private static <SideInputT>
    ImmutableMap<String, Tuple2<Broadcast<List<byte[]>>, WindowedValueCoder<SideInputT>>>
        broadcastSideInputs(
            RunnerApi.ExecutableStagePayload stagePayload, SparkTranslationContext context) {
  Map<String, Tuple2<Broadcast<List<byte[]>>, WindowedValueCoder<SideInputT>>>
      broadcastVariables = new HashMap<>();
  for (SideInputId sideInputId : stagePayload.getSideInputsList()) {
    RunnerApi.Components stagePayloadComponents = stagePayload.getComponents();
    String collectionId =
        stagePayloadComponents
            .getTransformsOrThrow(sideInputId.getTransformId())
            .getInputsOrThrow(sideInputId.getLocalName());
    if (broadcastVariables.containsKey(collectionId)) {
      // This PCollection has already been broadcast.
      continue;
    }
    Tuple2<Broadcast<List<byte[]>>, WindowedValueCoder<SideInputT>> tuple2 =
        broadcastSideInput(collectionId, stagePayloadComponents, context);
    broadcastVariables.put(collectionId, tuple2);
  }
  return ImmutableMap.copyOf(broadcastVariables);
}
 
Example #20
Source File: FindBreakpointEvidenceSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Grab template names for all reads that contain kmers associated with a given breakpoint.
 */
@VisibleForTesting static List<QNameAndInterval> getAssemblyQNames(
        final FindBreakpointEvidenceSparkArgumentCollection params,
        final JavaSparkContext ctx,
        final HopscotchUniqueMultiMap<SVKmer, Integer, KmerAndInterval> kmerMultiMap,
        final JavaRDD<GATKRead> unfilteredReads,
        final SVReadFilter filter ) {
    final Broadcast<HopscotchUniqueMultiMap<SVKmer, Integer, KmerAndInterval>> broadcastKmersAndIntervals =
            ctx.broadcast(kmerMultiMap);

    final int kSize = params.kSize;
    final List<QNameAndInterval> qNamesAndIntervals =
        unfilteredReads
            .filter(filter::notJunk)
            .filter(filter::isPrimaryLine)
            .mapPartitions(readItr ->
                    new FlatMapGluer<>(new QNameIntervalFinder(kSize,broadcastKmersAndIntervals.getValue()), readItr))
            .collect();

    SparkUtils.destroyBroadcast(broadcastKmersAndIntervals, "cleaned kmers and intervals");

    return qNamesAndIntervals;
}
 
Example #21
Source File: SparkMaster.java    From GeoTriples with Apache License 2.0
/**
 * Convert the input Dataset into RDF triples and store the results.
 * The conversion takes place per partition using the mapPartitions Spark transformation.
 * @param mapping_list list of TripleMaps
 */
private void convert_partition(ArrayList<TriplesMap> mapping_list){
    SparkContext sc = SparkContext.getOrCreate();

    Pair<ArrayList<TriplesMap>, List<String>> transformation_info = new Pair<>(mapping_list, Arrays.asList(reader.getHeaders()));
    ClassTag<Pair<ArrayList<TriplesMap>, List<String>>> classTag_pair = scala.reflect.ClassTag$.MODULE$.apply(Pair.class);
    Broadcast<Pair<ArrayList<TriplesMap>, List<String>>> bd_info = sc.broadcast(transformation_info, classTag_pair);

    rowRDD
        .mapPartitions(
        (Iterator<Row> rows_iter) -> {
            ArrayList<TriplesMap> p_mapping_list = bd_info.value().getKey();
            List<String> p_header = bd_info.value().getValue();
            RML_Converter rml_converter = new RML_Converter(p_mapping_list, p_header);
            rml_converter.start();
            rml_converter.registerFunctions();
            Iterator<String> triples = rml_converter.convertPartition(rows_iter);

            rml_converter.stop();
            return triples;
        })
        .saveAsTextFile(outputDir);
}
 
Example #22
Source File: RewriteManifestsAction.java    From iceberg with Apache License 2.0
private List<ManifestFile> writeManifestsForPartitionedTable(
    Dataset<Row> manifestEntryDF, int numManifests,
    int targetNumManifestEntries) {

  Broadcast<FileIO> io = sparkContext.broadcast(fileIO);
  StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType();

  // we allow the actual size of manifests to be 10% higher if the estimation is not precise enough
  long maxNumManifestEntries = (long) (1.1 * targetNumManifestEntries);

  return withReusableDS(manifestEntryDF, df -> {
    Column partitionColumn = df.col("data_file.partition");
    return df.repartitionByRange(numManifests, partitionColumn)
        .sortWithinPartitions(partitionColumn)
        .mapPartitions(
            toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType),
            manifestEncoder
        )
        .collectAsList();
  });
}
 
Example #23
Source File: HDFSWriter.java    From ViraPipe with MIT License
public static JavaRDD<SAMRecord> setPartitionHeaders(final JavaRDD<SAMRecord> reads, final Broadcast<SAMFileHeader> header) {
    return reads.mapPartitions(records -> {
        //header.getValue().setTextHeader(header.getValue().getTextHeader()+"\\n@SQ\\tSN:"+records..getReferenceName());
        //record.setHeader(header);

        BAMHeaderOutputFormat.setHeader(header.getValue());
        return records;
    });
}
 
Example #24
Source File: JavaRecoverableNetworkWordCount.java    From SparkDemo with MIT License
public static Broadcast<List<String>> getInstance(JavaSparkContext jsc) {
  if (instance == null) {
    synchronized (JavaWordBlacklist.class) {
      if (instance == null) {
        List<String> wordBlacklist = Arrays.asList("a", "b", "c");
        instance = jsc.broadcast(wordBlacklist);
      }
    }
  }
  return instance;
}
 
Example #25
Source File: SimpleNovelAdjacencyInterpreter.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static void evaluateNarls(final SvDiscoveryInputMetaData svDiscoveryInputMetaData,
                                  final List<NovelAdjacencyAndAltHaplotype> narls) {
    final Broadcast<SAMSequenceDictionary> referenceSequenceDictionaryBroadcast =
            svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast();
    final List<SVInterval> assembledIntervals = svDiscoveryInputMetaData.getSampleSpecificData().getAssembledIntervals();
    final StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection
            discoverStageArgs = svDiscoveryInputMetaData.getDiscoverStageArgs();
    final Logger toolLogger = svDiscoveryInputMetaData.getToolLogger();
    SvDiscoveryUtils.evaluateIntervalsAndNarls(assembledIntervals, narls,
            referenceSequenceDictionaryBroadcast.getValue(), discoverStageArgs, toolLogger);
}
 
Example #26
Source File: DefaultSource.java    From flight-spark-source with Apache License 2.0
public DataSourceReader createReader(DataSourceOptions dataSourceOptions) {
  Location defaultLocation = Location.forGrpcInsecure(
    dataSourceOptions.get("host").orElse("localhost"),
    dataSourceOptions.getInt("port", 47470)
  );
  String sql = dataSourceOptions.get("path").orElse("");
  FlightDataSourceReader.FactoryOptions options = new FlightDataSourceReader.FactoryOptions(
    defaultLocation,
    sql,
    dataSourceOptions.get("username").orElse("anonymous"),
    dataSourceOptions.get("password").orElse(null),
    dataSourceOptions.getBoolean("parallel", false), null);
  Broadcast<FlightDataSourceReader.FactoryOptions> bOptions = lazySparkContext().broadcast(options);
  return new FlightDataSourceReader(bOptions);
}
 
Example #27
Source File: Reader.java    From iceberg with Apache License 2.0
private ReadTask(CombinedScanTask task, String tableSchemaString, String expectedSchemaString,
                 String nameMappingString, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
                 boolean caseSensitive, boolean localityPreferred, ReaderFactory<T> readerFactory) {
  this.task = task;
  this.tableSchemaString = tableSchemaString;
  this.expectedSchemaString = expectedSchemaString;
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.caseSensitive = caseSensitive;
  this.localityPreferred = localityPreferred;
  this.preferredLocations = getPreferredLocations();
  this.readerFactory = readerFactory;
  this.nameMappingString = nameMappingString;
}
 
Example #28
Source File: BroadcastObject.java    From systemds with Apache License 2.0
private boolean checkPartitionedBroadcastValid() {
	//check for evicted soft reference
	PartitionedBroadcast<T> pbm = _pbcRef.get();
	if (pbm == null)
		return false;

	//check for validity of individual broadcasts
	Broadcast<PartitionedBlock<T>>[] tmp = pbm.getBroadcasts();
	for (Broadcast<PartitionedBlock<T>> bc : tmp)
		if (!bc.isValid())
			return false;
	return true;
}
 
Example #29
Source File: RDDSortUtils.java    From systemds with Apache License 2.0
/**
 * This function collects and sorts the value column in memory and then broadcasts it.
 * 
 * @param val value as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param data data as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param asc if true, sort ascending
 * @param rlen number of rows
 * @param clen number of columns
 * @param blen block length
 * @param sec spark execution context
 * @param r_op reorg operator
 * @return data as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortDataByValMemSort( JavaPairRDD<MatrixIndexes, MatrixBlock> val, 
		JavaPairRDD<MatrixIndexes, MatrixBlock> data, boolean asc, long rlen, long clen, int blen, 
		SparkExecutionContext sec, ReorgOperator r_op) 
{
	//collect orderby column for in-memory sorting
	MatrixBlock inMatBlock = SparkExecutionContext
		.toMatrixBlock(val, (int)rlen, 1, blen, -1);

	//in-memory sort operation (w/ index return: source index in target position)
	ReorgOperator lrop = new ReorgOperator(new SortIndex(1, !asc, true));
	MatrixBlock sortedIx = inMatBlock.reorgOperations(lrop, new MatrixBlock(), -1, -1, -1);
	
	//flip sort indices from <source ix in target pos> to <target ix in source pos>
	MatrixBlock sortedIxSrc = new MatrixBlock(sortedIx.getNumRows(), 1, false); 
	for (int i=0; i < sortedIx.getNumRows(); i++) 
		sortedIxSrc.quickSetValue((int)sortedIx.quickGetValue(i,0)-1, 0, i+1);

	//broadcast index vector
	PartitionedBlock<MatrixBlock> pmb = new PartitionedBlock<>(sortedIxSrc, blen);
	Broadcast<PartitionedBlock<MatrixBlock>> _pmb = sec.getSparkContext().broadcast(pmb);

	//sort data with broadcast index vector
	JavaPairRDD<MatrixIndexes, RowMatrixBlock> ret = data
			.mapPartitionsToPair(new ShuffleMatrixBlockRowsInMemFunction(rlen, blen, _pmb));
	return RDDAggregateUtils.mergeRowsByKey(ret);
}
 
Example #30
Source File: RDDUtils.java    From geowave with Apache License 2.0
public static void writeRasterToGeoWave(
    final SparkContext sc,
    final Index index,
    final DataStorePluginOptions outputStoreOptions,
    final RasterDataAdapter adapter,
    final JavaRDD<GridCoverage> inputRDD) throws IOException {

  // setup the configuration and the output format
  final Configuration conf = new org.apache.hadoop.conf.Configuration(sc.hadoopConfiguration());

  GeoWaveOutputFormat.setStoreOptions(conf, outputStoreOptions);
  GeoWaveOutputFormat.addIndex(conf, index);
  GeoWaveOutputFormat.addDataAdapter(conf, adapter);

  // create the job
  final Job job = new Job(conf);
  job.setOutputKeyClass(GeoWaveOutputKey.class);
  job.setOutputValueClass(GridCoverage.class);
  job.setOutputFormatClass(GeoWaveOutputFormat.class);

  // broadcast string names
  final ClassTag<String> stringTag = scala.reflect.ClassTag$.MODULE$.apply(String.class);
  final Broadcast<String> typeName = sc.broadcast(adapter.getTypeName(), stringTag);
  final Broadcast<String> indexName = sc.broadcast(index.getName(), stringTag);

  // map to a pair containing the output key and the output value
  inputRDD.mapToPair(
      gridCoverage -> new Tuple2<>(
          new GeoWaveOutputKey(typeName.value(), indexName.value()),
          gridCoverage)).saveAsNewAPIHadoopDataset(job.getConfiguration());
}