org.apache.spark.SparkFiles Java Examples

The following examples show how to use org.apache.spark.SparkFiles. Each example is taken from an open-source project; the original project and source file are noted above each example.
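All of these examples follow the same pattern: the driver registers a file with SparkContext#addFile(), and tasks running on executors resolve their node-local copy of that file by name with SparkFiles.get(). A minimal, self-contained sketch of that pattern (the file path and name below are hypothetical):

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkFiles;
import org.apache.spark.api.java.JavaSparkContext;

public class SparkFilesBasics {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SparkFilesBasics").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            // Driver side: register a local file (or URL) for distribution to every executor.
            sc.addFile("/tmp/lookup.txt"); // hypothetical path

            int total = sc.parallelize(Arrays.asList(1, 2, 3, 4))
                    .map(x -> {
                        // Executor side: resolve the node-local copy by file name only.
                        String localPath = SparkFiles.get("lookup.txt");
                        return Files.readAllLines(Paths.get(localPath)).size() + x;
                    })
                    .reduce(Integer::sum);
            System.out.println("total = " + total);
        }
    }
}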
Example #1
Source File: PdbToUniProt.java    From mmtf-spark with Apache License 2.0
/**
 * Returns an up-to-date dataset of PDB to UniProt
 * chain-level mappings using SIFTS data.
 * 
 * <p> Example:
 * <pre>
 * +----------------+-----------+-------+---------+
 * |structureChainId|structureId|chainId|uniprotId|
 * +----------------+-----------+-------+---------+
 * |          1A02.F|       1A02|      F|   P01100|
 * |          1A02.J|       1A02|      J|   P05412|
 * |          1A02.N|       1A02|      N|   Q13469|
 * +----------------+-----------+-------+---------+
 * </pre>
 *
 * @return dataset of PDB to UniProt chain-level mappings
 * @throws IOException if an error occurs downloading or reading the mapping file
 */
public static Dataset<Row> getChainMappings() throws IOException {
    
    SparkSession spark = SparkSession.builder().getOrCreate();
    spark.sparkContext().addFile(UNIPROT_MAPPING_URL);

    // parse the downloaded CSV file
    Dataset<Row> dataset = spark.read()
            .option("header", "true")
            .option("comment", "#")
            .option("inferSchema", "true")
            .csv(SparkFiles.get(UNIPROT_FILE));

    // clean up and rename columns to be consistent with MMTF conventions.
    dataset = dataset.withColumn("PDB", upper(col("PDB")))
                     .withColumnRenamed("PDB","structureId")
                     .withColumnRenamed("CHAIN","chainId")
                     .withColumnRenamed("SP_PRIMARY","uniprotId")
                     .withColumn("structureChainId", concat_ws(".", col("structureId"), col("chainId")))
                     .select("structureChainId", "structureId", "chainId", "uniprotId");
    
    return dataset;
}
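A hypothetical usage sketch for the method above (it assumes mmtf-spark is on the classpath; the import package is an assumption):

import static org.apache.spark.sql.functions.col;

import java.io.IOException;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import edu.sdsc.mmtf.spark.datasets.PdbToUniProt; // assumed package for this class

public class ChainMappingExample {
    public static void main(String[] args) throws IOException {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("ChainMappingExample")
                .getOrCreate();

        Dataset<Row> mappings = PdbToUniProt.getChainMappings();

        // Look up the UniProt accessions for all chains of one PDB entry.
        mappings.filter(col("structureId").equalTo("1A02")).show();

        spark.stop();
    }
}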
 
Example #2
Source File: LocusWalkerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Return a function that maps a {@link Shard} of reads into a tuple of alignments and their corresponding reference and features.
 * @param referenceFileName the name of the reference file added via {@code SparkContext#addFile()}
 * @param bFeatureManager the feature manager broadcast
 * @param sequenceDictionary the sequence dictionary for the reads
 * @param header the reads header
 * @param downsamplingInfo the downsampling method for the reads
 * @param isEmitEmptyLoci whether to emit empty (zero-coverage) loci
 * @return a function that maps a {@link Shard} of reads into a tuple of alignments and their corresponding reference and features.
 */
private static FlatMapFunction<Shard<GATKRead>, LocusWalkerContext> getAlignmentsFunction(
        String referenceFileName, Broadcast<FeatureManager> bFeatureManager,
        SAMSequenceDictionary sequenceDictionary, SAMFileHeader header, LIBSDownsamplingInfo downsamplingInfo, boolean isEmitEmptyLoci) {
    return (FlatMapFunction<Shard<GATKRead>, LocusWalkerContext>) shardedRead -> {
        SimpleInterval interval = shardedRead.getInterval();
        Iterator<GATKRead> readIterator = shardedRead.iterator();
        ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        FeatureManager fm = bFeatureManager == null ? null : bFeatureManager.getValue();

        final AlignmentContextIteratorBuilder alignmentContextIteratorBuilder = new AlignmentContextIteratorBuilder();
        alignmentContextIteratorBuilder.setDownsamplingInfo(downsamplingInfo);
        alignmentContextIteratorBuilder.setEmitEmptyLoci(isEmitEmptyLoci);
        alignmentContextIteratorBuilder.setKeepUniqueReadListInLibs(false);
        alignmentContextIteratorBuilder.setIncludeNs(false);

        final Iterator<AlignmentContext> alignmentContextIterator = alignmentContextIteratorBuilder.build(
                readIterator, header, Collections.singletonList(interval), sequenceDictionary, true);

        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(alignmentContextIterator, 0), false).map(alignmentContext -> {
            final SimpleInterval alignmentInterval = new SimpleInterval(alignmentContext);
            return new LocusWalkerContext(alignmentContext, new ReferenceContext(reference, alignmentInterval), new FeatureContext(fm, alignmentInterval));
        }).iterator();
    };
}
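The key idiom in this example is resolving the SparkFiles-distributed reference lazily on the executor, inside the returned FlatMapFunction, and tolerating a null file name when no reference was supplied. Below is a minimal sketch of the same idiom using plain Java types; the record type and annotation file are hypothetical stand-ins, not GATK classes:

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.spark.SparkFiles;
import org.apache.spark.api.java.function.FlatMapFunction;

public final class AnnotateFunctions {

    /**
     * Returns a function that maps a "shard" (here just a List of records) to annotated records,
     * opening the SparkFiles-distributed annotation file once per shard.
     */
    public static FlatMapFunction<List<String>, String> getAnnotateFunction(final String annotationFileName) {
        return shard -> {
            // Resolve the executor-local copy only if a file was registered via addFile() on the driver.
            List<String> annotations = annotationFileName == null
                    ? Collections.<String>emptyList()
                    : Files.readAllLines(Paths.get(SparkFiles.get(annotationFileName)));
            String tag = annotations.isEmpty() ? "n/a" : annotations.get(0);
            return shard.stream().map(record -> record + "\t" + tag).collect(Collectors.toList()).iterator();
        };
    }
}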
 
Example #3
Source File: VariantWalkerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Shard<VariantContext>, VariantWalkerContext> getVariantsFunction(
        final String referenceFileName,
        final Broadcast<FeatureManager> bFeatureManager) {
    return (FlatMapFunction<Shard<VariantContext>, VariantWalkerContext>) shard -> {
        ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();

        return StreamSupport.stream(shard.spliterator(), false)
                .filter(v -> v.getStart() >= shard.getStart() && v.getStart() <= shard.getEnd()) // only include variants that start in the shard
                .map(v -> {
                    final SimpleInterval variantInterval = new SimpleInterval(v);
                    return new VariantWalkerContext(v,
                            new ReadsContext(), // empty
                            new ReferenceContext(reference, variantInterval),
                            new FeatureContext(features, variantInterval));
                }).iterator();
    };
}
 
Example #4
Source File: BaseRecalibratorSparkFn.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Run the {@link BaseRecalibrationEngine} on reads and overlapping variants.
 * @param readsWithVariants the RDD of reads with overlapping variants
 * @param header the reads header
 * @param referenceFileName the name of the reference file added via {@code SparkContext#addFile()}
 * @param recalArgs arguments to use during recalibration
 * @return the recalibration report object
 */
public static RecalibrationReport apply(final JavaPairRDD<GATKRead, Iterable<GATKVariant>> readsWithVariants, final SAMFileHeader header, final String referenceFileName, final RecalibrationArgumentCollection recalArgs) {
    JavaRDD<RecalibrationTables> unmergedTables = readsWithVariants.mapPartitions(readsWithVariantsIterator -> {
        String pathOnExecutor = SparkFiles.get(referenceFileName);
        ReferenceDataSource referenceDataSource = new ReferenceFileSource(IOUtils.getPath(pathOnExecutor));
        final BaseRecalibrationEngine bqsr = new BaseRecalibrationEngine(recalArgs, header);
        bqsr.logCovariatesUsed();
        Utils.stream(readsWithVariantsIterator).forEach(t -> bqsr.processRead(t._1, referenceDataSource, t._2));
        return Iterators.singletonIterator(bqsr.getRecalibrationTables());
    });

    final RecalibrationTables emptyRecalibrationTable = new RecalibrationTables(new StandardCovariateList(recalArgs, header));
    final RecalibrationTables combinedTables = unmergedTables.treeAggregate(emptyRecalibrationTable,
            RecalibrationTables::inPlaceCombine,
            RecalibrationTables::inPlaceCombine,
            Math.max(1, (int)(Math.log(unmergedTables.partitions().size()) / Math.log(2))));

    BaseRecalibrationEngine.finalizeRecalibrationTables(combinedTables);

    final QuantizationInfo quantizationInfo = new QuantizationInfo(combinedTables, recalArgs.QUANTIZING_LEVELS);

    final StandardCovariateList covariates = new StandardCovariateList(recalArgs, header);
    return RecalUtils.createRecalibrationReport(recalArgs.generateReportTable(covariates.covariateNames()), quantizationInfo.generateReportTable(), RecalUtils.generateReportTables(combinedTables, covariates));
}
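Two details of this method are worth isolating: the reference file is resolved with SparkFiles.get() once per partition inside mapPartitions(), and the per-partition results are merged with treeAggregate() using a depth derived from the partition count. A minimal sketch of both ideas with plain Spark types (the accumulator and file name are hypothetical):

import java.util.Collections;

import org.apache.spark.SparkFiles;
import org.apache.spark.api.java.JavaRDD;

public final class TreeAggregateSketch {

    /** Counts records per partition and merges the counts in a tree, mirroring the structure above. */
    public static long countRecords(final JavaRDD<String> records, final String sitesFileName) {
        JavaRDD<long[]> perPartition = records.mapPartitions(it -> {
            // Resolve the distributed file once per partition, as the recalibration code does.
            String pathOnExecutor = SparkFiles.get(sitesFileName);
            // (A real implementation would open and use pathOnExecutor here.)
            long n = 0;
            while (it.hasNext()) {
                it.next();
                n++;
            }
            return Collections.singletonList(new long[]{n}).iterator();
        });

        // Merge partition results in a tree of depth log2(#partitions), as BaseRecalibratorSparkFn does.
        int depth = Math.max(1, (int) (Math.log(perPartition.partitions().size()) / Math.log(2)));
        long[] total = perPartition.treeAggregate(
                new long[]{0L},
                (acc, next) -> new long[]{acc[0] + next[0]},
                (a, b) -> new long[]{a[0] + b[0]},
                depth);
        return total[0];
    }
}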
 
Example #5
Source File: FindAssemblyRegionsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Iterator<Shard<GATKRead>>, AssemblyRegionWalkerContext> getAssemblyRegionsFunctionFast(
        final String referenceFileName,
        final Broadcast<FeatureManager> bFeatureManager,
        final SAMFileHeader header,
        final Broadcast<Supplier<AssemblyRegionEvaluator>> supplierBroadcast,
        final AssemblyRegionArgumentCollection assemblyRegionArgs) {
    return (FlatMapFunction<Iterator<Shard<GATKRead>>, AssemblyRegionWalkerContext>) shardedReadIterator -> {
        final ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        final FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
        final AssemblyRegionEvaluator assemblyRegionEvaluator = supplierBroadcast.getValue().get(); // one AssemblyRegionEvaluator instance per Spark partition
        final ReadsDownsampler readsDownsampler = assemblyRegionArgs.maxReadsPerAlignmentStart > 0 ?
                new PositionalDownsampler(assemblyRegionArgs.maxReadsPerAlignmentStart, header) : null;

        Iterator<Iterator<AssemblyRegionWalkerContext>> iterators = Utils.stream(shardedReadIterator)
                .map(shardedRead -> new ShardToMultiIntervalShardAdapter<>(
                        new DownsampleableSparkReadShard(
                                new ShardBoundary(shardedRead.getInterval(), shardedRead.getPaddedInterval()), shardedRead, readsDownsampler)))
                .map(downsampledShardedRead -> {
                    final Iterator<AssemblyRegion> assemblyRegionIter = new AssemblyRegionIterator(
                            new ShardToMultiIntervalShardAdapter<>(downsampledShardedRead),
                            header, reference, features, assemblyRegionEvaluator, assemblyRegionArgs);
                    return Utils.stream(assemblyRegionIter).map(assemblyRegion ->
                            new AssemblyRegionWalkerContext(assemblyRegion,
                                    new ReferenceContext(reference, assemblyRegion.getPaddedSpan()),
                                    new FeatureContext(features, assemblyRegion.getPaddedSpan()))).iterator();
                }).iterator();
        return Iterators.concat(iterators);
    };
}
 
Example #6
Source File: FindAssemblyRegionsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Iterator<Shard<GATKRead>>, ActivityProfileStateRange> getActivityProfileStatesFunction(
        final String referenceFileName,
        final Broadcast<FeatureManager> bFeatureManager,
        final SAMFileHeader header,
        final Broadcast<Supplier<AssemblyRegionEvaluator>> supplierBroadcast,
        final AssemblyRegionArgumentCollection assemblyRegionArgs) {
    return (FlatMapFunction<Iterator<Shard<GATKRead>>, ActivityProfileStateRange>) shardedReadIterator -> {
        final ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        final FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
        final AssemblyRegionEvaluator assemblyRegionEvaluator = supplierBroadcast.getValue().get(); // one AssemblyRegionEvaluator instance per Spark partition
        
        return Utils.stream(shardedReadIterator)
                .map(shardedRead -> {
                    final ReadsDownsampler readsDownsampler = assemblyRegionArgs.maxReadsPerAlignmentStart > 0 ?
                            new PositionalDownsampler(assemblyRegionArgs.maxReadsPerAlignmentStart, header) : null;
                    return new ShardToMultiIntervalShardAdapter<>(
                            new DownsampleableSparkReadShard(
                                    new ShardBoundary(shardedRead.getInterval(), shardedRead.getPaddedInterval()), shardedRead, readsDownsampler));
                })
                .map(shardedRead -> {
                    final Iterator<ActivityProfileState> activityProfileStateIter = new ActivityProfileStateIterator(
                            new ShardToMultiIntervalShardAdapter<>(shardedRead),
                            header, reference, features, assemblyRegionEvaluator
                    );
                    return new ActivityProfileStateRange(shardedRead, activityProfileStateIter);
                }).iterator();
    };
}
 
Example #7
Source File: FindAssemblyRegionsSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Iterator<AssemblyRegion>, AssemblyRegionWalkerContext> getAssemblyRegionWalkerContextFunction(
        final String referenceFileName,
        final Broadcast<FeatureManager> bFeatureManager) {

    return (FlatMapFunction<Iterator<AssemblyRegion>, AssemblyRegionWalkerContext>) assemblyRegionIter -> {
        final ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        final FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
        return Utils.stream(assemblyRegionIter).map(assemblyRegion ->
                new AssemblyRegionWalkerContext(assemblyRegion,
                        new ReferenceContext(reference, assemblyRegion.getPaddedSpan()),
                        new FeatureContext(features, assemblyRegion.getPaddedSpan()))).iterator();
    };
}
 
Example #8
Source File: ReadWalkerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static FlatMapFunction<Iterator<GATKRead>, ReadWalkerContext> getReadsFunction(
        String referenceFileName, Broadcast<FeatureManager> bFeatureManager) {
    return readIterator -> {
        ReferenceDataSource reference = referenceFileName == null ? null : new ReferenceFileSource(IOUtils.getPath(SparkFiles.get(referenceFileName)));
        FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
        return Iterators.transform(readIterator, new Function<GATKRead, ReadWalkerContext>() {
            @Nullable
            @Override
            public ReadWalkerContext apply(@Nullable GATKRead r) {
                final SimpleInterval readInterval = getReadInterval(r);
                return new ReadWalkerContext(r, new ReferenceContext(reference, readInterval), new FeatureContext(features, readInterval));
            }
        });
    };
}
 
Example #9
Source File: JoinReadsWithVariants.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Joins each read of an {@code RDD<GATKRead>} with overlapping variants from a list of variants files.
 *
 * @param reads the RDD of reads, in coordinate-sorted order
 * @param variantsFileNames the names of the variants files added via {@code SparkContext#addFile()}
 * @return an RDD that contains each read along with the overlapping variants
 */
public static JavaPairRDD<GATKRead, Iterable<GATKVariant>> join(final JavaRDD<GATKRead> reads, final List<String> variantsFileNames) {
    return reads.mapPartitionsToPair((PairFlatMapFunction<Iterator<GATKRead>, GATKRead, Iterable<GATKVariant>>) gatkReadIterator -> {
        List<FeatureDataSource<VariantContext>> variantSources = variantsFileNames.stream().map(fileName -> openFeatureSource(SparkFiles.get(fileName))).collect(Collectors.toList());
        Iterator<Tuple2<GATKRead, Iterable<GATKVariant>>> iterator = Iterators.transform(gatkReadIterator, read -> getVariantsOverlappingRead(read, variantSources));
        return new CloseAtEndIterator<>(iterator, new AutoCloseableCollection(variantSources)); // close FeatureDataSource at end of iteration
    });
}
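The noteworthy detail here is the CloseAtEndIterator, which ensures the per-partition FeatureDataSource handles are closed once Spark has drained the returned iterator. A minimal stand-in sketch of that idea (not GATK's actual CloseAtEndIterator):

import java.util.Iterator;

/**
 * Sketch of an iterator that closes a resource once the underlying iterator is exhausted,
 * so per-partition handles opened on an executor can be released after Spark has consumed
 * the partition's output.
 */
public final class CloseOnExhaustionIterator<T> implements Iterator<T> {
    private final Iterator<T> delegate;
    private final AutoCloseable resource;
    private boolean closed = false;

    public CloseOnExhaustionIterator(Iterator<T> delegate, AutoCloseable resource) {
        this.delegate = delegate;
        this.resource = resource;
    }

    @Override
    public boolean hasNext() {
        boolean hasNext = delegate.hasNext();
        if (!hasNext && !closed) {
            closed = true;
            try {
                resource.close();
            } catch (Exception e) {
                throw new RuntimeException("Failed to close resource at end of iteration", e);
            }
        }
        return hasNext;
    }

    @Override
    public T next() {
        return delegate.next();
    }
}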
 
Example #10
Source File: BwaSparkEngine.java    From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Performs read alignment on an RDD.
 * @param unalignedReads the reads to align.
 * @param pairedAlignment whether to perform paired-end alignment ({@code true}) or single-end alignment ({@code false}).
 * @return never {@code null}.
 */
public JavaRDD<GATKRead> align(final JavaRDD<GATKRead> unalignedReads, final boolean pairedAlignment) {
    final Broadcast<SAMFileHeader> broadcastHeader = this.broadcastHeader;
    final String indexFileName = this.indexFileName;
    final boolean resolveIndexFileName = this.resolveIndexFileName;
    return unalignedReads.mapPartitions(itr ->
            new ReadAligner(resolveIndexFileName ? SparkFiles.get(indexFileName) : indexFileName, broadcastHeader.value(), pairedAlignment).apply(itr));
}
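A hypothetical usage sketch for align(); the import locations of BwaSparkEngine and GATKRead are assumptions, and the engine is presumed to be already constructed:

import org.apache.spark.api.java.JavaRDD;

import org.broadinstitute.hellbender.tools.spark.bwa.BwaSparkEngine; // assumed package
import org.broadinstitute.hellbender.utils.read.GATKRead;            // assumed package

public final class BwaAlignExample {
    /** Aligns the given reads twice, once as read pairs and once as single-end reads. */
    public static JavaRDD<GATKRead> alignBothWays(final BwaSparkEngine engine, final JavaRDD<GATKRead> unalignedReads) {
        JavaRDD<GATKRead> paired = engine.align(unalignedReads, true);      // paired-end alignment
        JavaRDD<GATKRead> singleEnd = engine.align(unalignedReads, false);  // single-end alignment
        return paired.union(singleEnd);
    }
}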
 
Example #11
Source File: HaplotypeCallerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
@Override
protected Broadcast<Supplier<AssemblyRegionEvaluator>> assemblyRegionEvaluatorSupplierBroadcast(final JavaSparkContext ctx) {
    final Path referencePath = IOUtils.getPath(referenceArguments.getReferenceFileName());
    final String referenceFileName = referencePath.getFileName().toString();
    final String pathOnExecutor = SparkFiles.get(referenceFileName);
    final ReferenceSequenceFile taskReferenceSequenceFile = new CachingIndexedFastaSequenceFile(IOUtils.getPath(pathOnExecutor));
    final Collection<Annotation> annotations = makeVariantAnnotations();
    final VariantAnnotatorEngine annotatorEngine = new VariantAnnotatorEngine(annotations,  hcArgs.dbsnp.dbsnp, hcArgs.comps, hcArgs.emitReferenceConfidence != ReferenceConfidenceMode.NONE, false);
    return assemblyRegionEvaluatorSupplierBroadcastFunction(ctx, hcArgs, assemblyRegionArgs, getHeaderForReads(), taskReferenceSequenceFile, annotatorEngine);
}
 
Example #12
Source File: Methods.java    From RP-DBSCAN with Apache License 2.0
public void findCoreWithSpecificMeta(int readOrder, List<ApproximatedCell> grids) throws IOException, ClassNotFoundException
{
	float[] coords = new float[dim];
	List<Integer> neighborIdList = new ArrayList<Integer>();
	
	List<Integer> key = null;
	List<ApproximatedPoint> innerPts = null;
	int comp = (int)(Math.ceil(Math.sqrt(dim)));
		
	BufferedInputStream bi = new BufferedInputStream(new FileInputStream(new File(SparkFiles.get(metaPaths.get(readOrder)))));	
	GZIPInputStream gis = new GZIPInputStream(bi);
	ObjectInputStream ois = new ObjectInputStream(gis);
	Dictionary meta = (Dictionary)ois.readObject();
	ois.close();
	gis.close();
	bi.close();
		
	meta.buildNeighborSearchTree();
		
	for(ApproximatedCell grid : grids)
	{
		key = grid.cellCoords;
		innerPts = grid.pts;
			
		if( grid.ifFullCore || !meta.isContainCell(key))
			continue;

		// find the neighbor cells of this grid cell in the loaded meta data
		for(int j=0; j<dim; j++)
			coords[j] = (float)key.get(j);
					
		neighborIdList.clear();
		meta.neighborTree.getNeighborId(meta.neighborTree.root, coords, neighborIdList, comp);
			
		int state = 0;
		int cnt = 0;
			
		for(int j=0; j<neighborIdList.size(); j++)
		{						
			List<Integer> neighborCoords = meta.getIntGirdCoordsIndex(neighborIdList.get(j));
			Kdtree kdtree = meta.lvp_neighborTrees.get(neighborIdList.get(j));
			NeighborCell neighbor = new NeighborCell(neighborCoords, kdtree);

			for(ApproximatedPoint pt : innerPts)
			{
				if(pt.isCore)	continue;
				cnt = pt.neighborPts;

				//state check
				state = pt.stateWithSphere(neighbor.cellId, dim, sqr_r, meta.level_1_SideLen);
				if(state == 1)
					cnt += neighbor.lv_p_kdtree.count;
				else if(state == 0)
				{
					List<Kdnode> lv_p_neighbor = new ArrayList<Kdnode>();
					neighbor.lv_p_kdtree.getNeighborNode(neighbor.lv_p_kdtree.root, pt.coords, lv_p_neighbor, eps);
					for(Kdnode node: lv_p_neighbor)
					{
						if(Norm.sqr_L2_norm(pt.coords, node.coords) <= sqr_r)
							cnt += node.count;
						if(cnt >= minPts)
							break;
					}
				}
					
				pt.neighborPts = cnt;	
					
				if(cnt >= minPts)
					pt.isCore = true;
			}
		}
	}
	meta = null;
}
 
Example #13
Source File: Methods.java    From RP-DBSCAN with Apache License 2.0
@Override
public Iterator<Tuple2<Integer, Edge>> call(Iterator<Tuple2<Long,ApproximatedCell>> args)
		throws Exception {
	// TODO Auto-generated method stub

	HashSet<Edge> edges = new HashSet<Edge>();
	List<ApproximatedCell> grids = new ArrayList<ApproximatedCell>();
	
	while(args.hasNext())
		grids.add(args.next()._2);
	
	//load meta directory info
	int metaSize = metaPaths.size();

	//set read order randomly
	HashSet<Integer> readOrders = new HashSet<Integer>();
	while(readOrders.size() < metaSize)
		readOrders.add((int)(Math.random()*metaSize));
	
	int ids = 1;
	for(Integer readOrder : readOrders)
	{
		System.out.println("DDR Meta ID : "+ (ids++));
		findDDRWithSpecificMeta(readOrder, edges, grids);
	}
	
	//------------------Edge Reduction 1 iteration
	HashSet<Long> mergedCoreCells = new HashSet<Long>();
	metaSize = corePaths.size();
	for(int i=0; i<metaSize; i++)
	{
		BufferedInputStream bi = new BufferedInputStream(new FileInputStream(new File(SparkFiles.get(corePaths.get(i)))));
		GZIPInputStream gis = new GZIPInputStream(bi);
		ObjectInputStream ois = new ObjectInputStream(gis);
		HashSet<Long> temp = (HashSet<Long>)ois.readObject();
		mergedCoreCells.addAll(temp);
		ois.close();
		gis.close();
		bi.close();
	}
	
	MinimumSpanningTree tree = new MinimumSpanningTree();
	return tree.reduceEdgesByMST(mergedCoreCells, edges, numOfPartition/2).iterator();
}
 
Example #14
Source File: Methods.java    From RP-DBSCAN with Apache License 2.0
public void findDDRWithSpecificMeta(int readOrder, HashSet<Edge> edges, List<ApproximatedCell> grids) throws IOException, ClassNotFoundException
{
		BufferedInputStream bi = new BufferedInputStream(new FileInputStream(new File(SparkFiles.get(metaPaths.get(readOrder)))));	
		GZIPInputStream gis = new GZIPInputStream(bi);
		ObjectInputStream ois = new ObjectInputStream(gis);
		Dictionary meta = (Dictionary)ois.readObject();
		ois.close();
		gis.close();
		bi.close();
		meta.buildNeighborSearchTree();
		
		float[] coords = new float[dim];
		List<Integer> neighborIdList = new ArrayList<Integer>();

		List<Integer> targetId = null;
		Iterable<ApproximatedPoint> corePts = null;

		int comp = (int)(Math.ceil(Math.sqrt(dim)));
		
		for(ApproximatedCell grid : grids)
		{
			targetId = grid.cellCoords;
			corePts = grid.pts;
			
			if(!meta.isContainCell(targetId))
				continue;
			
			//tree for fast search
			coreTree.clear();
			int index = 0;
			for(ApproximatedPoint corePt : corePts)
				coreTree.insert(index, corePt.coords, 1);
			
			// find the neighbor cells of this grid cell in the loaded meta data
			for(int j=0; j<dim; j++)
				coords[j] = (float)targetId.get(j);
			
			neighborIdList.clear();
			meta.neighborTree.getNeighborId(meta.neighborTree.root, coords, neighborIdList, comp);

			for(int j=0; j<neighborIdList.size(); j++)
			{
				long neighborEncodedId = meta.getLv1CellEncodedId(neighborIdList.get(j));
				Edge edge = new Edge(grid.cellId, neighborEncodedId, ((float)Math.random()));							
				
				if(edges.contains(edge))
					continue;
				
				List<Integer> neighborId = meta.getIntGirdCoordsIndex(neighborIdList.get(j));
				Kdtree kdtree = meta.lvp_neighborTrees.get(neighborIdList.get(j));
				NeighborCell neighbor = new NeighborCell(neighborId, kdtree);
							
				List<Integer> edgeKey = new ArrayList<Integer>();
				edgeKey.addAll(targetId); edgeKey.addAll(neighborId);
										
				List<Integer> inverseKey = new ArrayList<Integer>();
				inverseKey.addAll(neighborId); inverseKey.addAll(targetId);
										
				Kdnode node = coreTree.cloestNode(neighbor.lv_p_kdtree.root.coords, sqr_r);
				Kdnode closest = null;
				if(node != null)
					closest = neighbor.lv_p_kdtree.cloestNode(node.coords,sqr_r);
				
				if(node == null || closest == null || Norm.sqr_L2_norm(node.coords, closest.coords) <= sqr_r)
						edges.add(edge);
				
			}
		}
		meta = null;
	}
 
Example #15
Source File: HaplotypeCallerSpark.java    From gatk with BSD 3-Clause "New" or "Revised" License
private static ReferenceSequenceFile taskReferenceSequenceFile(final String referenceFileName) {
    final String pathOnExecutor = SparkFiles.get(referenceFileName);
    return new CachingIndexedFastaSequenceFile(IOUtils.getPath(pathOnExecutor));
}
 
Example #16
Source File: PdbToUniProt.java    From mmtf-spark with Apache License 2.0
/**
 * Returns the current version of the cached dataset of PDB to UniProt
 * residue mappings. This method is significantly faster than building an
 * up-to-date dataset, but the cached data may not contain mappings for
 * recently released PDB entries.
 *
 * @return dataset of PDB to UniProt residue mappings
 */
public static Dataset<Row> getCachedResidueMappings() {       
    SparkSession spark = SparkSession.builder().getOrCreate();
    spark.conf().set("spark.sql.orc.impl", "native");
    spark.sparkContext().addFile(CACHED_FILE_URL);

    return spark.read().format("orc").load(SparkFiles.get(FILENAME));
}
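A hypothetical usage sketch for the cached mappings (again assuming mmtf-spark is on the classpath; the import package is an assumption):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import edu.sdsc.mmtf.spark.datasets.PdbToUniProt; // assumed package for this class

public class CachedResidueMappingExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("CachedResidueMappingExample")
                .getOrCreate();

        // Load the pre-built residue-level mappings from the cached ORC file.
        Dataset<Row> residueMappings = PdbToUniProt.getCachedResidueMappings();
        residueMappings.printSchema();
        System.out.println("residue mappings: " + residueMappings.count());

        spark.stop();
    }
}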